diff --git a/CHANGELOG.md b/CHANGELOG.md index 416706f..8f37d7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ This is the changelog for the open source version of tiktoken. +## [v0.5.1] +- Add `encoding_name_for_model`, undo some renames to variables that are implementation details + +## [v0.5.0] +- Add `tiktoken._educational` submodule to better document how byte pair encoding works +- Ensure `encoding_for_model` knows about several new models +- Add `decode_with_offsets` +- Better error for failures with the plugin mechanism +- Make more tests public +- Update versions of dependencies + ## [v0.4.0] - Add `decode_batch` and `decode_bytes_batch` - Improve error messages and handling diff --git a/pyproject.toml b/pyproject.toml index 9b7b919..b144ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tiktoken" -version = "0.4.0" +version = "0.5.1" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} @@ -44,3 +44,6 @@ test-skip = "*-macosx_arm64" before-test = "pip install pytest hypothesis" test-command = "pytest {project}/tests --import-mode=append" +[[tool.cibuildwheel.overrides]] +select = "*linux_aarch64" +test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'""" diff --git a/python/Cargo.toml b/python/Cargo.toml index ebb81ad..19ea251 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.2.0" +version = "0.5.1" edition = "2021" rust-version = "1.57.0" diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py index 9ad09a3..3a531b1 100644 --- a/tiktoken/__init__.py +++ b/tiktoken/__init__.py @@ -1,4 +1,6 @@ +# This is the public API of tiktoken from .core import Encoding as Encoding from .model import encoding_for_model as encoding_for_model +from .model import encoding_name_for_model as 
encoding_name_for_model from .registry import get_encoding as get_encoding from .registry import list_encoding_names as list_encoding_names diff --git a/tiktoken/_educational.py b/tiktoken/_educational.py index 0aba8f3..a6f5fcc 100644 --- a/tiktoken/_educational.py +++ b/tiktoken/_educational.py @@ -1,8 +1,5 @@ """This is an educational implementation of the byte pair encoding algorithm.""" -from __future__ import annotations - import collections -import itertools from typing import Optional import regex @@ -187,11 +184,23 @@ def bpe_train( def visualise_tokens(token_values: list[bytes]) -> None: - backgrounds = itertools.cycle( - [f"\u001b[48;5;{i}m".encode() for i in [167, 179, 185, 77, 80, 68, 134]] - ) - interleaved = itertools.chain.from_iterable(zip(backgrounds, token_values)) - print((b"".join(interleaved) + "\u001b[0m".encode()).decode("utf-8")) + background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]] + # If token boundaries do not occur at unicode character boundaries, it's unclear how best to + # visualise the token. Here, we'll just use the unicode replacement character to represent some + # fraction of a character. + unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values] + + running_length = 0 + last_color = None + for token in unicode_token_values: + color = background[running_length % len(background)] + if color == last_color: + color = background[(running_length + 1) % len(background)] + assert color != last_color + last_color = color + running_length += len(token) + print(color + token, end="") + print("\u001b[0m") def train_simple_encoding(): diff --git a/tiktoken/model.py b/tiktoken/model.py index 011a466..2cf4224 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -15,14 +15,22 @@ MODEL_PREFIX_TO_ENCODING: dict[str, str] = { # chat "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. 
- "gpt-35-turbo": "cl100k_base", # Azure deployment name + "gpt-35-turbo-": "cl100k_base", # Azure deployment name + # fine-tuned + "ft:gpt-4": "cl100k_base", + "ft:gpt-3.5-turbo": "cl100k_base", + "ft:davinci-002": "cl100k_base", + "ft:babbage-002": "cl100k_base", } MODEL_TO_ENCODING: dict[str, str] = json.loads(pkg_resources.read_text("tiktoken", "model_to_encoding.json")) -def encoding_for_model(model_name: str) -> Encoding: - """Returns the encoding used by a model.""" +def encoding_name_for_model(model_name: str) -> str: + """Returns the name of the encoding used by a model. + + Raises a KeyError if the model name is not recognised. + """ encoding_name = None if model_name in MODEL_TO_ENCODING: encoding_name = MODEL_TO_ENCODING[model_name] @@ -32,7 +40,7 @@ def encoding_for_model(model_name: str) -> Encoding: # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE) for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items(): if model_name.startswith(model_prefix): - return get_encoding(model_encoding_name) + return model_encoding_name if encoding_name is None: raise KeyError( @@ -40,4 +48,12 @@ def encoding_for_model(model_name: str) -> Encoding: "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect." ) from None - return get_encoding(encoding_name) + return encoding_name + + +def encoding_for_model(model_name: str) -> Encoding: + """Returns the encoding used by a model. + + Raises a KeyError if the model name is not recognised. 
+ """ + return get_encoding(encoding_name_for_model(model_name)) diff --git a/tiktoken/model_to_encoding.json b/tiktoken/model_to_encoding.json index a7a244b..f40f5d9 100644 --- a/tiktoken/model_to_encoding.json +++ b/tiktoken/model_to_encoding.json @@ -1,4 +1,6 @@ { + "davinci-002": "cl100k_base", + "babbage-002": "cl100k_base", "text-davinci-003": "p50k_base", "text-davinci-002": "p50k_base", "text-davinci-001": "r50k_base", @@ -30,6 +32,7 @@ "code-search-ada-code-001": "r50k_base", "gpt2": "gpt2", "gpt-3.5-turbo": "cl100k_base", + "gpt-35-turbo": "cl100k_base", "gpt-3.5-turbo-0301": "cl100k_base", "gpt-3.5-turbo-0613": "cl100k_base", "gpt-3.5-turbo-16k": "cl100k_base", diff --git a/tiktoken/registry.py b/tiktoken/registry.py index 0a55d27..f7e5385 100644 --- a/tiktoken/registry.py +++ b/tiktoken/registry.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import importlib import pkgutil import threading