From a7937793e7460ae8e815f506600b7248fad2a803 Mon Sep 17 00:00:00 2001 From: Shantanu Date: Mon, 11 Sep 2023 14:50:26 -0700 Subject: [PATCH 1/4] Sync codebase --- CHANGELOG.md | 8 ++++++++ Cargo.toml | 2 +- pyproject.toml | 2 +- tiktoken/_educational.py | 25 +++++++++++++++++-------- tiktoken/model.py | 33 +++++++++++++++++++++------------ tiktoken/registry.py | 30 ++++++++++++++++++++---------- 6 files changed, 68 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 416706f..2618724 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ This is the changelog for the open source version of tiktoken. +## [v0.5.0] +- Add `tiktoken._educational` submodule to better document how byte pair encoding works +- Ensure `encoding_for_model` knows about several new models +- Add `decode_with_offsets` +- Better error for failures with the plugin mechanism +- Make more tests public +- Update versions of dependencies + ## [v0.4.0] - Add `decode_batch` and `decode_bytes_batch` - Improve error messages and handling diff --git a/Cargo.toml b/Cargo.toml index 948b9f1..d789da9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.4.0" +version = "0.5.0" edition = "2021" rust-version = "1.57.0" diff --git a/pyproject.toml b/pyproject.toml index d8d600d..8d1187e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tiktoken" -version = "0.4.0" +version = "0.5.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} diff --git a/tiktoken/_educational.py b/tiktoken/_educational.py index 0aba8f3..a6f5fcc 100644 --- a/tiktoken/_educational.py +++ b/tiktoken/_educational.py @@ -1,8 +1,5 @@ """This is an educational implementation of the byte pair encoding algorithm.""" -from __future__ import annotations - import collections -import itertools from typing import Optional import regex @@ -187,11 +184,23 @@ def 
bpe_train( def visualise_tokens(token_values: list[bytes]) -> None: - backgrounds = itertools.cycle( - [f"\u001b[48;5;{i}m".encode() for i in [167, 179, 185, 77, 80, 68, 134]] - ) - interleaved = itertools.chain.from_iterable(zip(backgrounds, token_values)) - print((b"".join(interleaved) + "\u001b[0m".encode()).decode("utf-8")) + background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]] + # If token boundaries do not occur at unicode character boundaries, it's unclear how best to + # visualise the token. Here, we'll just use the unicode replacement character to represent some + # fraction of a character. + unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values] + + running_length = 0 + last_color = None + for token in unicode_token_values: + color = background[running_length % len(background)] + if color == last_color: + color = background[(running_length + 1) % len(background)] + assert color != last_color + last_color = color + running_length += len(token) + print(color + token, end="") + print("\u001b[0m") def train_simple_encoding(): diff --git a/tiktoken/model.py b/tiktoken/model.py index 26201ce..a67830c 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -4,19 +4,30 @@ from .core import Encoding from .registry import get_encoding # TODO: these will likely be replaced by an API endpoint -MODEL_PREFIX_TO_ENCODING: dict[str, str] = { +_MODEL_PREFIX_TO_ENCODING: dict[str, str] = { # chat "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. 
- "gpt-35-turbo": "cl100k_base", # Azure deployment name + "gpt-35-turbo-": "cl100k_base", # Azure deployment name + # fine-tuned + "ft:gpt-4": "cl100k_base", + "ft:gpt-3.5-turbo": "cl100k_base", + "ft:davinci-002": "cl100k_base", + "ft:babbage-002": "cl100k_base", } -MODEL_TO_ENCODING: dict[str, str] = { +_MODEL_TO_ENCODING: dict[str, str] = { # chat "gpt-4": "cl100k_base", "gpt-3.5-turbo": "cl100k_base", "gpt-35-turbo": "cl100k_base", # Azure deployment name - # text + # base + "davinci-002": "cl100k_base", + "babbage-002": "cl100k_base", + # embeddings + "text-embedding-ada-002": "cl100k_base", + # DEPRECATED MODELS + # text (DEPRECATED) "text-davinci-003": "p50k_base", "text-davinci-002": "p50k_base", "text-davinci-001": "r50k_base", @@ -27,19 +38,17 @@ MODEL_TO_ENCODING: dict[str, str] = { "curie": "r50k_base", "babbage": "r50k_base", "ada": "r50k_base", - # code + # code (DEPRECATED) "code-davinci-002": "p50k_base", "code-davinci-001": "p50k_base", "code-cushman-002": "p50k_base", "code-cushman-001": "p50k_base", "davinci-codex": "p50k_base", "cushman-codex": "p50k_base", - # edit + # edit (DEPRECATED) "text-davinci-edit-001": "p50k_edit", "code-davinci-edit-001": "p50k_edit", - # embeddings - "text-embedding-ada-002": "cl100k_base", - # old embeddings + # old embeddings (DEPRECATED) "text-similarity-davinci-001": "r50k_base", "text-similarity-curie-001": "r50k_base", "text-similarity-babbage-001": "r50k_base", @@ -58,13 +67,13 @@ MODEL_TO_ENCODING: dict[str, str] = { def encoding_for_model(model_name: str) -> Encoding: """Returns the encoding used by a model.""" encoding_name = None - if model_name in MODEL_TO_ENCODING: - encoding_name = MODEL_TO_ENCODING[model_name] + if model_name in _MODEL_TO_ENCODING: + encoding_name = _MODEL_TO_ENCODING[model_name] else: # Check if the model matches a known prefix # Prefix matching avoids needing library updates for every model version release # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE) 
- for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items(): + for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items(): if model_name.startswith(model_prefix): return get_encoding(model_encoding_name) diff --git a/tiktoken/registry.py b/tiktoken/registry.py index 52d8ec2..a753ce6 100644 --- a/tiktoken/registry.py +++ b/tiktoken/registry.py @@ -1,9 +1,10 @@ from __future__ import annotations +import functools import importlib import pkgutil import threading -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Sequence import tiktoken_ext @@ -14,6 +15,20 @@ ENCODINGS: dict[str, Encoding] = {} ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None +@functools.lru_cache() +def _available_plugin_modules() -> Sequence[str]: + # tiktoken_ext is a namespace package + # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes + # - we use namespace package pattern so `pkgutil.iter_modules` is fast + # - it's a separate top-level package because namespace subpackages of non-namespace + # packages don't quite do what you want with editable installs + mods = [] + plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".") + for _, mod_name, _ in plugin_mods: + mods.append(mod_name) + return mods + + def _find_constructors() -> None: global ENCODING_CONSTRUCTORS with _lock: @@ -21,14 +36,7 @@ def _find_constructors() -> None: return ENCODING_CONSTRUCTORS = {} - # tiktoken_ext is a namespace package - # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes - # - we use namespace package pattern so `pkgutil.iter_modules` is fast - # - it's a separate top-level package because namespace subpackages of non-namespace - # packages don't quite do what you want with editable installs - plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".") - - for _, mod_name, _ in 
plugin_mods: + for mod_name in _available_plugin_modules(): mod = importlib.import_module(mod_name) try: constructors = mod.ENCODING_CONSTRUCTORS @@ -57,7 +65,9 @@ def get_encoding(encoding_name: str) -> Encoding: assert ENCODING_CONSTRUCTORS is not None if encoding_name not in ENCODING_CONSTRUCTORS: - raise ValueError(f"Unknown encoding {encoding_name}") + raise ValueError( + f"Unknown encoding {encoding_name}. Plugins found: {_available_plugin_modules()}" + ) constructor = ENCODING_CONSTRUCTORS[encoding_name] enc = Encoding(**constructor()) From cc1848c08a8694d9db24ea817da368de63690542 Mon Sep 17 00:00:00 2001 From: Shantanu Date: Mon, 11 Sep 2023 18:51:34 -0700 Subject: [PATCH 2/4] Run only the most minimal test in emulated builds --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 8d1187e..2584947 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,3 +39,6 @@ test-skip = "*-macosx_arm64" before-test = "pip install pytest hypothesis" test-command = "pytest {project}/tests --import-mode=append" +[[tool.cibuildwheel.overrides]] +select = "*linux_aarch64" +test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'""" From 52fceb8fa1d287680e81c84bd300dbd1a1acd0cc Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome Date: Tue, 12 Sep 2023 06:18:00 +0200 Subject: [PATCH 3/4] Replace `<|endoftext|>` with constant (#186) --- tiktoken_ext/openai_public.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py index 16a6ec5..ba25cbb 100644 --- a/tiktoken_ext/openai_public.py +++ b/tiktoken_ext/openai_public.py @@ -17,7 +17,7 @@ def gpt2(): "explicit_n_vocab": 50257, "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", "mergeable_ranks": mergeable_ranks, - "special_tokens": {"<|endoftext|>": 50256}, + "special_tokens": {ENDOFTEXT: 
50256}, } From 39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80 Mon Sep 17 00:00:00 2001 From: Shantanu Date: Tue, 12 Sep 2023 17:39:23 -0700 Subject: [PATCH 4/4] Sync codebase --- CHANGELOG.md | 3 +++ Cargo.toml | 2 +- pyproject.toml | 2 +- tiktoken/__init__.py | 2 ++ tiktoken/model.py | 29 ++++++++++++++++++++--------- 5 files changed, 27 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2618724..8f37d7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ This is the changelog for the open source version of tiktoken. +## [v0.5.1] +- Add `encoding_name_for_model`, undo some renames to variables that are implementation details + ## [v0.5.0] - Add `tiktoken._educational` submodule to better document how byte pair encoding works - Ensure `encoding_for_model` knows about several new models diff --git a/Cargo.toml b/Cargo.toml index d789da9..0486639 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.5.0" +version = "0.5.1" edition = "2021" rust-version = "1.57.0" diff --git a/pyproject.toml b/pyproject.toml index 2584947..e3df78a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tiktoken" -version = "0.5.0" +version = "0.5.1" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py index 9ad09a3..3a531b1 100644 --- a/tiktoken/__init__.py +++ b/tiktoken/__init__.py @@ -1,4 +1,6 @@ +# This is the public API of tiktoken from .core import Encoding as Encoding from .model import encoding_for_model as encoding_for_model +from .model import encoding_name_for_model as encoding_name_for_model from .registry import get_encoding as get_encoding from .registry import list_encoding_names as list_encoding_names diff --git a/tiktoken/model.py b/tiktoken/model.py index a67830c..3f36693 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -4,7 
+4,7 @@ from .core import Encoding from .registry import get_encoding # TODO: these will likely be replaced by an API endpoint -_MODEL_PREFIX_TO_ENCODING: dict[str, str] = { +MODEL_PREFIX_TO_ENCODING: dict[str, str] = { # chat "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. @@ -16,7 +16,7 @@ _MODEL_PREFIX_TO_ENCODING: dict[str, str] = { "ft:babbage-002": "cl100k_base", } -_MODEL_TO_ENCODING: dict[str, str] = { +MODEL_TO_ENCODING: dict[str, str] = { # chat "gpt-4": "cl100k_base", "gpt-3.5-turbo": "cl100k_base", @@ -64,18 +64,21 @@ _MODEL_TO_ENCODING: dict[str, str] = { } -def encoding_for_model(model_name: str) -> Encoding: - """Returns the encoding used by a model.""" +def encoding_name_for_model(model_name: str) -> str: + """Returns the name of the encoding used by a model. + + Raises a KeyError if the model name is not recognised. + """ encoding_name = None - if model_name in _MODEL_TO_ENCODING: - encoding_name = _MODEL_TO_ENCODING[model_name] + if model_name in MODEL_TO_ENCODING: + encoding_name = MODEL_TO_ENCODING[model_name] else: # Check if the model matches a known prefix # Prefix matching avoids needing library updates for every model version release # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE) - for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items(): + for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items(): if model_name.startswith(model_prefix): - return get_encoding(model_encoding_name) + return model_encoding_name if encoding_name is None: raise KeyError( @@ -83,4 +86,12 @@ def encoding_for_model(model_name: str) -> Encoding: "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect." ) from None - return get_encoding(encoding_name) + return encoding_name + + +def encoding_for_model(model_name: str) -> Encoding: + """Returns the encoding used by a model. 
+ + Raises a KeyError if the model name is not recognised. + """ + return get_encoding(encoding_name_for_model(model_name))