Merge remote-tracking branch 'upstream/main'

2026-07-01 18:48:04 -04:00 · 2023-10-02 00:33:09 +02:00
parent 072dd12962 39f29cecdb
commit 4a91a815a3
8 changed files with 60 additions and 15 deletions
@@ -2,6 +2,17 @@

 This is the changelog for the open source version of tiktoken.

+## [v0.5.1]
+- Add `encoding_name_for_model`, undo some renames to variables that are implementation details
+
+## [v0.5.0]
+- Add `tiktoken._educational` submodule to better document how byte pair encoding works
+- Ensure `encoding_for_model` knows about several new models
+- Add `decode_with_offsets`
+- Better error for failures with the plugin mechanism
+- Make more tests public
+- Update versions of dependencies
+
 ## [v0.4.0]
 - Add `decode_batch` and `decode_bytes_batch`
 - Improve error messages and handling
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.4.0"
+version = "0.5.1"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
@@ -44,3 +44,6 @@ test-skip = "*-macosx_arm64"
 before-test = "pip install pytest hypothesis"
 test-command = "pytest {project}/tests --import-mode=append"

+[[tool.cibuildwheel.overrides]]
+select = "*linux_aarch64"
+test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'"""
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.2.0"
+version = "0.5.1"
 edition = "2021"
 rust-version = "1.57.0"

@@ -1,4 +1,6 @@
+# This is the public API of tiktoken
 from .core import Encoding as Encoding
 from .model import encoding_for_model as encoding_for_model
+from .model import encoding_name_for_model as encoding_name_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
@@ -1,8 +1,5 @@
 """This is an educational implementation of the byte pair encoding algorithm."""
-from __future__ import annotations
-
 import collections
-import itertools
 from typing import Optional

 import regex
@@ -187,11 +184,23 @@ def bpe_train(


 def visualise_tokens(token_values: list[bytes]) -> None:
-    backgrounds = itertools.cycle(
-        [f"\u001b[48;5;{i}m".encode() for i in [167, 179, 185, 77, 80, 68, 134]]
-    )
-    interleaved = itertools.chain.from_iterable(zip(backgrounds, token_values))
-    print((b"".join(interleaved) + "\u001b[0m".encode()).decode("utf-8"))
+    background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
+    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
+    # visualise the token. Here, we'll just use the unicode replacement character to represent some
+    # fraction of a character.
+    unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values]
+
+    running_length = 0
+    last_color = None
+    for token in unicode_token_values:
+        color = background[running_length % len(background)]
+        if color == last_color:
+            color = background[(running_length + 1) % len(background)]
+            assert color != last_color
+        last_color = color
+        running_length += len(token)
+        print(color + token, end="")
+    print("\u001b[0m")


 def train_simple_encoding():
@@ -15,14 +15,22 @@ MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
-    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
+    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
+    # fine-tuned
+    "ft:gpt-4": "cl100k_base",
+    "ft:gpt-3.5-turbo": "cl100k_base",
+    "ft:davinci-002": "cl100k_base",
+    "ft:babbage-002": "cl100k_base",
 }

 MODEL_TO_ENCODING: dict[str, str] = json.loads(pkg_resources.read_text("tiktoken", "model_to_encoding.json"))


-def encoding_for_model(model_name: str) -> Encoding:
-    """Returns the encoding used by a model."""
+def encoding_name_for_model(model_name: str) -> str:
+    """Returns the name of the encoding used by a model.
+
+    Raises a KeyError if the model name is not recognised.
+    """
    encoding_name = None
    if model_name in MODEL_TO_ENCODING:
        encoding_name = MODEL_TO_ENCODING[model_name]
@@ -32,7 +40,7 @@ def encoding_for_model(model_name: str) -> Encoding:
        # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
            if model_name.startswith(model_prefix):
-                return get_encoding(model_encoding_name)
+                return model_encoding_name

    if encoding_name is None:
        raise KeyError(
@@ -40,4 +48,12 @@ def encoding_for_model(model_name: str) -> Encoding:
            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
        ) from None

-    return get_encoding(encoding_name)
+    return encoding_name
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+    """Returns the encoding used by a model.
+
+    Raises a KeyError if the model name is not recognised.
+    """
+    return get_encoding(encoding_name_for_model(model_name))
@@ -1,4 +1,6 @@
 {
+    "davinci-002": "cl100k_base",
+    "babbage-002": "cl100k_base",
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
@@ -30,6 +32,7 @@
    "code-search-ada-code-001": "r50k_base",
    "gpt2": "gpt2",
    "gpt-3.5-turbo": "cl100k_base",
+    "gpt-35-turbo": "cl100k_base",
    "gpt-3.5-turbo-0301": "cl100k_base",
    "gpt-3.5-turbo-0613": "cl100k_base",
    "gpt-3.5-turbo-16k": "cl100k_base",
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import functools
 import importlib
 import pkgutil
 import threading