From a7937793e7460ae8e815f506600b7248fad2a803 Mon Sep 17 00:00:00 2001
From: Shantanu <shantanu@openai.com>
Date: Mon, 11 Sep 2023 14:50:26 -0700
Subject: [PATCH 1/4] Sync codebase

---
 CHANGELOG.md             |  8 ++++++++
 Cargo.toml               |  2 +-
 pyproject.toml           |  2 +-
 tiktoken/_educational.py | 25 +++++++++++++++++--------
 tiktoken/model.py        | 33 +++++++++++++++++++++------------
 tiktoken/registry.py     | 30 ++++++++++++++++++++----------
 6 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 416706f..2618724 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.5.0]
+- Add `tiktoken._educational` submodule to better document how byte pair encoding works
+- Ensure `encoding_for_model` knows about several new models
+- Add `decode_with_offsets`
+- Better error for failures with the plugin mechanism
+- Make more tests public
+- Update versions of dependencies
+
 ## [v0.4.0]
 - Add `decode_batch` and `decode_bytes_batch`
 - Improve error messages and handling
diff --git a/Cargo.toml b/Cargo.toml
index 948b9f1..d789da9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/pyproject.toml b/pyproject.toml
index d8d600d..8d1187e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.4.0"
+version = "0.5.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/tiktoken/_educational.py b/tiktoken/_educational.py
index 0aba8f3..a6f5fcc 100644
--- a/tiktoken/_educational.py
+++ b/tiktoken/_educational.py
@@ -1,8 +1,5 @@
 """This is an educational implementation of the byte pair encoding algorithm."""
-from __future__ import annotations
-
 import collections
-import itertools
 from typing import Optional
 
 import regex
@@ -187,11 +184,23 @@ def bpe_train(
 
 
 def visualise_tokens(token_values: list[bytes]) -> None:
-    backgrounds = itertools.cycle(
-        [f"\u001b[48;5;{i}m".encode() for i in [167, 179, 185, 77, 80, 68, 134]]
-    )
-    interleaved = itertools.chain.from_iterable(zip(backgrounds, token_values))
-    print((b"".join(interleaved) + "\u001b[0m".encode()).decode("utf-8"))
+    background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
+    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
+    # visualise the token. Here, we'll just use the unicode replacement character to represent some
+    # fraction of a character.
+    unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values]
+
+    running_length = 0
+    last_color = None
+    for token in unicode_token_values:
+        color = background[running_length % len(background)]
+        if color == last_color:
+            color = background[(running_length + 1) % len(background)]
+            assert color != last_color
+        last_color = color
+        running_length += len(token)
+        print(color + token, end="")
+    print("\u001b[0m")
 
 
 def train_simple_encoding():
diff --git a/tiktoken/model.py b/tiktoken/model.py
index 26201ce..a67830c 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -4,19 +4,30 @@ from .core import Encoding
 from .registry import get_encoding
 
 # TODO: these will likely be replaced by an API endpoint
-MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
+_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
-    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
+    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
+    # fine-tuned
+    "ft:gpt-4": "cl100k_base",
+    "ft:gpt-3.5-turbo": "cl100k_base",
+    "ft:davinci-002": "cl100k_base",
+    "ft:babbage-002": "cl100k_base",
 }
 
-MODEL_TO_ENCODING: dict[str, str] = {
+_MODEL_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
     "gpt-35-turbo": "cl100k_base",  # Azure deployment name
-    # text
+    # base
+    "davinci-002": "cl100k_base",
+    "babbage-002": "cl100k_base",
+    # embeddings
+    "text-embedding-ada-002": "cl100k_base",
+    # DEPRECATED MODELS
+    # text (DEPRECATED)
     "text-davinci-003": "p50k_base",
     "text-davinci-002": "p50k_base",
     "text-davinci-001": "r50k_base",
@@ -27,19 +38,17 @@ MODEL_TO_ENCODING: dict[str, str] = {
     "curie": "r50k_base",
     "babbage": "r50k_base",
     "ada": "r50k_base",
-    # code
+    # code (DEPRECATED)
     "code-davinci-002": "p50k_base",
     "code-davinci-001": "p50k_base",
     "code-cushman-002": "p50k_base",
     "code-cushman-001": "p50k_base",
     "davinci-codex": "p50k_base",
     "cushman-codex": "p50k_base",
-    # edit
+    # edit (DEPRECATED)
     "text-davinci-edit-001": "p50k_edit",
     "code-davinci-edit-001": "p50k_edit",
-    # embeddings
-    "text-embedding-ada-002": "cl100k_base",
-    # old embeddings
+    # old embeddings (DEPRECATED)
     "text-similarity-davinci-001": "r50k_base",
     "text-similarity-curie-001": "r50k_base",
     "text-similarity-babbage-001": "r50k_base",
@@ -58,13 +67,13 @@ MODEL_TO_ENCODING: dict[str, str] = {
 def encoding_for_model(model_name: str) -> Encoding:
     """Returns the encoding used by a model."""
     encoding_name = None
-    if model_name in MODEL_TO_ENCODING:
-        encoding_name = MODEL_TO_ENCODING[model_name]
+    if model_name in _MODEL_TO_ENCODING:
+        encoding_name = _MODEL_TO_ENCODING[model_name]
     else:
         # Check if the model matches a known prefix
         # Prefix matching avoids needing library updates for every model version release
         # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
-        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
+        for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items():
             if model_name.startswith(model_prefix):
                 return get_encoding(model_encoding_name)
 
diff --git a/tiktoken/registry.py b/tiktoken/registry.py
index 52d8ec2..a753ce6 100644
--- a/tiktoken/registry.py
+++ b/tiktoken/registry.py
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
+import functools
 import importlib
 import pkgutil
 import threading
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Sequence
 
 import tiktoken_ext
 
@@ -14,6 +15,20 @@ ENCODINGS: dict[str, Encoding] = {}
 ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
 
 
+@functools.lru_cache()
+def _available_plugin_modules() -> Sequence[str]:
+    # tiktoken_ext is a namespace package
+    # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
+    # - we use namespace package pattern so `pkgutil.iter_modules` is fast
+    # - it's a separate top-level package because namespace subpackages of non-namespace
+    #   packages don't quite do what you want with editable installs
+    mods = []
+    plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
+    for _, mod_name, _ in plugin_mods:
+        mods.append(mod_name)
+    return mods
+
+
 def _find_constructors() -> None:
     global ENCODING_CONSTRUCTORS
     with _lock:
@@ -21,14 +36,7 @@ def _find_constructors() -> None:
             return
         ENCODING_CONSTRUCTORS = {}
 
-        # tiktoken_ext is a namespace package
-        # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
-        # - we use namespace package pattern so `pkgutil.iter_modules` is fast
-        # - it's a separate top-level package because namespace subpackages of non-namespace
-        #   packages don't quite do what you want with editable installs
-        plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
-
-        for _, mod_name, _ in plugin_mods:
+        for mod_name in _available_plugin_modules():
             mod = importlib.import_module(mod_name)
             try:
                 constructors = mod.ENCODING_CONSTRUCTORS
@@ -57,7 +65,9 @@ def get_encoding(encoding_name: str) -> Encoding:
             assert ENCODING_CONSTRUCTORS is not None
 
         if encoding_name not in ENCODING_CONSTRUCTORS:
-            raise ValueError(f"Unknown encoding {encoding_name}")
+            raise ValueError(
+                f"Unknown encoding {encoding_name}. Plugins found: {_available_plugin_modules()}"
+            )
 
         constructor = ENCODING_CONSTRUCTORS[encoding_name]
         enc = Encoding(**constructor())

From cc1848c08a8694d9db24ea817da368de63690542 Mon Sep 17 00:00:00 2001
From: Shantanu <shantanu@openai.com>
Date: Mon, 11 Sep 2023 18:51:34 -0700
Subject: [PATCH 2/4] Run only the most minimal test in emulated builds

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 8d1187e..2584947 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,3 +39,6 @@ test-skip = "*-macosx_arm64"
 before-test = "pip install pytest hypothesis"
 test-command = "pytest {project}/tests --import-mode=append"
 
+[[tool.cibuildwheel.overrides]]
+select = "*linux_aarch64"
+test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'"""

From 52fceb8fa1d287680e81c84bd300dbd1a1acd0cc Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <alvarobartt@gmail.com>
Date: Tue, 12 Sep 2023 06:18:00 +0200
Subject: [PATCH 3/4] Replace `<|endoftext|>` with constant (#186)

---
 tiktoken_ext/openai_public.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
index 16a6ec5..ba25cbb 100644
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@@ -17,7 +17,7 @@ def gpt2():
         "explicit_n_vocab": 50257,
         "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
-        "special_tokens": {"<|endoftext|>": 50256},
+        "special_tokens": {ENDOFTEXT: 50256},
     }
 
 

From 39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80 Mon Sep 17 00:00:00 2001
From: Shantanu <shantanu@openai.com>
Date: Tue, 12 Sep 2023 17:39:23 -0700
Subject: [PATCH 4/4] Sync codebase

---
 CHANGELOG.md         |  3 +++
 Cargo.toml           |  2 +-
 pyproject.toml       |  2 +-
 tiktoken/__init__.py |  2 ++
 tiktoken/model.py    | 29 ++++++++++++++++++++---------
 5 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2618724..8f37d7b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.5.1]
+- Add `encoding_name_for_model`, undo some renames to variables that are implementation details
+
 ## [v0.5.0]
 - Add `tiktoken._educational` submodule to better document how byte pair encoding works
 - Ensure `encoding_for_model` knows about several new models
diff --git a/Cargo.toml b/Cargo.toml
index d789da9..0486639 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.5.0"
+version = "0.5.1"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/pyproject.toml b/pyproject.toml
index 2584947..e3df78a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.5.0"
+version = "0.5.1"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py
index 9ad09a3..3a531b1 100644
--- a/tiktoken/__init__.py
+++ b/tiktoken/__init__.py
@@ -1,4 +1,6 @@
+# This is the public API of tiktoken
 from .core import Encoding as Encoding
 from .model import encoding_for_model as encoding_for_model
+from .model import encoding_name_for_model as encoding_name_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
diff --git a/tiktoken/model.py b/tiktoken/model.py
index a67830c..3f36693 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -4,7 +4,7 @@ from .core import Encoding
 from .registry import get_encoding
 
 # TODO: these will likely be replaced by an API endpoint
-_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
+MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
@@ -16,7 +16,7 @@ _MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     "ft:babbage-002": "cl100k_base",
 }
 
-_MODEL_TO_ENCODING: dict[str, str] = {
+MODEL_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
@@ -64,18 +64,21 @@ _MODEL_TO_ENCODING: dict[str, str] = {
 }
 
 
-def encoding_for_model(model_name: str) -> Encoding:
-    """Returns the encoding used by a model."""
+def encoding_name_for_model(model_name: str) -> str:
+    """Returns the name of the encoding used by a model.
+
+    Raises a KeyError if the model name is not recognised.
+    """
     encoding_name = None
-    if model_name in _MODEL_TO_ENCODING:
-        encoding_name = _MODEL_TO_ENCODING[model_name]
+    if model_name in MODEL_TO_ENCODING:
+        encoding_name = MODEL_TO_ENCODING[model_name]
     else:
         # Check if the model matches a known prefix
         # Prefix matching avoids needing library updates for every model version release
         # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
-        for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items():
+        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
             if model_name.startswith(model_prefix):
-                return get_encoding(model_encoding_name)
+                return model_encoding_name
 
     if encoding_name is None:
         raise KeyError(
@@ -83,4 +86,12 @@ def encoding_for_model(model_name: str) -> Encoding:
             "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
         ) from None
 
-    return get_encoding(encoding_name)
+    return encoding_name
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+    """Returns the encoding used by a model.
+
+    Raises a KeyError if the model name is not recognised.
+    """
+    return get_encoding(encoding_name_for_model(model_name))