Merge remote-tracking branch 'upstream/main'

This commit is contained in:
Tat Dat Duong
2023-10-02 00:33:09 +02:00
8 changed files with 60 additions and 15 deletions
+11
View File
@@ -2,6 +2,17 @@
This is the changelog for the open source version of tiktoken.
## [v0.5.1]
- Add `encoding_name_for_model`, undo some renames to variables that are implementation details
## [v0.5.0]
- Add `tiktoken._educational` submodule to better document how byte pair encoding works
- Ensure `encoding_for_model` knows about several new models
- Add `decode_with_offsets`
- Better error for failures with the plugin mechanism
- Make more tests public
- Update versions of dependencies
## [v0.4.0]
- Add `decode_batch` and `decode_bytes_batch`
- Improve error messages and handling
+4 -1
View File
@@ -1,6 +1,6 @@
[project]
name = "tiktoken"
version = "0.4.0"
version = "0.5.1"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
@@ -44,3 +44,6 @@ test-skip = "*-macosx_arm64"
before-test = "pip install pytest hypothesis"
test-command = "pytest {project}/tests --import-mode=append"
[[tool.cibuildwheel.overrides]]
select = "*linux_aarch64"
test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'"""
+1 -1
View File
@@ -1,6 +1,6 @@
[package]
name = "tiktoken"
version = "0.2.0"
version = "0.5.1"
edition = "2021"
rust-version = "1.57.0"
+2
View File
@@ -1,4 +1,6 @@
# This is the public API of tiktoken
from .core import Encoding as Encoding
from .model import encoding_for_model as encoding_for_model
from .model import encoding_name_for_model as encoding_name_for_model
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names
+17 -8
View File
@@ -1,8 +1,5 @@
"""This is an educational implementation of the byte pair encoding algorithm."""
from __future__ import annotations
import collections
import itertools
from typing import Optional
import regex
@@ -187,11 +184,23 @@ def bpe_train(
def visualise_tokens(token_values: list[bytes]) -> None:
backgrounds = itertools.cycle(
[f"\u001b[48;5;{i}m".encode() for i in [167, 179, 185, 77, 80, 68, 134]]
)
interleaved = itertools.chain.from_iterable(zip(backgrounds, token_values))
print((b"".join(interleaved) + "\u001b[0m".encode()).decode("utf-8"))
background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
# If token boundaries do not occur at unicode character boundaries, it's unclear how best to
# visualise the token. Here, we'll just use the unicode replacement character to represent some
# fraction of a character.
unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values]
running_length = 0
last_color = None
for token in unicode_token_values:
color = background[running_length % len(background)]
if color == last_color:
color = background[(running_length + 1) % len(background)]
assert color != last_color
last_color = color
running_length += len(token)
print(color + token, end="")
print("\u001b[0m")
def train_simple_encoding():
+21 -5
View File
@@ -15,14 +15,22 @@ MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo": "cl100k_base", # Azure deployment name
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
# fine-tuned
"ft:gpt-4": "cl100k_base",
"ft:gpt-3.5-turbo": "cl100k_base",
"ft:davinci-002": "cl100k_base",
"ft:babbage-002": "cl100k_base",
}
MODEL_TO_ENCODING: dict[str, str] = json.loads(pkg_resources.read_text("tiktoken", "model_to_encoding.json"))
def encoding_for_model(model_name: str) -> Encoding:
"""Returns the encoding used by a model."""
def encoding_name_for_model(model_name: str) -> str:
"""Returns the name of the encoding used by a model.
Raises a KeyError if the model name is not recognised.
"""
encoding_name = None
if model_name in MODEL_TO_ENCODING:
encoding_name = MODEL_TO_ENCODING[model_name]
@@ -32,7 +40,7 @@ def encoding_for_model(model_name: str) -> Encoding:
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
if model_name.startswith(model_prefix):
return get_encoding(model_encoding_name)
return model_encoding_name
if encoding_name is None:
raise KeyError(
@@ -40,4 +48,12 @@ def encoding_for_model(model_name: str) -> Encoding:
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
) from None
return get_encoding(encoding_name)
return encoding_name
def encoding_for_model(model_name: str) -> Encoding:
"""Returns the encoding used by a model.
Raises a KeyError if the model name is not recognised.
"""
return get_encoding(encoding_name_for_model(model_name))
+3
View File
@@ -1,4 +1,6 @@
{
"davinci-002": "cl100k_base",
"babbage-002": "cl100k_base",
"text-davinci-003": "p50k_base",
"text-davinci-002": "p50k_base",
"text-davinci-001": "r50k_base",
@@ -30,6 +32,7 @@
"code-search-ada-code-001": "r50k_base",
"gpt2": "gpt2",
"gpt-3.5-turbo": "cl100k_base",
"gpt-35-turbo": "cl100k_base",
"gpt-3.5-turbo-0301": "cl100k_base",
"gpt-3.5-turbo-0613": "cl100k_base",
"gpt-3.5-turbo-16k": "cl100k_base",
+1
View File
@@ -1,5 +1,6 @@
from __future__ import annotations
import functools
import importlib
import pkgutil
import threading