mirror of
https://github.com/Mintplex-Labs/tiktoken.git
synced 2026-07-01 18:48:04 -04:00
Merge remote-tracking branch 'upstream/main'
This commit is contained in:
@@ -2,6 +2,17 @@
|
||||
|
||||
This is the changelog for the open source version of tiktoken.
|
||||
|
||||
## [v0.5.1]
|
||||
- Add `encoding_name_for_model`, undo some renames to variables that are implementation details
|
||||
|
||||
## [v0.5.0]
|
||||
- Add `tiktoken._educational` submodule to better document how byte pair encoding works
|
||||
- Ensure `encoding_for_model` knows about several new models
|
||||
- Add `decode_with_offsets`
|
||||
- Better error for failures with the plugin mechanism
|
||||
- Make more tests public
|
||||
- Update versions of dependencies
|
||||
|
||||
## [v0.4.0]
|
||||
- Add `decode_batch` and `decode_bytes_batch`
|
||||
- Improve error messages and handling
|
||||
|
||||
+4
-1
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "tiktoken"
|
||||
version = "0.4.0"
|
||||
version = "0.5.1"
|
||||
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
|
||||
readme = "README.md"
|
||||
license = {file = "LICENSE"}
|
||||
@@ -44,3 +44,6 @@ test-skip = "*-macosx_arm64"
|
||||
before-test = "pip install pytest hypothesis"
|
||||
test-command = "pytest {project}/tests --import-mode=append"
|
||||
|
||||
[[tool.cibuildwheel.overrides]]
|
||||
select = "*linux_aarch64"
|
||||
test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'"""
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tiktoken"
|
||||
version = "0.2.0"
|
||||
version = "0.5.1"
|
||||
edition = "2021"
|
||||
rust-version = "1.57.0"
|
||||
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
# This is the public API of tiktoken
|
||||
from .core import Encoding as Encoding
|
||||
from .model import encoding_for_model as encoding_for_model
|
||||
from .model import encoding_name_for_model as encoding_name_for_model
|
||||
from .registry import get_encoding as get_encoding
|
||||
from .registry import list_encoding_names as list_encoding_names
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
"""This is an educational implementation of the byte pair encoding algorithm."""
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import itertools
|
||||
from typing import Optional
|
||||
|
||||
import regex
|
||||
@@ -187,11 +184,23 @@ def bpe_train(
|
||||
|
||||
|
||||
def visualise_tokens(token_values: list[bytes]) -> None:
    """Print the tokens to stdout, each on a coloured ANSI background.

    Each token is written with a 256-colour background escape sequence in
    front of it, and the terminal attributes are reset after the last token.
    Two consecutive tokens never receive the same background colour.

    Args:
        token_values: The raw bytes of each token, in display order.
    """
    background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
    # visualise the token. Here, we'll just use the unicode replacement character to represent some
    # fraction of a character.
    # Decoding token by token (instead of joining the raw bytes and decoding once, strictly)
    # means a token that ends mid-character cannot raise UnicodeDecodeError.
    unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values]

    running_length = 0
    last_color = None
    for token in unicode_token_values:
        # The colour index follows the number of characters printed so far.
        color = background[running_length % len(background)]
        if color == last_color:
            # Same colour as the previous token: step to the next palette entry instead.
            color = background[(running_length + 1) % len(background)]
            assert color != last_color
        last_color = color
        running_length += len(token)
        print(color + token, end="")
    print("\u001b[0m")
|
||||
|
||||
|
||||
def train_simple_encoding():
|
||||
|
||||
+21
-5
@@ -15,14 +15,22 @@ MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
|
||||
# chat
|
||||
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
||||
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
||||
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
||||
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
|
||||
# fine-tuned
|
||||
"ft:gpt-4": "cl100k_base",
|
||||
"ft:gpt-3.5-turbo": "cl100k_base",
|
||||
"ft:davinci-002": "cl100k_base",
|
||||
"ft:babbage-002": "cl100k_base",
|
||||
}
|
||||
|
||||
MODEL_TO_ENCODING: dict[str, str] = json.loads(pkg_resources.read_text("tiktoken", "model_to_encoding.json"))
|
||||
|
||||
|
||||
def encoding_for_model(model_name: str) -> Encoding:
|
||||
"""Returns the encoding used by a model."""
|
||||
def encoding_name_for_model(model_name: str) -> str:
|
||||
"""Returns the name of the encoding used by a model.
|
||||
|
||||
Raises a KeyError if the model name is not recognised.
|
||||
"""
|
||||
encoding_name = None
|
||||
if model_name in MODEL_TO_ENCODING:
|
||||
encoding_name = MODEL_TO_ENCODING[model_name]
|
||||
@@ -32,7 +40,7 @@ def encoding_for_model(model_name: str) -> Encoding:
|
||||
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
|
||||
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
|
||||
if model_name.startswith(model_prefix):
|
||||
return get_encoding(model_encoding_name)
|
||||
return model_encoding_name
|
||||
|
||||
if encoding_name is None:
|
||||
raise KeyError(
|
||||
@@ -40,4 +48,12 @@ def encoding_for_model(model_name: str) -> Encoding:
|
||||
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
|
||||
) from None
|
||||
|
||||
return get_encoding(encoding_name)
|
||||
return encoding_name
|
||||
|
||||
|
||||
def encoding_for_model(model_name: str) -> Encoding:
    """Return the `Encoding` that corresponds to the given model name.

    The model name is first resolved to an encoding name with
    `encoding_name_for_model`, and that name is then passed to `get_encoding`.

    Raises a KeyError if the model name is not recognised.
    """
    resolved_encoding_name = encoding_name_for_model(model_name)
    return get_encoding(resolved_encoding_name)
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
{
|
||||
"davinci-002": "cl100k_base",
|
||||
"babbage-002": "cl100k_base",
|
||||
"text-davinci-003": "p50k_base",
|
||||
"text-davinci-002": "p50k_base",
|
||||
"text-davinci-001": "r50k_base",
|
||||
@@ -30,6 +32,7 @@
|
||||
"code-search-ada-code-001": "r50k_base",
|
||||
"gpt2": "gpt2",
|
||||
"gpt-3.5-turbo": "cl100k_base",
|
||||
"gpt-35-turbo": "cl100k_base",
|
||||
"gpt-3.5-turbo-0301": "cl100k_base",
|
||||
"gpt-3.5-turbo-0613": "cl100k_base",
|
||||
"gpt-3.5-turbo-16k": "cl100k_base",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import importlib
|
||||
import pkgutil
|
||||
import threading
|
||||
|
||||
Reference in New Issue
Block a user