mirror of
https://github.com/Mintplex-Labs/tiktoken.git
synced 2026-07-01 18:48:04 -04:00
Sync codebase
This commit is contained in:
@@ -276,6 +276,31 @@ class Encoding:
|
||||
"""
|
||||
return [self.decode_single_token_bytes(token) for token in tokens]
|
||||
|
||||
def decode_with_offsets(self, tokens: list[int]) -> tuple[str, list[int]]:
    """Decodes a list of tokens into a string and a list of offsets.

    Each offset is the index into text corresponding to the start of each token.
    If UTF-8 character boundaries do not line up with token boundaries, the offset is the index
    of the first character that contains bytes from the token.

    Offsets count characters (code points) of the returned string, not bytes. A token that
    decodes to no bytes at all is given the offset of the character that follows it.

    This will currently raise if given tokens that decode to invalid UTF-8; this behaviour may
    change in the future to be more permissive.

    >>> enc.decode_with_offsets([31373, 995])
    ('hello world', [0, 5])
    """
    token_bytes = self.decode_tokens_bytes(tokens)

    # Number of characters started by the bytes seen so far. A byte in the range
    # 0x80..0xBF is a UTF-8 continuation byte and never starts a new character.
    text_len = 0
    offsets = []
    for token in token_bytes:
        # If the token begins with a continuation byte, it belongs to the character
        # opened by the previous token, so step back by one. The emptiness check
        # avoids an IndexError on token[0] for a token with no bytes.
        starts_mid_char = bool(token) and 0x80 <= token[0] < 0xC0
        offsets.append(max(0, text_len - starts_mid_char))
        text_len += sum(1 for c in token if not 0x80 <= c < 0xC0)

    # TODO: assess correctness for errors="ignore" and errors="replace"
    text = b"".join(token_bytes).decode("utf-8", errors="strict")
    return text, offsets
|
||||
|
||||
def decode_batch(
|
||||
self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
|
||||
) -> list[str]:
|
||||
|
||||
+3
-1
@@ -8,12 +8,14 @@ MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
|
||||
# chat
|
||||
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
||||
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
||||
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
||||
}
|
||||
|
||||
MODEL_TO_ENCODING: dict[str, str] = {
|
||||
# chat
|
||||
"gpt-4": "cl100k_base",
|
||||
"gpt-3.5-turbo": "cl100k_base",
|
||||
"gpt-35-turbo": "cl100k_base", # Azure deployment name
|
||||
# text
|
||||
"text-davinci-003": "p50k_base",
|
||||
"text-davinci-002": "p50k_base",
|
||||
@@ -69,7 +71,7 @@ def encoding_for_model(model_name: str) -> Encoding:
|
||||
if encoding_name is None:
|
||||
raise KeyError(
|
||||
f"Could not automatically map {model_name} to a tokeniser. "
|
||||
"Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
|
||||
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
|
||||
) from None
|
||||
|
||||
return get_encoding(encoding_name)
|
||||
|
||||
Reference in New Issue
Block a user