Sync codebase

This commit is contained in:
Shantanu Jain
2023-05-31 20:03:03 -07:00
parent 095924e02c
commit affbd6e50d
2 changed files with 28 additions and 1 deletions
+25
View File
@@ -276,6 +276,31 @@ class Encoding:
"""
return [self.decode_single_token_bytes(token) for token in tokens]
def decode_with_offsets(self, tokens: list[int]) -> tuple[str, list[int]]:
    """Decode tokens into text and report where each token starts in that text.

    The returned offsets list has one entry per token: the index into the
    decoded string of the character at which that token begins. When a token
    boundary falls in the middle of a multi-byte UTF-8 character, the offset
    points at the first character that contains bytes from the token (i.e.
    the character that was started by an earlier token).

    The joined bytes are decoded strictly, so tokens that do not form valid
    UTF-8 currently raise ``UnicodeDecodeError``; this may become more
    permissive in the future.

    >>> enc.decode_with_offsets([31373, 995])
    ('hello world', [0, 5])
    """
    pieces = self.decode_tokens_bytes(tokens)

    starts = []
    chars_seen = 0
    for piece in pieces:
        # A leading UTF-8 continuation byte (0b10xxxxxx) means this token
        # finishes a character begun earlier, so step back by one character.
        begins_mid_char = 0x80 <= piece[0] < 0xC0
        start = chars_seen - 1 if begins_mid_char else chars_seen
        # Clamp so a continuation byte at the very start cannot yield -1.
        starts.append(start if start > 0 else 0)

        # Every byte that is not a continuation byte opens a new character.
        continuation_count = sum(0x80 <= byte < 0xC0 for byte in piece)
        chars_seen += len(piece) - continuation_count

    # TODO: assess correctness for errors="ignore" and errors="replace"
    raw = b"".join(pieces)
    return raw.decode("utf-8", errors="strict"), starts
def decode_batch(
self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
) -> list[str]:
+3 -1
View File
@@ -8,12 +8,14 @@ MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo": "cl100k_base", # Azure deployment name
}
MODEL_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base",
"gpt-35-turbo": "cl100k_base", # Azure deployment name
# text
"text-davinci-003": "p50k_base",
"text-davinci-002": "p50k_base",
@@ -69,7 +71,7 @@ def encoding_for_model(model_name: str) -> Encoding:
if encoding_name is None:
raise KeyError(
f"Could not automatically map {model_name} to a tokeniser. "
"Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
) from None
return get_encoding(encoding_name)