Sync codebase

2026-07-01 18:48:04 -04:00 · 2023-05-31 20:03:03 -07:00
parent 095924e02c
commit affbd6e50d
2 changed files with 28 additions and 1 deletions
@@ -276,6 +276,31 @@ class Encoding:
        """
        return [self.decode_single_token_bytes(token) for token in tokens]

+    def decode_with_offsets(self, tokens: list[int]) -> tuple[str, list[int]]:
+        """Decodes a list of tokens into a string and a list of offsets.
+
+        Each offset is the index into text corresponding to the start of each token.
+        If UTF-8 character boundaries do not line up with token boundaries, the offset is the index
+        of the first character that contains bytes from the token.
+
+        This will currently raise if given tokens that decode to invalid UTF-8; this behaviour may
+        change in the future to be more permissive.
+
+        >>> enc.decode_with_offsets([31373, 995])
+        ('hello world', [0, 5])
+        """
+        token_bytes = self.decode_tokens_bytes(tokens)
+        # Offsets count decoded characters, not bytes; 0x80-0xBF are UTF-8 continuation bytes.
+        text_len = 0
+        offsets = []
+        for token in token_bytes:
+            offsets.append(max(0, text_len - (0x80 <= token[0] < 0xC0)))  # mid-character start: point at the previous char
+            text_len += sum(1 for c in token if not 0x80 <= c < 0xC0)  # every non-continuation byte begins one character
+        # The strict decode below raises UnicodeDecodeError on invalid UTF-8, as the docstring warns.
+        # TODO: assess correctness for errors="ignore" and errors="replace"
+        text = b"".join(token_bytes).decode("utf-8", errors="strict")
+        return text, offsets
+
    def decode_batch(
        self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
    ) -> list[str]:
@@ -8,12 +8,14 @@ MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
+    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
 }

 MODEL_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
+    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
    # text
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
@@ -69,7 +71,7 @@ def encoding_for_model(model_name: str) -> Encoding:
    if encoding_name is None:
        raise KeyError(
            f"Could not automatically map {model_name} to a tokeniser. "
-            "Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
+            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
        ) from None

    return get_encoding(encoding_name)