Use a JS based tokenizer for token counting (#1239)

* Use a rewritten tiktoken for token counting

* Fix lint, add test

* Fix type import, typo

* Remove @dqbd/tiktoken as "weak" dependency

* Move JS implementation of tiktoken into a separate package

* Upgrade to js-tiktoken@1.0.2

* Replace all occurrences of @dqbd/tiktoken

* Avoid bundling ranks, fix test with CRA

* Use AsyncCaller, cache promises

* Remove WASM configuration changes

* Upgrade to 1.0.6 for CRA fixes, mark js-tiktoken/lite as an external dependency

* Remove next.config.js
This commit is contained in:
David Duong
2023-05-17 15:44:29 +02:00
committed by GitHub
parent fcd3f4d2e3
commit d60eae5995
14 changed files with 87 additions and 308 deletions
-37
View File
@@ -83,21 +83,6 @@ LangChain can be used in Vercel / Next.js. We support using LangChain in fronten
import { OpenAI } from "langchain/llms/openai";
```
To use LangChain with Next.js (either with app/ or pages/), add the following to your `next.config.js` to enable support for WebAssembly modules (which is required by the tokenizer library `@dqbd/tiktoken`):
```js title="next.config.js"
const nextConfig = {
webpack(config) {
config.experiments = {
asyncWebAssembly: true,
layers: true,
};
return config;
},
};
```
### Deno / Supabase Edge Functions
LangChain can be used in Deno / Supabase Edge Functions. You can import it using the following syntax:
@@ -116,28 +101,6 @@ LangChain can be used in the browser. In our CI we test bundling LangChain with
import { OpenAI } from "langchain/llms/openai";
```
#### Create React App
If you're using `create-react-app`, note that it doesn't support WebAssembly modules by default, so the tokenizer library `@dqbd/tiktoken` will not work in the browser. You can follow the instructions [here](https://github.com/dqbd/tiktoken/tree/main/js#create-react-app) to enable support for WebAssembly modules.
#### Vite
If you're using Vite, you need to add the following to your `vite.config.js` to enable support for WebAssembly modules (which is required by the tokenizer library `@dqbd/tiktoken`):
```bash npm2yarn
npm install -D vite-plugin-wasm vite-plugin-top-level-await
```
```js title="vite.config.js"
import wasm from "vite-plugin-wasm";
import topLevelAwait from "vite-plugin-top-level-await";
import { defineConfig } from "vite";
export default defineConfig({
plugins: [wasm(), topLevelAwait()],
});
```
## Updating from <0.0.52
If you are updating from a version of LangChain prior to 0.0.52, you will need to update your imports to use the new path structure.
@@ -6,14 +6,6 @@ hide_table_of_contents: true
Finally, `TokenTextSplitter` splits a raw text string by first converting the text into BPE tokens, then splitting these tokens into chunks and converting the tokens within each chunk back into text.
To utilize the `TokenTextSplitter`, first install the accompanying required library
```bash npm2yarn
npm install -S @dqbd/tiktoken
```
Then, you can use it like so:
```typescript
import { Document } from "langchain/document";
import { TokenTextSplitter } from "langchain/text_splitter";
+1 -1
View File
@@ -521,11 +521,11 @@
},
"dependencies": {
"@anthropic-ai/sdk": "^0.4.3",
"@dqbd/tiktoken": "^1.0.7",
"ansi-styles": "^5.0.0",
"binary-extensions": "^2.2.0",
"expr-eval": "^2.0.2",
"flat": "^5.0.2",
"js-tiktoken": "^1.0.6",
"jsonpointer": "^5.0.1",
"ml-distance": "^4.0.0",
"object-hash": "^3.0.0",
+1
View File
@@ -26,6 +26,7 @@ export function listExternals() {
...Object.keys(packageJson.dependencies),
...Object.keys(packageJson.peerDependencies),
/node\:/,
/js-tiktoken/,
"axios", // axios is a dependency of openai
"pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js",
"@zilliz/milvus2-sdk-node/dist/milvus/const/Milvus.js",
+5 -26
View File
@@ -1,6 +1,7 @@
import type { TiktokenModel } from "@dqbd/tiktoken";
import { type TiktokenModel } from "js-tiktoken/lite";
import { encodingForModel } from "../util/tiktoken.js";
// https://www.npmjs.com/package/@dqbd/tiktoken
// https://www.npmjs.com/package/js-tiktoken
export const getModelNameForTiktoken = (modelName: string): TiktokenModel => {
if (modelName.startsWith("gpt-3.5-turbo-")) {
@@ -57,43 +58,21 @@ interface CalculateMaxTokenProps {
modelName: TiktokenModel;
}
export const importTiktoken = async () => {
try {
const { encoding_for_model } = await import("@dqbd/tiktoken");
return { encoding_for_model };
} catch (error) {
console.log(error);
return { encoding_for_model: null };
}
};
export const calculateMaxTokens = async ({
prompt,
modelName,
}: CalculateMaxTokenProps) => {
const { encoding_for_model } = await importTiktoken();
// fallback to approximate calculation if tiktoken is not available
let numTokens = Math.ceil(prompt.length / 4);
try {
if (encoding_for_model) {
const encoding = encoding_for_model(getModelNameForTiktoken(modelName));
const tokenized = encoding.encode(prompt);
numTokens = tokenized.length;
encoding.free();
}
numTokens = (await encodingForModel(modelName)).encode(prompt).length;
} catch (error) {
console.warn(
"Failed to calculate number of tokens with tiktoken, falling back to approximate count",
error
"Failed to calculate number of tokens, falling back to approximate count"
);
}
const maxTokens = getModelContextSize(modelName);
return maxTokens - numTokens;
};
+18 -28
View File
@@ -1,4 +1,4 @@
import type { Tiktoken } from "@dqbd/tiktoken";
import { type Tiktoken } from "js-tiktoken/lite";
import {
BaseChatMessage,
BasePromptValue,
@@ -6,7 +6,8 @@ import {
} from "../schema/index.js";
import { CallbackManager, Callbacks } from "../callbacks/manager.js";
import { AsyncCaller, AsyncCallerParams } from "../util/async_caller.js";
import { getModelNameForTiktoken, importTiktoken } from "./count_tokens.js";
import { getModelNameForTiktoken } from "./count_tokens.js";
import { encodingForModel } from "../util/tiktoken.js";
const getVerbosity = () => false;
@@ -125,38 +126,27 @@ export abstract class BaseLanguageModel
private _encoding?: Tiktoken;
private _registry?: FinalizationRegistry<Tiktoken>;
async getNumTokens(text: string) {
// fallback to approximate calculation if tiktoken is not available
let numTokens = Math.ceil(text.length / 4);
try {
if (!this._encoding) {
const { encoding_for_model } = await importTiktoken();
// modelName only exists in openai subclasses, but tiktoken only supports
// openai tokenisers anyway, so for other subclasses we default to gpt2
if (encoding_for_model) {
this._encoding = encoding_for_model(
"modelName" in this
? getModelNameForTiktoken(this.modelName as string)
: "gpt2"
);
// We need to register a finalizer to free the tokenizer when the
// model is garbage collected.
this._registry = new FinalizationRegistry((t) => t.free());
this._registry.register(this, this._encoding);
}
if (!this._encoding) {
try {
this._encoding = await encodingForModel(
"modelName" in this
? getModelNameForTiktoken(this.modelName as string)
: "gpt2"
);
} catch (error) {
console.warn(
"Failed to calculate number of tokens, falling back to approximate count",
error
);
}
}
if (this._encoding) {
numTokens = this._encoding.encode(text).length;
}
} catch (error) {
console.warn(
"Failed to calculate number of tokens with tiktoken, falling back to approximate count",
error
);
if (this._encoding) {
numTokens = this._encoding.encode(text).length;
}
return numTokens;
+1 -1
View File
@@ -1,4 +1,4 @@
import type { TiktokenModel } from "@dqbd/tiktoken";
import type { TiktokenModel } from "js-tiktoken/lite";
import { DEFAULT_SQL_DATABASE_PROMPT } from "./sql_db_prompt.js";
import { BaseChain, ChainInputs } from "../base.js";
import type { OpenAI } from "../../llms/openai.js";
+1 -1
View File
@@ -1,4 +1,4 @@
import { TiktokenModel } from "@dqbd/tiktoken";
import type { TiktokenModel } from "js-tiktoken/lite";
import {
Configuration,
ConfigurationParameters,
+4 -23
View File
@@ -1,5 +1,6 @@
import type * as tiktoken from "@dqbd/tiktoken";
import type * as tiktoken from "js-tiktoken";
import { Document } from "./document.js";
import { getEncoding } from "./util/tiktoken.js";
export interface TextSplitterParams {
chunkSize: number;
@@ -242,8 +243,6 @@ export class TokenTextSplitter
private tokenizer: tiktoken.Tiktoken;
private registry: FinalizationRegistry<tiktoken.Tiktoken>;
constructor(fields?: Partial<TokenTextSplitterParams>) {
super(fields);
@@ -254,12 +253,7 @@ export class TokenTextSplitter
async splitText(text: string): Promise<string[]> {
if (!this.tokenizer) {
const tiktoken = await TokenTextSplitter.imports();
this.tokenizer = tiktoken.get_encoding(this.encodingName);
// We need to register a finalizer to free the tokenizer when the
// splitter is garbage collected.
this.registry = new FinalizationRegistry((t) => t.free());
this.registry.register(this, this.tokenizer);
this.tokenizer = await getEncoding(this.encodingName);
}
const splits: string[] = [];
@@ -274,10 +268,8 @@ export class TokenTextSplitter
let cur_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
let chunk_ids = input_ids.slice(start_idx, cur_idx);
const decoder = new TextDecoder();
while (start_idx < input_ids.length) {
splits.push(decoder.decode(this.tokenizer.decode(chunk_ids)));
splits.push(this.tokenizer.decode(chunk_ids));
start_idx += this.chunkSize - this.chunkOverlap;
cur_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
@@ -286,17 +278,6 @@ export class TokenTextSplitter
return splits;
}
static async imports(): Promise<typeof tiktoken> {
try {
return await import("@dqbd/tiktoken");
} catch (err) {
console.error(err);
throw new Error(
"Please install @dqbd/tiktoken as a dependency with, e.g. `npm install -S @dqbd/tiktoken`"
);
}
}
}
export type MarkdownTextSplitterParams = TextSplitterParams;
+44
View File
@@ -0,0 +1,44 @@
import {
Tiktoken,
TiktokenBPE,
TiktokenEncoding,
TiktokenModel,
getEncodingNameForModel,
} from "js-tiktoken/lite";
import { AsyncCaller } from "./async_caller.js";
// Rank downloads keyed by encoding name. The stored value is the promise
// itself, so an entry exists as soon as a download has been started.
const cache: Record<string, Promise<TiktokenBPE>> = {};
// Shared caller through which the rank files are fetched.
const caller = /* #__PURE__ */ new AsyncCaller({});
/**
 * Builds a `Tiktoken` tokenizer for the given encoding.
 *
 * The rank data is downloaded from `tiktoken.pages.dev` the first time an
 * encoding is requested; the pending promise is stored in the module-level
 * cache so later calls for the same encoding reuse it. If the download (or
 * JSON parsing) fails, the cache entry is removed and the error is rethrown,
 * so a subsequent call starts a new download.
 *
 * @param encoding - Name of the encoding whose ranks should be loaded.
 * @param options - Optional settings. `signal` is forwarded only to the
 *   download this call starts (it is not consulted when a cached promise
 *   already exists). `extendedSpecialTokens` is passed to the `Tiktoken`
 *   constructor on every call.
 * @returns A new `Tiktoken` instance backed by the (possibly cached) ranks.
 */
export async function getEncoding(
  encoding: TiktokenEncoding,
  options?: {
    signal?: AbortSignal;
    extendedSpecialTokens?: Record<string, number>;
  }
) {
  const alreadyRequested = encoding in cache;
  if (!alreadyRequested) {
    const ranksUrl = `https://tiktoken.pages.dev/js/${encoding}.json`;
    const pendingRanks = caller
      .fetch(ranksUrl, { signal: options?.signal })
      .then((response) => response.json());
    // Store the promise before awaiting anything so that concurrent callers
    // share this download; evict it on failure so the next call can retry.
    cache[encoding] = pendingRanks.catch((reason) => {
      delete cache[encoding];
      throw reason;
    });
  }

  const ranks = await cache[encoding];
  return new Tiktoken(ranks, options?.extendedSpecialTokens);
}
/**
 * Builds a `Tiktoken` tokenizer for a model name.
 *
 * The model is mapped to its encoding name with `getEncodingNameForModel`,
 * and the work is then delegated to `getEncoding` with the same options.
 *
 * @param model - Model name to resolve to an encoding.
 * @param options - Forwarded unchanged to `getEncoding`.
 * @returns The tokenizer produced by `getEncoding` for the resolved encoding.
 */
export async function encodingForModel(
  model: TiktokenModel,
  options?: {
    signal?: AbortSignal;
    extendedSpecialTokens?: Record<string, number>;
  }
) {
  const encodingName = getEncodingNameForModel(model);
  return getEncoding(encodingName, options);
}
-14
View File
@@ -1,14 +0,0 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
reactStrictMode: true,
webpack(config) {
config.experiments = {
asyncWebAssembly: true,
layers: true,
};
return config;
},
};
module.exports = nextConfig;
+1 -3
View File
@@ -14,8 +14,6 @@
},
"devDependencies": {
"typescript": "^5.0.0",
"vite": "^4.2.0",
"vite-plugin-top-level-await": "^1.3.0",
"vite-plugin-wasm": "^3.2.2"
"vite": "^4.2.0"
}
}
-7
View File
@@ -1,7 +0,0 @@
import wasm from "vite-plugin-wasm";
import topLevelAwait from "vite-plugin-top-level-await";
import { defineConfig } from "vite";
export default defineConfig({
plugins: [wasm(), topLevelAwait()],
});
+11 -159
View File
@@ -4487,13 +4487,6 @@ __metadata:
languageName: node
linkType: hard
"@dqbd/tiktoken@npm:^1.0.7":
version: 1.0.7
resolution: "@dqbd/tiktoken@npm:1.0.7"
checksum: 81049797dcffa101c57b68a78be19b11958edf6e6cfffab7ccd3ec2e5290e3b27bc25f1060a1c34e1cb94f22e0f46d9aafa925d383fb4559738b644aa01c0fb1
languageName: node
linkType: hard
"@esbuild-kit/cjs-loader@npm:^2.4.2":
version: 2.4.2
resolution: "@esbuild-kit/cjs-loader@npm:2.4.2"
@@ -6813,18 +6806,6 @@ __metadata:
languageName: node
linkType: hard
"@rollup/plugin-virtual@npm:^3.0.1":
version: 3.0.1
resolution: "@rollup/plugin-virtual@npm:3.0.1"
peerDependencies:
rollup: ^1.20.0||^2.0.0||^3.0.0
peerDependenciesMeta:
rollup:
optional: true
checksum: 93800884956299b071383e1a051323ed38acfffdb64bbd6f3b909a052e506e236eb9022e43b3a039425aa45a33367c9fd50f85a3a867a1259a9862086143bd42
languageName: node
linkType: hard
"@rollup/pluginutils@npm:^3.1.0":
version: 3.1.0
resolution: "@rollup/pluginutils@npm:3.1.0"
@@ -7335,120 +7316,6 @@ __metadata:
languageName: node
linkType: hard
"@swc/core-darwin-arm64@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-darwin-arm64@npm:1.3.49"
conditions: os=darwin & cpu=arm64
languageName: node
linkType: hard
"@swc/core-darwin-x64@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-darwin-x64@npm:1.3.49"
conditions: os=darwin & cpu=x64
languageName: node
linkType: hard
"@swc/core-linux-arm-gnueabihf@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-linux-arm-gnueabihf@npm:1.3.49"
conditions: os=linux & cpu=arm
languageName: node
linkType: hard
"@swc/core-linux-arm64-gnu@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-linux-arm64-gnu@npm:1.3.49"
conditions: os=linux & cpu=arm64 & libc=glibc
languageName: node
linkType: hard
"@swc/core-linux-arm64-musl@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-linux-arm64-musl@npm:1.3.49"
conditions: os=linux & cpu=arm64 & libc=musl
languageName: node
linkType: hard
"@swc/core-linux-x64-gnu@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-linux-x64-gnu@npm:1.3.49"
conditions: os=linux & cpu=x64 & libc=glibc
languageName: node
linkType: hard
"@swc/core-linux-x64-musl@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-linux-x64-musl@npm:1.3.49"
conditions: os=linux & cpu=x64 & libc=musl
languageName: node
linkType: hard
"@swc/core-win32-arm64-msvc@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-win32-arm64-msvc@npm:1.3.49"
conditions: os=win32 & cpu=arm64
languageName: node
linkType: hard
"@swc/core-win32-ia32-msvc@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-win32-ia32-msvc@npm:1.3.49"
conditions: os=win32 & cpu=ia32
languageName: node
linkType: hard
"@swc/core-win32-x64-msvc@npm:1.3.49":
version: 1.3.49
resolution: "@swc/core-win32-x64-msvc@npm:1.3.49"
conditions: os=win32 & cpu=x64
languageName: node
linkType: hard
"@swc/core@npm:^1.3.10":
version: 1.3.49
resolution: "@swc/core@npm:1.3.49"
dependencies:
"@swc/core-darwin-arm64": 1.3.49
"@swc/core-darwin-x64": 1.3.49
"@swc/core-linux-arm-gnueabihf": 1.3.49
"@swc/core-linux-arm64-gnu": 1.3.49
"@swc/core-linux-arm64-musl": 1.3.49
"@swc/core-linux-x64-gnu": 1.3.49
"@swc/core-linux-x64-musl": 1.3.49
"@swc/core-win32-arm64-msvc": 1.3.49
"@swc/core-win32-ia32-msvc": 1.3.49
"@swc/core-win32-x64-msvc": 1.3.49
peerDependencies:
"@swc/helpers": ^0.5.0
dependenciesMeta:
"@swc/core-darwin-arm64":
optional: true
"@swc/core-darwin-x64":
optional: true
"@swc/core-linux-arm-gnueabihf":
optional: true
"@swc/core-linux-arm64-gnu":
optional: true
"@swc/core-linux-arm64-musl":
optional: true
"@swc/core-linux-x64-gnu":
optional: true
"@swc/core-linux-x64-musl":
optional: true
"@swc/core-win32-arm64-msvc":
optional: true
"@swc/core-win32-ia32-msvc":
optional: true
"@swc/core-win32-x64-msvc":
optional: true
peerDependenciesMeta:
"@swc/helpers":
optional: true
checksum: 7234f38451dd765ea94cb44236f261603014e27bff6ecd133d9ba8a2d39314e9949e43bb77ffccd21f5e6c942ad7acf444de8972744ca24ac8f3ccecaea849a1
languageName: node
linkType: hard
"@swc/helpers@npm:0.4.14":
version: 0.4.14
resolution: "@swc/helpers@npm:0.4.14"
@@ -9952,7 +9819,7 @@ __metadata:
languageName: node
linkType: hard
"base64-js@npm:^1.3.1":
"base64-js@npm:^1.3.1, base64-js@npm:^1.5.1":
version: 1.5.1
resolution: "base64-js@npm:1.5.1"
checksum: 669632eb3745404c2f822a18fc3a0122d2f9a7a13f7fb8b5823ee19d1d2ff9ee5b52c53367176ea4ad093c332fd5ab4bd0ebae5a8e27917a4105a4cfc86b1005
@@ -17721,6 +17588,15 @@ __metadata:
languageName: node
linkType: hard
"js-tiktoken@npm:^1.0.6":
version: 1.0.6
resolution: "js-tiktoken@npm:1.0.6"
dependencies:
base64-js: ^1.5.1
checksum: 05a1b3f895447194a0ac17a56229e47eae3816cd851f12ee947b5cdc2f4439773ed52938f4f6459f5c45dc0f4a15ac1a882cde61e47e31b6a84b7129fe1f63c2
languageName: node
linkType: hard
"js-tokens@npm:^3.0.0 || ^4.0.0, js-tokens@npm:^4.0.0":
version: 4.0.0
resolution: "js-tokens@npm:4.0.0"
@@ -18041,7 +17917,6 @@ __metadata:
"@aws-sdk/client-lambda": ^3.310.0
"@aws-sdk/client-s3": ^3.310.0
"@clickhouse/client": ^0.0.14
"@dqbd/tiktoken": ^1.0.7
"@faker-js/faker": ^7.6.0
"@getmetal/metal-sdk": ^2.0.1
"@huggingface/inference": ^1.5.1
@@ -18085,6 +17960,7 @@ __metadata:
hnswlib-node: ^1.4.2
html-to-text: ^9.0.5
jest: ^29.5.0
js-tiktoken: ^1.0.6
jsonpointer: ^5.0.1
mammoth: ^1.5.1
ml-distance: ^4.0.0
@@ -24891,8 +24767,6 @@ __metadata:
langchain: "workspace:*"
typescript: ^5.0.0
vite: ^4.2.0
vite-plugin-top-level-await: ^1.3.0
vite-plugin-wasm: ^3.2.2
languageName: unknown
linkType: soft
@@ -26235,28 +26109,6 @@ __metadata:
languageName: node
linkType: hard
"vite-plugin-top-level-await@npm:^1.3.0":
version: 1.3.0
resolution: "vite-plugin-top-level-await@npm:1.3.0"
dependencies:
"@rollup/plugin-virtual": ^3.0.1
"@swc/core": ^1.3.10
uuid: ^9.0.0
peerDependencies:
vite: ">=2.8"
checksum: 8b59eaec6ae37b12330e5159d56992921d3116414d037fca62a9c333650dfe5e037c06fcb198c22cede675bf40ba78bc4d38931da60cf8a9aeb4eb0cf57c9f32
languageName: node
linkType: hard
"vite-plugin-wasm@npm:^3.2.2":
version: 3.2.2
resolution: "vite-plugin-wasm@npm:3.2.2"
peerDependencies:
vite: ^2 || ^3 || ^4
checksum: ed9b56b3cd47e6376c21592f4d24f818334d5796226f6c1dfca89a4f744d69a34cb85dd5e06f164a2670827113ccfa5f03d0361e51210012706123374b0f8277
languageName: node
linkType: hard
"vite@npm:^3.0.0 || ^4.0.0, vite@npm:^4.2.0":
version: 4.2.1
resolution: "vite@npm:4.2.1"