Compare commits

...

4 Commits

2 changed files with 131 additions and 12 deletions
+40
View File
@@ -20,3 +20,43 @@ const serviceContext = serviceContextFromDefaults({ llm: openaiLLM });
- [OpenAI](../../api/classes/OpenAI.md)
- [ServiceContext](../../api/interfaces/ServiceContext.md)
## Usage
Each LLM instance keeps a running total of its API consumption across every call made through it. Read the accumulated totals from the `llm.usage` property.
_Note: usage is not tracked for streaming calls._
```javascript
import { OpenAI } from "llamaindex";
// Create a new instance of OpenAI with the specified model and temperature
const llm = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
/**
 * Runs one chat round-trip against the shared `llm` instance, then prints
 * the reply followed by the usage totals accumulated on that instance.
 */
async function ask() {
  // Issue the chat request: a system prompt plus a single user question
  const reply = await llm.chat([
    { role: "system", content: "You are a helpful llama." },
    { role: "user", content: "Where do llama live?" },
  ]);

  // Print the reply object.
  // It carries the assistant's message together with the usage figures
  // reported for this call (prompt, completion, and total tokens).
  console.log(reply);

  // Print the running totals kept on the LLM object.
  // These cover prompt tokens, completion tokens, cost, and compute seconds.
  console.log(llm.usage);
}
```
+91 -12
View File
@@ -45,6 +45,8 @@ export interface ChatResponse {
message: ChatMessage;
raw?: Record<string, any>;
delta?: string;
metrics?: any;
usage?: Usage;
}
// NOTE in case we need CompletionResponse to diverge from ChatResponse in the future
@@ -98,19 +100,44 @@ export interface LLM {
* Calculates the number of tokens needed for the given chat messages
*/
tokens(messages: ChatMessage[]): number;
/**
* Returns the usage information of the LLM
*/
usage: Usage;
}
/**
 * Context-window sizes and token pricing for the GPT-4 model family.
 *
 * `promptCost` and `completionCost` are prices per 1,000 tokens: the OpenAI
 * usage accounting in this file multiplies token counts by these values and
 * divides by 1000. The currency is presumably USD — confirm against the
 * provider's pricing page before relying on the figures.
 *
 * Each model is listed exactly once; duplicate keys in an object literal are
 * a compile error and would otherwise be silently overridden at runtime.
 */
export const GPT4_MODELS = {
  "gpt-4": { contextWindow: 8192, promptCost: 0.03, completionCost: 0.06 },
  "gpt-4-32k": { contextWindow: 32768, promptCost: 0.06, completionCost: 0.12 },
  "gpt-4-1106-preview": {
    contextWindow: 128000,
    promptCost: 0.01,
    completionCost: 0.03,
  },
  "gpt-4-vision-preview": {
    contextWindow: 8192,
    promptCost: 0.01,
    completionCost: 0.03,
  },
};
/**
 * Context-window sizes and token pricing for the GPT-3.5 model family.
 *
 * `promptCost` and `completionCost` are prices per 1,000 tokens: the OpenAI
 * usage accounting in this file multiplies token counts by these values and
 * divides by 1000. The currency is presumably USD — confirm against the
 * provider's pricing page before relying on the figures.
 *
 * Each model is listed exactly once; duplicate keys in an object literal are
 * a compile error and would otherwise be silently overridden at runtime.
 */
export const GPT35_MODELS = {
  "gpt-3.5-turbo": {
    contextWindow: 4096,
    promptCost: 0.001,
    completionCost: 0.002,
  },
  "gpt-3.5-turbo-16k": {
    contextWindow: 16384,
    promptCost: 0.001,
    completionCost: 0.002,
  },
  "gpt-3.5-turbo-1106": {
    contextWindow: 16384,
    promptCost: 0.001,
    completionCost: 0.002,
  },
};
/**
@@ -121,6 +148,19 @@ export const ALL_AVAILABLE_OPENAI_MODELS = {
...GPT35_MODELS,
};
/**
 * Running totals of what an LLM instance has consumed.
 * A fresh instance starts with every counter at zero.
 */
export class Usage {
  /** Number of prompt tokens accumulated so far. */
  promptTokens = 0;
  /** Number of completion tokens accumulated so far. */
  completionTokens = 0;
  /** Accumulated cost of the recorded calls. */
  cost = 0;
  /** Accumulated compute time, in seconds. */
  computeSeconds = 0;
}
/**
* OpenAI LLM implementation
*/
@@ -149,6 +189,7 @@ export class OpenAI implements LLM {
callbackManager?: CallbackManager;
usage: Usage;
constructor(
init?: Partial<OpenAI> & {
azure?: AzureOpenAIConfig;
@@ -164,6 +205,8 @@ export class OpenAI implements LLM {
this.additionalChatOptions = init?.additionalChatOptions;
this.additionalSessionOptions = init?.additionalSessionOptions;
this.usage = new Usage();
if (init?.azure || shouldUseAzure()) {
const azureConfig = getAzureConfigFromEnv({
...init?.azure,
@@ -278,8 +321,21 @@ export class OpenAI implements LLM {
});
const content = response.choices[0].message?.content ?? "";
// Update usage
this.usage.promptTokens += response.usage?.prompt_tokens || 0;
this.usage.completionTokens += response.usage?.completion_tokens || 0;
this.usage.cost +=
((response.usage?.prompt_tokens || 0) *
ALL_AVAILABLE_OPENAI_MODELS[this.model].promptCost) /
1000 +
((response.usage?.completion_tokens || 0) *
ALL_AVAILABLE_OPENAI_MODELS[this.model].completionCost) /
1000;
return {
message: { content, role: response.choices[0].message.role },
usage: response.usage,
} as R;
}
@@ -373,23 +429,27 @@ export const ALL_AVAILABLE_LLAMADEUCE_MODELS = {
replicateApi:
"replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48",
//^ Previous 70b model. This is also actually 4 bit, although not exllama.
costPerSecond: 0.0014,
},
"Llama-2-70b-chat-4bit": {
contextWindow: 4096,
replicateApi:
"meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
//^ Model is based off of exllama 4bit.
costPerSecond: 0.0014,
},
"Llama-2-13b-chat-old": {
contextWindow: 4096,
replicateApi:
"a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
costPerSecond: 0.000725,
},
//^ Last known good 13b non-quantized model. In future versions they add the SYS and INST tags themselves
"Llama-2-13b-chat-4bit": {
contextWindow: 4096,
replicateApi:
"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
costPerSecond: 0.000725,
},
"Llama-2-7b-chat-old": {
contextWindow: 4096,
@@ -399,11 +459,13 @@ export const ALL_AVAILABLE_LLAMADEUCE_MODELS = {
// tags themselves
// https://github.com/replicate/cog-llama-template/commit/fa5ce83912cf82fc2b9c01a4e9dc9bff6f2ef137
// Problem is that they fix the max_new_tokens issue in the same commit. :-(
costPerSecond: 0.000725,
},
"Llama-2-7b-chat-4bit": {
contextWindow: 4096,
replicateApi:
"meta/llama-2-7b-chat:13c3cdee13ee059ab779f0291d29054dab00a47dad8261375654de5540165fb0",
costPerSecond: 0.000725,
},
};
@@ -430,7 +492,7 @@ export class LlamaDeuce implements LLM {
maxTokens?: number;
replicateSession: ReplicateSession;
hasStreaming: boolean;
usage: Usage;
constructor(init?: Partial<LlamaDeuce>) {
this.model = init?.model ?? "Llama-2-70b-chat-4bit";
this.chatStrategy =
@@ -445,6 +507,7 @@ export class LlamaDeuce implements LLM {
ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].contextWindow; // For Replicate, the default is 500 tokens which is too low.
this.replicateSession = init?.replicateSession ?? new ReplicateSession();
this.hasStreaming = init?.hasStreaming ?? false;
this.usage = new Usage();
}
tokens(messages: ChatMessage[]): number {
@@ -616,16 +679,23 @@ If a question does not make any sense, or is not factually coherent, explain why
//TODO: Add streaming for this
//Non-streaming
const response = await this.replicateSession.replicate.run(
const response = (await this.replicateSession.replicate.run(
api,
replicateOptions,
);
)) as any;
this.usage.computeSeconds += response.metrics?.predict_time;
this.usage.cost +=
response.metrics?.predict_time *
ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].costPerSecond;
return {
message: {
content: (response as Array<string>).join("").trimStart(),
//^ We need to do this because Replicate returns a list of strings (for streaming functionality which is not exposed by the run function)
role: "assistant",
},
metrics: response.metrics,
} as R;
}
@@ -639,8 +709,12 @@ If a question does not make any sense, or is not factually coherent, explain why
/**
 * Context-window sizes and token pricing for the supported Anthropic models.
 *
 * Context windows differ per model (200k for `claude-2`, 100k for
 * `claude-instant-1`); see
 * https://docs.anthropic.com/claude/reference/selecting-a-model
 *
 * NOTE(review): costs here are expressed per 1 million tokens, whereas the
 * OpenAI tables in this file are consumed as per-1,000-token prices. Any cost
 * computation for Anthropic must divide accordingly.
 *
 * Each model is listed exactly once; duplicate keys in an object literal are
 * a compile error and would otherwise be silently overridden at runtime.
 */
export const ALL_AVAILABLE_ANTHROPIC_MODELS = {
  "claude-2": {
    contextWindow: 200000,
    promptCost: 8.0, // presumably also for 1 Million tokens — confirm
    completionCost: 24.0, // presumably also for 1 Million tokens — confirm
  },
  "claude-instant-1": {
    contextWindow: 100000,
    promptCost: 0.8, // for 1 Million tokens
    completionCost: 2.4, // for 1 Million tokens
  },
};
/**
@@ -664,6 +738,7 @@ export class Anthropic implements LLM {
callbackManager?: CallbackManager;
usage: Usage;
constructor(init?: Partial<Anthropic>) {
this.model = init?.model ?? "claude-2";
this.temperature = init?.temperature ?? 0.1;
@@ -681,6 +756,7 @@ export class Anthropic implements LLM {
timeout: this.timeout,
});
this.usage = new Usage();
this.callbackManager = init?.callbackManager;
}
@@ -809,6 +885,7 @@ export class Portkey implements LLM {
session: PortkeySession;
callbackManager?: CallbackManager;
usage: Usage;
constructor(init?: Partial<Portkey>) {
this.apiKey = init?.apiKey;
this.baseURL = init?.baseURL;
@@ -821,6 +898,8 @@ export class Portkey implements LLM {
mode: this.mode,
});
this.callbackManager = init?.callbackManager;
this.usage = new Usage();
}
tokens(messages: ChatMessage[]): number {