Change any to Usage for class ChatMessage

Add some doc on usage
Extend support to Replicate / add a [usage/metrics] field to the chatCompletion object for tracking the current query's consumption
2026-07-01 22:14:03 -04:00 · 2023-12-21 19:17:19 +01:00 · 2023-12-09 18:17:26 +01:00 · 2023-12-05 17:38:40 +01:00 · 2023-12-05 16:20:57 +01:00
2 changed files with 131 additions and 12 deletions
@@ -20,3 +20,43 @@ const serviceContext = serviceContextFromDefaults({ llm: openaiLLM });

 - [OpenAI](../../api/classes/OpenAI.md)
 - [ServiceContext](../../api/interfaces/ServiceContext.md)
+
+## Usage
+
+Each LLM instance keeps a running total of its API consumption across every call made through it. Read the totals from the `llm.usage` property.
+
+_Note: usage is not tracked for streaming calls._
+
+```javascript
+import { OpenAI } from "llamaindex";
+
+// Create a new instance of OpenAI with the specified model and temperature
+const llm = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
+
+async function ask() {
+  // Send a chat request to the OpenAI API
+  const response = await llm.chat([
+    {
+      role: "system",
+      content: "You are a helpful llama.",
+    },
+    {
+      role: "user",
+      content: "Where do llama live?",
+    },
+  ]);
+
+  // Log the response from the API
+  console.log(response);
+  /**
+  The response includes a message from the assistant and usage information.
+  The usage information includes the number of prompt tokens, completion tokens, and total tokens used.
+  */
+
+  // Log the usage information from the LLM object
+  console.log(llm.usage);
+  /**
+  The usage object holds running totals accumulated over every call made with this LLM instance:
+  prompt tokens, completion tokens, cost, and compute seconds.
+  */
+}
+```
@@ -45,6 +45,8 @@ export interface ChatResponse {
  message: ChatMessage;
  raw?: Record<string, any>;
  delta?: string;
+  metrics?: any;
+  usage?: Usage;
 }

 // NOTE in case we need CompletionResponse to diverge from ChatResponse in the future
@@ -98,19 +100,44 @@ export interface LLM {
   * Calculates the number of tokens needed for the given chat messages
   */
  tokens(messages: ChatMessage[]): number;
+
+  /**
+   * Returns the usage information of the LLM
+   */
+  usage: Usage;
 }

 export const GPT4_MODELS = {
-  "gpt-4": { contextWindow: 8192 },
-  "gpt-4-32k": { contextWindow: 32768 },
-  "gpt-4-1106-preview": { contextWindow: 128000 },
-  "gpt-4-vision-preview": { contextWindow: 8192 },
+  "gpt-4": { contextWindow: 8192, promptCost: 0.03, completionCost: 0.06 },
+  "gpt-4-32k": { contextWindow: 32768, promptCost: 0.06, completionCost: 0.12 },
+  "gpt-4-1106-preview": {
+    contextWindow: 128000,
+    promptCost: 0.01,
+    completionCost: 0.03,
+  },
+  "gpt-4-vision-preview": {
+    contextWindow: 8192,
+    promptCost: 0.01,
+    completionCost: 0.03,
+  },
 };

 export const GPT35_MODELS = {
-  "gpt-3.5-turbo": { contextWindow: 4096 },
-  "gpt-3.5-turbo-16k": { contextWindow: 16384 },
-  "gpt-3.5-turbo-1106": { contextWindow: 16384 },
+  "gpt-3.5-turbo": {
+    contextWindow: 4096,
+    promptCost: 0.001,
+    completionCost: 0.002,
+  },
+  "gpt-3.5-turbo-16k": {
+    contextWindow: 16384,
+    promptCost: 0.001,
+    completionCost: 0.002,
+  },
+  "gpt-3.5-turbo-1106": {
+    contextWindow: 16384,
+    promptCost: 0.001,
+    completionCost: 0.002,
+  },
 };

 /**
@@ -121,6 +148,19 @@ export const ALL_AVAILABLE_OPENAI_MODELS = {
  ...GPT35_MODELS,
 };

+export class Usage { // Running consumption totals held by an LLM instance; every counter starts at 0.
+  promptTokens: number; // prompt tokens; the OpenAI chat path adds response.usage.prompt_tokens per call
+  completionTokens: number; // completion tokens; the OpenAI chat path adds response.usage.completion_tokens per call
+  computeSeconds: number; // compute time; the LlamaDeuce chat path adds response.metrics.predict_time per call
+  cost: number; // accumulated cost: OpenAI adds tokens * per-1000-token price, LlamaDeuce adds predict_time * costPerSecond
+  constructor() {
+    this.promptTokens = 0;
+    this.completionTokens = 0;
+    this.cost = 0;
+    this.computeSeconds = 0;
+  }
+}
+
 /**
 * OpenAI LLM implementation
 */
@@ -149,6 +189,7 @@ export class OpenAI implements LLM {

  callbackManager?: CallbackManager;

+  usage: Usage;
  constructor(
    init?: Partial<OpenAI> & {
      azure?: AzureOpenAIConfig;
@@ -164,6 +205,8 @@ export class OpenAI implements LLM {
    this.additionalChatOptions = init?.additionalChatOptions;
    this.additionalSessionOptions = init?.additionalSessionOptions;

+    this.usage = new Usage();
+
    if (init?.azure || shouldUseAzure()) {
      const azureConfig = getAzureConfigFromEnv({
        ...init?.azure,
@@ -278,8 +321,21 @@ export class OpenAI implements LLM {
    });

    const content = response.choices[0].message?.content ?? "";
+
+    // Update usage
+    this.usage.promptTokens += response.usage?.prompt_tokens || 0;
+    this.usage.completionTokens += response.usage?.completion_tokens || 0;
+    this.usage.cost +=
+      ((response.usage?.prompt_tokens || 0) *
+        ALL_AVAILABLE_OPENAI_MODELS[this.model].promptCost) /
+        1000 +
+      ((response.usage?.completion_tokens || 0) *
+        ALL_AVAILABLE_OPENAI_MODELS[this.model].completionCost) /
+        1000;
+
    return {
      message: { content, role: response.choices[0].message.role },
+      usage: response.usage,
    } as R;
  }

@@ -373,23 +429,27 @@ export const ALL_AVAILABLE_LLAMADEUCE_MODELS = {
    replicateApi:
      "replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48",
    //^ Previous 70b model. This is also actually 4 bit, although not exllama.
+    costPerSecond: 0.0014,
  },
  "Llama-2-70b-chat-4bit": {
    contextWindow: 4096,
    replicateApi:
      "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
    //^ Model is based off of exllama 4bit.
+    costPerSecond: 0.0014,
  },
  "Llama-2-13b-chat-old": {
    contextWindow: 4096,
    replicateApi:
      "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
+    costPerSecond: 0.000725,
  },
  //^ Last known good 13b non-quantized model. In future versions they add the SYS and INST tags themselves
  "Llama-2-13b-chat-4bit": {
    contextWindow: 4096,
    replicateApi:
      "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
+    costPerSecond: 0.000725,
  },
  "Llama-2-7b-chat-old": {
    contextWindow: 4096,
@@ -399,11 +459,13 @@ export const ALL_AVAILABLE_LLAMADEUCE_MODELS = {
    // tags themselves
    // https://github.com/replicate/cog-llama-template/commit/fa5ce83912cf82fc2b9c01a4e9dc9bff6f2ef137
    // Problem is that they fix the max_new_tokens issue in the same commit. :-(
+    costPerSecond: 0.000725,
  },
  "Llama-2-7b-chat-4bit": {
    contextWindow: 4096,
    replicateApi:
      "meta/llama-2-7b-chat:13c3cdee13ee059ab779f0291d29054dab00a47dad8261375654de5540165fb0",
+    costPerSecond: 0.000725,
  },
 };

@@ -430,7 +492,7 @@ export class LlamaDeuce implements LLM {
  maxTokens?: number;
  replicateSession: ReplicateSession;
  hasStreaming: boolean;
-
+  usage: Usage;
  constructor(init?: Partial<LlamaDeuce>) {
    this.model = init?.model ?? "Llama-2-70b-chat-4bit";
    this.chatStrategy =
@@ -445,6 +507,7 @@ export class LlamaDeuce implements LLM {
      ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].contextWindow; // For Replicate, the default is 500 tokens which is too low.
    this.replicateSession = init?.replicateSession ?? new ReplicateSession();
    this.hasStreaming = init?.hasStreaming ?? false;
+    this.usage = new Usage();
  }

  tokens(messages: ChatMessage[]): number {
@@ -616,16 +679,23 @@ If a question does not make any sense, or is not factually coherent, explain why
    //TODO: Add streaming for this

    //Non-streaming
-    const response = await this.replicateSession.replicate.run(
+    const response = (await this.replicateSession.replicate.run(
      api,
      replicateOptions,
-    );
+    )) as any;
+
+    this.usage.computeSeconds += response.metrics?.predict_time;
+    this.usage.cost +=
+      response.metrics?.predict_time *
+      ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].costPerSecond;
+
    return {
      message: {
        content: (response as Array<string>).join("").trimStart(),
        //^ We need to do this because Replicate returns a list of strings (for streaming functionality which is not exposed by the run function)
        role: "assistant",
      },
+      metrics: response.metrics,
    } as R;
  }

@@ -639,8 +709,12 @@ If a question does not make any sense, or is not factually coherent, explain why

 export const ALL_AVAILABLE_ANTHROPIC_MODELS = {
  // claude-2 has a 200k context window and claude-instant-1 has 100k, see https://docs.anthropic.com/claude/reference/selecting-a-model
-  "claude-2": { contextWindow: 200000 },
-  "claude-instant-1": { contextWindow: 100000 },
+  "claude-2": { contextWindow: 200000, promptCost: 8.0, completionCost: 24.0 },
+  "claude-instant-1": {
+    contextWindow: 100000,
+    promptCost: 0.8, // for 1 Million tokens
+    completionCost: 2.4, // for 1 Million tokens
+  },
 };

 /**
@@ -664,6 +738,7 @@ export class Anthropic implements LLM {

  callbackManager?: CallbackManager;

+  usage: Usage;
  constructor(init?: Partial<Anthropic>) {
    this.model = init?.model ?? "claude-2";
    this.temperature = init?.temperature ?? 0.1;
@@ -681,6 +756,7 @@ export class Anthropic implements LLM {
        timeout: this.timeout,
      });

+    this.usage = new Usage();
    this.callbackManager = init?.callbackManager;
  }

@@ -809,6 +885,7 @@ export class Portkey implements LLM {
  session: PortkeySession;
  callbackManager?: CallbackManager;

+  usage: Usage;
  constructor(init?: Partial<Portkey>) {
    this.apiKey = init?.apiKey;
    this.baseURL = init?.baseURL;
@@ -821,6 +898,8 @@ export class Portkey implements LLM {
      mode: this.mode,
    });
    this.callbackManager = init?.callbackManager;
+
+    this.usage = new Usage();
  }

  tokens(messages: ChatMessage[]): number {
Author	SHA1	Message	Date
Pierre	b41f36c8eb	Change any to Usage for class ChatMessage	2023-12-21 19:17:19 +01:00
Pierre	54a910fc86	Add some doc on usage	2023-12-09 18:17:26 +01:00
Pierre	ca76e0c221	Expend support for replicate / [usage/metrics] field in chatCompletion object for tracking of current query consumption	2023-12-05 17:38:40 +01:00
Pierre	90d259c7b0	Initial: Add a way to track lifetime usage and cost with OpenAI	2023-12-05 16:20:57 +01:00