Release 0.5.12 (#1091)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
chore: fix changeset
2026-07-04 03:40:26 -04:00 · 2024-07-30 15:46:39 -07:00 · 2024-07-30 15:32:24 -07:00 · 2024-07-30 15:07:46 -07:00 · 2024-07-30 09:36:58 -07:00 · 2024-07-30 08:19:32 -07:00
35 changed files with 522 additions and 161 deletions
@@ -164,7 +164,7 @@ Check out our NextJS playground at https://llama-playground.vercel.app/. The sou

 - [Node](/packages/llamaindex/src/Node.ts): The basic data building block. Most commonly, these are parts of the document split into manageable pieces that are small enough to be fed into an embedding model and LLM.

- [Embedding](/packages/llamaindex/src/embeddings/OpenAIEmbedding.ts): Embeddings are sets of floating point numbers which represent the data in a Node. By comparing the similarity of embeddings, we can derive an understanding of the similarity of two pieces of data. One use case is to compare the embedding of a question with the embeddings of our Nodes to see which Nodes may contain the data needed to answer that quesiton. Because the default service context is OpenAI, the default embedding is `OpenAIEmbedding`. If using different models, say through Ollama, use this [Embedding](/packages/llamaindex/src/embeddings/OllamaEmbedding.ts) (see all [here](/packages/llamaindex/src/embeddings)).
+- [Embedding](/packages/llamaindex/src/embeddings/OpenAIEmbedding.ts): Embeddings are sets of floating point numbers which represent the data in a Node. By comparing the similarity of embeddings, we can derive an understanding of the similarity of two pieces of data. One use case is to compare the embedding of a question with the embeddings of our Nodes to see which Nodes may contain the data needed to answer that question. Because the default service context is OpenAI, the default embedding is `OpenAIEmbedding`. If using different models, say through Ollama, use this [Embedding](/packages/llamaindex/src/embeddings/OllamaEmbedding.ts) (see all [here](/packages/llamaindex/src/embeddings)).

 - [Indices](/packages/llamaindex/src/indices/): Indices store the Nodes and the embeddings of those nodes. QueryEngines retrieve Nodes from these Indices using embedding similarity.

@@ -1,5 +1,14 @@
 # docs

+## 0.0.53
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+
 ## 0.0.52

 ### Patch Changes
@@ -48,6 +48,7 @@ They can be divided into two groups.
 - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
 - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.
 - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`.
+- `splitByPage` Whether to split the results, creating one document per page. Uses the set `pageSeparator` or `\n---\n` as fallback. Default is `true`.
 - `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`.
 - `vendorMultimodalModel?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost.
 - `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`.
@@ -1,6 +1,6 @@
 {
  "name": "docs",
-  "version": "0.0.52",
+  "version": "0.0.53",
  "private": true,
  "scripts": {
    "docusaurus": "docusaurus",
@@ -1,5 +1,15 @@
 # @llamaindex/autotool-02-next-example

+## 0.1.37
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+  - @llamaindex/autotool@2.0.0
+
 ## 0.1.36

 ### Patch Changes
@@ -1,7 +1,7 @@
 {
  "name": "@llamaindex/autotool-02-next-example",
  "private": true,
-  "version": "0.1.36",
+  "version": "0.1.37",
  "scripts": {
    "dev": "next dev",
    "build": "next build",
@@ -51,7 +51,7 @@
    "unplugin": "^1.10.1"
  },
  "peerDependencies": {
-    "llamaindex": "^0.5.11",
+    "llamaindex": "^0.5.12",
    "openai": "^4",
    "typescript": "^4"
  },
@@ -70,7 +70,7 @@
    "@swc/types": "^0.1.8",
    "@types/json-schema": "^7.0.15",
    "@types/node": "^20.12.11",
-    "bunchee": "5.3.0-beta.0",
+    "bunchee": "5.3.1",
    "llamaindex": "workspace:*",
    "next": "14.2.5",
    "rollup": "^4.18.0",
@@ -4,7 +4,7 @@
  "type": "module",
  "license": "MIT",
  "scripts": {
-    "generate": "pnpm dlx @hey-api/openapi-ts",
+    "generate": "pnpm dlx @hey-api/openapi-ts@0.49.0",
    "build": "pnpm run generate && bunchee"
  },
  "files": [
@@ -35,6 +35,6 @@
  },
  "devDependencies": {
    "@hey-api/openapi-ts": "^0.48.0",
-    "bunchee": "5.3.0-beta.0"
+    "bunchee": "5.3.1"
  }
 }
@@ -1,5 +1,12 @@
 # @llamaindex/community

+## 0.0.27
+
+### Patch Changes
+
+- Updated dependencies [0452af9]
+  - @llamaindex/core@0.1.6
+
 ## 0.0.26

 ### Patch Changes
@@ -1,7 +1,7 @@
 {
  "name": "@llamaindex/community",
  "description": "Community package for LlamaIndexTS",
-  "version": "0.0.26",
+  "version": "0.0.27",
  "type": "module",
  "types": "dist/type/index.d.ts",
  "main": "dist/cjs/index.js",
@@ -43,7 +43,7 @@
  },
  "devDependencies": {
    "@types/node": "^20.14.2",
-    "bunchee": "5.3.0-beta.0"
+    "bunchee": "5.3.1"
  },
  "dependencies": {
    "@aws-sdk/client-bedrock-runtime": "^3.613.0",
@@ -1,5 +1,11 @@
 # @llamaindex/core

+## 0.1.6
+
+### Patch Changes
+
+- 0452af9: fix: handling errors in splitBySentenceTokenizer
+
 ## 0.1.5

 ### Patch Changes
@@ -1,7 +1,7 @@
 {
  "name": "@llamaindex/core",
  "type": "module",
-  "version": "0.1.5",
+  "version": "0.1.6",
  "description": "LlamaIndex Core Module",
  "exports": {
    "./node-parser": {
@@ -131,7 +131,7 @@
  },
  "devDependencies": {
    "ajv": "^8.16.0",
-    "bunchee": "5.3.0-beta.0",
+    "bunchee": "5.3.1",
    "natural": "^7.1.0"
  },
  "dependencies": {
@@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => {
  }
  const tokenizer = sentenceTokenizer;
  return (text: string) => {
-    return tokenizer.tokenize(text);
+    try {
+      return tokenizer.tokenize(text);
+    } catch {
+      return [text];
+    }
  };
 };

@@ -1,7 +1,10 @@
-import { SentenceSplitter } from "@llamaindex/core/node-parser";
+import {
+  SentenceSplitter,
+  splitBySentenceTokenizer,
+} from "@llamaindex/core/node-parser";
 import { describe, expect, test } from "vitest";

-describe("SentenceSplitter", () => {
+describe("sentence splitter", () => {
  test("initializes", () => {
    const sentenceSplitter = new SentenceSplitter();
    expect(sentenceSplitter).toBeDefined();
@@ -105,4 +108,11 @@ describe("SentenceSplitter", () => {
      "因为他照了人类，连我都在内。",
    ]);
  });
+
+  test("issue 1087 - edge case when input with brackets", () => {
+    const text =
+      "A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS).";
+    const split = splitBySentenceTokenizer();
+    expect(split(text)).toEqual([text]);
+  });
 });
@@ -1,5 +1,14 @@
 # @llamaindex/experimental

+## 0.0.62
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+
 ## 0.0.61

 ### Patch Changes
@@ -1,7 +1,7 @@
 {
  "name": "@llamaindex/experimental",
  "description": "Experimental package for LlamaIndexTS",
-  "version": "0.0.61",
+  "version": "0.0.62",
  "type": "module",
  "types": "dist/type/index.d.ts",
  "main": "dist/cjs/index.js",
@@ -1,5 +1,15 @@
 # llamaindex

+## 0.5.12
+
+### Patch Changes
+
+- 345300f: feat: add splitByPage mode to LlamaParseReader
+- da5cfc4: Add metadatafilter options to retriever constructors
+- da5cfc4: Fix system prompt not used in ContextChatEngine
+- Updated dependencies [0452af9]
+  - @llamaindex/core@0.1.6
+
 ## 0.5.11

 ### Patch Changes
@@ -1,5 +1,14 @@
 # @llamaindex/cloudflare-worker-agent-test

+## 0.0.46
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+
 ## 0.0.45

 ### Patch Changes
@@ -1,6 +1,6 @@
 {
  "name": "@llamaindex/cloudflare-worker-agent-test",
-  "version": "0.0.45",
+  "version": "0.0.46",
  "type": "module",
  "private": true,
  "scripts": {
@@ -1,5 +1,14 @@
 # @llamaindex/next-agent-test

+## 0.1.46
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+
 ## 0.1.45

 ### Patch Changes
@@ -1,6 +1,6 @@
 {
  "name": "@llamaindex/next-agent-test",
-  "version": "0.1.45",
+  "version": "0.1.46",
  "private": true,
  "scripts": {
    "dev": "next dev",
@@ -1,5 +1,14 @@
 # test-edge-runtime

+## 0.1.45
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+
 ## 0.1.44

 ### Patch Changes
@@ -1,6 +1,6 @@
 {
  "name": "@llamaindex/nextjs-edge-runtime-test",
-  "version": "0.1.44",
+  "version": "0.1.45",
  "private": true,
  "scripts": {
    "dev": "next dev",
@@ -1,5 +1,14 @@
 # @llamaindex/next-node-runtime

+## 0.0.27
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+
 ## 0.0.26

 ### Patch Changes
@@ -1,6 +1,6 @@
 {
  "name": "@llamaindex/next-node-runtime-test",
-  "version": "0.0.26",
+  "version": "0.0.27",
  "private": true,
  "scripts": {
    "dev": "next dev",
@@ -1,5 +1,14 @@
 # @llamaindex/waku-query-engine-test

+## 0.0.46
+
+### Patch Changes
+
+- Updated dependencies [345300f]
+- Updated dependencies [da5cfc4]
+- Updated dependencies [da5cfc4]
+  - llamaindex@0.5.12
+
 ## 0.0.45

 ### Patch Changes
@@ -1,6 +1,6 @@
 {
  "name": "@llamaindex/waku-query-engine-test",
-  "version": "0.0.45",
+  "version": "0.0.46",
  "type": "module",
  "private": true,
  "scripts": {
@@ -1,6 +1,6 @@
 {
  "name": "llamaindex",
-  "version": "0.5.11",
+  "version": "0.5.12",
  "license": "MIT",
  "type": "module",
  "keywords": [
@@ -15,8 +15,8 @@ import { initService } from "./utils.js";

 export type CloudRetrieveParams = Omit<
  RetrievalParams,
-  "query" | "searchFilters" | "className" | "denseSimilarityTopK"
-> & { similarityTopK?: number };
+  "query" | "search_filters" | "dense_similarity_top_k"
+> & { similarityTopK?: number; filters?: MetadataFilters };

 export class LlamaCloudRetriever implements BaseRetriever {
  clientParams: ClientParams;
@@ -84,7 +84,9 @@ export class LlamaCloudRetriever implements BaseRetriever {
        requestBody: {
          ...this.retrieveParams,
          query: extractText(query),
-          search_filters: preFilters as MetadataFilters,
+          search_filters:
+            this.retrieveParams.filters ?? (preFilters as MetadataFilters),
+          dense_similarity_top_k: this.retrieveParams.similarityTopK,
        },
      });

@@ -126,7 +126,7 @@ export class ContextChatEngine extends PromptMixin implements ChatEngine {
    if (!this.systemPrompt) return message;
    return {
      ...message,
-      content: this.systemPrompt.trim() + "\n" + message.content,
+      content: this.systemPrompt.trim() + "\n" + extractText(message.content),
    };
  }
 }
@@ -386,6 +386,7 @@ export type VectorIndexRetrieverOptions = {
  index: VectorStoreIndex;
  similarityTopK?: number;
  topK?: TopKMap;
+  filters?: MetadataFilters;
 };

 export class VectorIndexRetriever implements BaseRetriever {
@@ -393,14 +394,21 @@ export class VectorIndexRetriever implements BaseRetriever {
  topK: TopKMap;

  serviceContext?: ServiceContext;
+  filters?: MetadataFilters;

-  constructor({ index, similarityTopK, topK }: VectorIndexRetrieverOptions) {
+  constructor({
+    index,
+    similarityTopK,
+    topK,
+    filters,
+  }: VectorIndexRetrieverOptions) {
    this.index = index;
    this.serviceContext = this.index.serviceContext;
    this.topK = topK ?? {
      [ModalityType.TEXT]: similarityTopK ?? DEFAULT_SIMILARITY_TOP_K,
      [ModalityType.IMAGE]: DEFAULT_SIMILARITY_TOP_K,
    };
+    this.filters = filters;
  }

  /**
@@ -443,7 +451,7 @@ export class VectorIndexRetriever implements BaseRetriever {
    query: MessageContent,
    type: ModalityType,
    vectorStore: VectorStore,
-    preFilters?: MetadataFilters,
+    filters?: MetadataFilters,
  ): Promise<NodeWithScore[]> {
    // convert string message to multi-modal format
    if (typeof query === "string") {
@@ -460,7 +468,7 @@ export class VectorIndexRetriever implements BaseRetriever {
          queryEmbedding,
          mode: VectorStoreQueryMode.DEFAULT,
          similarityTopK: this.topK[type],
-          filters: preFilters ?? undefined,
+          filters: this.filters ?? filters ?? undefined,
        });
        nodes = nodes.concat(this.buildNodeListFromQueryResult(result));
      }
@@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader {
  targetPages?: string;
  // Whether or not to ignore and skip errors raised during parsing.
  ignoreErrors: boolean = true;
+  // Whether to split by page using the pageSeparator or '\n---\n' as default.
+  splitByPage: boolean = true;
  // Whether to use the vendor multimodal API.
  useVendorMultimodalModel: boolean = false;
  // The model name for the vendor multimodal API
@@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader {
      }

      // Return results as Document objects
-      const resultJson = await this.getJobResult(jobId, this.resultType);
+      const jobResults = await this.getJobResult(jobId, this.resultType);
+      const resultText = jobResults[this.resultType];
+
+      // Split the text by separator if splitByPage is true
+      if (this.splitByPage) {
+        return this.splitTextBySeparator(resultText);
+      }
+
      return [
        new Document({
-          text: resultJson[this.resultType],
+          text: resultText,
        }),
      ];
    } catch (e) {
@@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader {
    return filteredParams;
  }

+  private splitTextBySeparator(text: string): Document[] {
+    const separator = this.pageSeparator ?? "\n---\n";
+    const textChunks = text.split(separator);
+    return textChunks.map(
+      (docChunk: string) =>
+        new Document({
+          text: docChunk,
+        }),
+    );
+  }
+
  static async getMimeType(
    data: Uint8Array,
  ): Promise<{ mime: string; extension: string }> {
@@ -6,6 +6,10 @@ import { FileReader } from "./type.js";
 */
 export class PDFReader extends FileReader {
  async loadDataAsContent(content: Uint8Array): Promise<Document[]> {
+    // XXX: create a new Uint8Array to prevent "Please provide binary data as `Uint8Array`, rather than `Buffer`." error if a Buffer passed
+    if (content instanceof Buffer) {
+      content = new Uint8Array(content);
+    }
    const { totalPages, text } = await readPDF(content);
    return text.map((text, page) => {
      const metadata = {
@@ -18,8 +18,7 @@ export abstract class FileReader implements BaseReader {
  ): Promise<Document[]>;

  async loadData(filePath: string): Promise<Document[]> {
-    // XXX: create a new Uint8Array to prevent "Please provide binary data as `Uint8Array`, rather than `Buffer`." error in PDFReader
-    const fileContent = new Uint8Array(await fs.readFile(filePath));
+    const fileContent = await fs.readFile(filePath);
    const fileName = path.basename(filePath);
    const docs = await this.loadDataAsContent(fileContent, fileName);
    docs.forEach(FileReader.addMetaData(filePath));
Author	SHA1	Message	Date
github-actions[bot]	9c9e9b4e03	Release 0.5.12 (#1091 ) Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2024-07-30 15:46:39 -07:00
Alex Yang	e3c307ab55	chore: fix changeset	2024-07-30 15:32:24 -07:00
André Lago	b1b2baa969	docs: fix minor typo (#1092 )	2024-07-30 15:07:46 -07:00
Marcus Schiesser	0452af91cc	fix: handling errors in splitBySentenceTokenizer (#1087 ) Co-authored-by: Alex Yang <himself65@outlook.com>	2024-07-30 09:36:58 -07:00
Marcus Schiesser	da5cfc42e5	fix: integrate with `create-llama` (#1088 ) Co-authored-by: Alex Yang <himself65@outlook.com>	2024-07-30 08:19:32 -07:00
Alex Yang	eb89223386	chore: bump `bunchee@5.3.1` (#1090 )	2024-07-30 08:19:01 -07:00
Alex Yang	93dc3a31b3	fix: lock hey-api version (#1089 )	2024-07-30 08:00:05 -07:00
Fabian Wimmer	345300f110	feat: add split by page mode to LlamaParseReader (#924 )	2024-07-29 16:16:46 +07:00