Compare commits

..

8 Commits

Author SHA1 Message Date
github-actions[bot] 9c9e9b4e03 Release 0.5.12 (#1091)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2024-07-30 15:46:39 -07:00
Alex Yang e3c307ab55 chore: fix changeset 2024-07-30 15:32:24 -07:00
André Lago b1b2baa969 docs: fix minor typo (#1092) 2024-07-30 15:07:46 -07:00
Marcus Schiesser 0452af91cc fix: handling errors in splitBySentenceTokenizer (#1087)
Co-authored-by: Alex Yang <himself65@outlook.com>
2024-07-30 09:36:58 -07:00
Marcus Schiesser da5cfc42e5 fix: integrate with create-llama (#1088)
Co-authored-by: Alex Yang <himself65@outlook.com>
2024-07-30 08:19:32 -07:00
Alex Yang eb89223386 chore: bump bunchee@5.3.1 (#1090) 2024-07-30 08:19:01 -07:00
Alex Yang 93dc3a31b3 fix: lock hey-api version (#1089) 2024-07-30 08:00:05 -07:00
Fabian Wimmer 345300f110 feat: add split by page mode to LlamaParseReader (#924) 2024-07-29 16:16:46 +07:00
35 changed files with 522 additions and 161 deletions
+1 -1
View File
@@ -164,7 +164,7 @@ Check out our NextJS playground at https://llama-playground.vercel.app/. The sou
- [Node](/packages/llamaindex/src/Node.ts): The basic data building block. Most commonly, these are parts of the document split into manageable pieces that are small enough to be fed into an embedding model and LLM.
- [Embedding](/packages/llamaindex/src/embeddings/OpenAIEmbedding.ts): Embeddings are sets of floating point numbers which represent the data in a Node. By comparing the similarity of embeddings, we can derive an understanding of the similarity of two pieces of data. One use case is to compare the embedding of a question with the embeddings of our Nodes to see which Nodes may contain the data needed to answer that quesiton. Because the default service context is OpenAI, the default embedding is `OpenAIEmbedding`. If using different models, say through Ollama, use this [Embedding](/packages/llamaindex/src/embeddings/OllamaEmbedding.ts) (see all [here](/packages/llamaindex/src/embeddings)).
- [Embedding](/packages/llamaindex/src/embeddings/OpenAIEmbedding.ts): Embeddings are sets of floating point numbers which represent the data in a Node. By comparing the similarity of embeddings, we can derive an understanding of the similarity of two pieces of data. One use case is to compare the embedding of a question with the embeddings of our Nodes to see which Nodes may contain the data needed to answer that question. Because the default service context is OpenAI, the default embedding is `OpenAIEmbedding`. If using different models, say through Ollama, use this [Embedding](/packages/llamaindex/src/embeddings/OllamaEmbedding.ts) (see all [here](/packages/llamaindex/src/embeddings)).
- [Indices](/packages/llamaindex/src/indices/): Indices store the Nodes and the embeddings of those nodes. QueryEngines retrieve Nodes from these Indices using embedding similarity.
+9
View File
@@ -1,5 +1,14 @@
# docs
## 0.0.53
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
## 0.0.52
### Patch Changes
@@ -48,6 +48,7 @@ They can be divided into two groups.
- `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
- `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.
- `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`.
- `splitByPage` Whether to split the results, creating one document per page. Uses the set `pageSeparator` or `\n---\n` as fallback. Default is `true`.
- `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`.
- `vendorMultimodalModel?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost.
- `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`.
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "docs",
"version": "0.0.52",
"version": "0.0.53",
"private": true,
"scripts": {
"docusaurus": "docusaurus",
@@ -1,5 +1,15 @@
# @llamaindex/autotool-02-next-example
## 0.1.37
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
- @llamaindex/autotool@2.0.0
## 0.1.36
### Patch Changes
@@ -1,7 +1,7 @@
{
"name": "@llamaindex/autotool-02-next-example",
"private": true,
"version": "0.1.36",
"version": "0.1.37",
"scripts": {
"dev": "next dev",
"build": "next build",
+2 -2
View File
@@ -51,7 +51,7 @@
"unplugin": "^1.10.1"
},
"peerDependencies": {
"llamaindex": "^0.5.11",
"llamaindex": "^0.5.12",
"openai": "^4",
"typescript": "^4"
},
@@ -70,7 +70,7 @@
"@swc/types": "^0.1.8",
"@types/json-schema": "^7.0.15",
"@types/node": "^20.12.11",
"bunchee": "5.3.0-beta.0",
"bunchee": "5.3.1",
"llamaindex": "workspace:*",
"next": "14.2.5",
"rollup": "^4.18.0",
+2 -2
View File
@@ -4,7 +4,7 @@
"type": "module",
"license": "MIT",
"scripts": {
"generate": "pnpm dlx @hey-api/openapi-ts",
"generate": "pnpm dlx @hey-api/openapi-ts@0.49.0",
"build": "pnpm run generate && bunchee"
},
"files": [
@@ -35,6 +35,6 @@
},
"devDependencies": {
"@hey-api/openapi-ts": "^0.48.0",
"bunchee": "5.3.0-beta.0"
"bunchee": "5.3.1"
}
}
+7
View File
@@ -1,5 +1,12 @@
# @llamaindex/community
## 0.0.27
### Patch Changes
- Updated dependencies [0452af9]
- @llamaindex/core@0.1.6
## 0.0.26
### Patch Changes
+2 -2
View File
@@ -1,7 +1,7 @@
{
"name": "@llamaindex/community",
"description": "Community package for LlamaIndexTS",
"version": "0.0.26",
"version": "0.0.27",
"type": "module",
"types": "dist/type/index.d.ts",
"main": "dist/cjs/index.js",
@@ -43,7 +43,7 @@
},
"devDependencies": {
"@types/node": "^20.14.2",
"bunchee": "5.3.0-beta.0"
"bunchee": "5.3.1"
},
"dependencies": {
"@aws-sdk/client-bedrock-runtime": "^3.613.0",
+6
View File
@@ -1,5 +1,11 @@
# @llamaindex/core
## 0.1.6
### Patch Changes
- 0452af9: fix: handling errors in splitBySentenceTokenizer
## 0.1.5
### Patch Changes
+2 -2
View File
@@ -1,7 +1,7 @@
{
"name": "@llamaindex/core",
"type": "module",
"version": "0.1.5",
"version": "0.1.6",
"description": "LlamaIndex Core Module",
"exports": {
"./node-parser": {
@@ -131,7 +131,7 @@
},
"devDependencies": {
"ajv": "^8.16.0",
"bunchee": "5.3.0-beta.0",
"bunchee": "5.3.1",
"natural": "^7.1.0"
},
"dependencies": {
+5 -1
View File
@@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => {
}
const tokenizer = sentenceTokenizer;
return (text: string) => {
return tokenizer.tokenize(text);
try {
return tokenizer.tokenize(text);
} catch {
return [text];
}
};
};
@@ -1,7 +1,10 @@
import { SentenceSplitter } from "@llamaindex/core/node-parser";
import {
SentenceSplitter,
splitBySentenceTokenizer,
} from "@llamaindex/core/node-parser";
import { describe, expect, test } from "vitest";
describe("SentenceSplitter", () => {
describe("sentence splitter", () => {
test("initializes", () => {
const sentenceSplitter = new SentenceSplitter();
expect(sentenceSplitter).toBeDefined();
@@ -105,4 +108,11 @@ describe("SentenceSplitter", () => {
"因为他照了人类,连我都在内。",
]);
});
test("issue 1087 - edge case when input with brackets", () => {
const text =
"A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS).";
const split = splitBySentenceTokenizer();
expect(split(text)).toEqual([text]);
});
});
+9
View File
@@ -1,5 +1,14 @@
# @llamaindex/experimental
## 0.0.62
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
## 0.0.61
### Patch Changes
+1 -1
View File
@@ -1,7 +1,7 @@
{
"name": "@llamaindex/experimental",
"description": "Experimental package for LlamaIndexTS",
"version": "0.0.61",
"version": "0.0.62",
"type": "module",
"types": "dist/type/index.d.ts",
"main": "dist/cjs/index.js",
+10
View File
@@ -1,5 +1,15 @@
# llamaindex
## 0.5.12
### Patch Changes
- 345300f: feat: add splitByPage mode to LlamaParseReader
- da5cfc4: Add metadatafilter options to retriever constructors
- da5cfc4: Fix system prompt not used in ContextChatEngine
- Updated dependencies [0452af9]
- @llamaindex/core@0.1.6
## 0.5.11
### Patch Changes
@@ -1,5 +1,14 @@
# @llamaindex/cloudflare-worker-agent-test
## 0.0.46
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
## 0.0.45
### Patch Changes
@@ -1,6 +1,6 @@
{
"name": "@llamaindex/cloudflare-worker-agent-test",
"version": "0.0.45",
"version": "0.0.46",
"type": "module",
"private": true,
"scripts": {
@@ -1,5 +1,14 @@
# @llamaindex/next-agent-test
## 0.1.46
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
## 0.1.45
### Patch Changes
@@ -1,6 +1,6 @@
{
"name": "@llamaindex/next-agent-test",
"version": "0.1.45",
"version": "0.1.46",
"private": true,
"scripts": {
"dev": "next dev",
@@ -1,5 +1,14 @@
# test-edge-runtime
## 0.1.45
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
## 0.1.44
### Patch Changes
@@ -1,6 +1,6 @@
{
"name": "@llamaindex/nextjs-edge-runtime-test",
"version": "0.1.44",
"version": "0.1.45",
"private": true,
"scripts": {
"dev": "next dev",
@@ -1,5 +1,14 @@
# @llamaindex/next-node-runtime
## 0.0.27
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
## 0.0.26
### Patch Changes
@@ -1,6 +1,6 @@
{
"name": "@llamaindex/next-node-runtime-test",
"version": "0.0.26",
"version": "0.0.27",
"private": true,
"scripts": {
"dev": "next dev",
@@ -1,5 +1,14 @@
# @llamaindex/waku-query-engine-test
## 0.0.46
### Patch Changes
- Updated dependencies [345300f]
- Updated dependencies [da5cfc4]
- Updated dependencies [da5cfc4]
- llamaindex@0.5.12
## 0.0.45
### Patch Changes
@@ -1,6 +1,6 @@
{
"name": "@llamaindex/waku-query-engine-test",
"version": "0.0.45",
"version": "0.0.46",
"type": "module",
"private": true,
"scripts": {
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "llamaindex",
"version": "0.5.11",
"version": "0.5.12",
"license": "MIT",
"type": "module",
"keywords": [
@@ -15,8 +15,8 @@ import { initService } from "./utils.js";
export type CloudRetrieveParams = Omit<
RetrievalParams,
"query" | "searchFilters" | "className" | "denseSimilarityTopK"
> & { similarityTopK?: number };
"query" | "search_filters" | "dense_similarity_top_k"
> & { similarityTopK?: number; filters?: MetadataFilters };
export class LlamaCloudRetriever implements BaseRetriever {
clientParams: ClientParams;
@@ -84,7 +84,9 @@ export class LlamaCloudRetriever implements BaseRetriever {
requestBody: {
...this.retrieveParams,
query: extractText(query),
search_filters: preFilters as MetadataFilters,
search_filters:
this.retrieveParams.filters ?? (preFilters as MetadataFilters),
dense_similarity_top_k: this.retrieveParams.similarityTopK,
},
});
@@ -126,7 +126,7 @@ export class ContextChatEngine extends PromptMixin implements ChatEngine {
if (!this.systemPrompt) return message;
return {
...message,
content: this.systemPrompt.trim() + "\n" + message.content,
content: this.systemPrompt.trim() + "\n" + extractText(message.content),
};
}
}
@@ -386,6 +386,7 @@ export type VectorIndexRetrieverOptions = {
index: VectorStoreIndex;
similarityTopK?: number;
topK?: TopKMap;
filters?: MetadataFilters;
};
export class VectorIndexRetriever implements BaseRetriever {
@@ -393,14 +394,21 @@ export class VectorIndexRetriever implements BaseRetriever {
topK: TopKMap;
serviceContext?: ServiceContext;
filters?: MetadataFilters;
constructor({ index, similarityTopK, topK }: VectorIndexRetrieverOptions) {
constructor({
index,
similarityTopK,
topK,
filters,
}: VectorIndexRetrieverOptions) {
this.index = index;
this.serviceContext = this.index.serviceContext;
this.topK = topK ?? {
[ModalityType.TEXT]: similarityTopK ?? DEFAULT_SIMILARITY_TOP_K,
[ModalityType.IMAGE]: DEFAULT_SIMILARITY_TOP_K,
};
this.filters = filters;
}
/**
@@ -443,7 +451,7 @@ export class VectorIndexRetriever implements BaseRetriever {
query: MessageContent,
type: ModalityType,
vectorStore: VectorStore,
preFilters?: MetadataFilters,
filters?: MetadataFilters,
): Promise<NodeWithScore[]> {
// convert string message to multi-modal format
if (typeof query === "string") {
@@ -460,7 +468,7 @@ export class VectorIndexRetriever implements BaseRetriever {
queryEmbedding,
mode: VectorStoreQueryMode.DEFAULT,
similarityTopK: this.topK[type],
filters: preFilters ?? undefined,
filters: this.filters ?? filters ?? undefined,
});
nodes = nodes.concat(this.buildNodeListFromQueryResult(result));
}
@@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader {
targetPages?: string;
// Whether or not to ignore and skip errors raised during parsing.
ignoreErrors: boolean = true;
// Whether to split by page using the pageSeparator or '\n---\n' as default.
splitByPage: boolean = true;
// Whether to use the vendor multimodal API.
useVendorMultimodalModel: boolean = false;
// The model name for the vendor multimodal API
@@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader {
}
// Return results as Document objects
const resultJson = await this.getJobResult(jobId, this.resultType);
const jobResults = await this.getJobResult(jobId, this.resultType);
const resultText = jobResults[this.resultType];
// Split the text by separator if splitByPage is true
if (this.splitByPage) {
return this.splitTextBySeparator(resultText);
}
return [
new Document({
text: resultJson[this.resultType],
text: resultText,
}),
];
} catch (e) {
@@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader {
return filteredParams;
}
private splitTextBySeparator(text: string): Document[] {
const separator = this.pageSeparator ?? "\n---\n";
const textChunks = text.split(separator);
return textChunks.map(
(docChunk: string) =>
new Document({
text: docChunk,
}),
);
}
static async getMimeType(
data: Uint8Array,
): Promise<{ mime: string; extension: string }> {
@@ -6,6 +6,10 @@ import { FileReader } from "./type.js";
*/
export class PDFReader extends FileReader {
async loadDataAsContent(content: Uint8Array): Promise<Document[]> {
// XXX: create a new Uint8Array to prevent "Please provide binary data as `Uint8Array`, rather than `Buffer`." error if a Buffer passed
if (content instanceof Buffer) {
content = new Uint8Array(content);
}
const { totalPages, text } = await readPDF(content);
return text.map((text, page) => {
const metadata = {
+1 -2
View File
@@ -18,8 +18,7 @@ export abstract class FileReader implements BaseReader {
): Promise<Document[]>;
async loadData(filePath: string): Promise<Document[]> {
// XXX: create a new Uint8Array to prevent "Please provide binary data as `Uint8Array`, rather than `Buffer`." error in PDFReader
const fileContent = new Uint8Array(await fs.readFile(filePath));
const fileContent = await fs.readFile(filePath);
const fileName = path.basename(filePath);
const docs = await this.loadDataAsContent(fileContent, fileName);
docs.forEach(FileReader.addMetaData(filePath));
+346 -129
View File
File diff suppressed because it is too large Load Diff