mirror of
https://github.com/Mintplex-Labs/langchainjs.git
synced 2026-07-01 12:17:38 -04:00
Feature: Enhanced Document Splitting with Contextual Chunk Headers (#961)
* improvement: add the option to add headers to chunks so to help LLMs and vector stores better understand linked chunks * docs: update text splitters docs to reference `chunkHeader`, `chunkOverlapHeader`, and `showChunkOverlapHeader` * fix: formatting * Move contextual headers for each chunk out of the TextSplitter constructor and into individual methods * Fix typo * Adds example result --------- Co-authored-by: Jacob Lee <jacoblee93@gmail.com>
This commit is contained in:
@@ -5,6 +5,7 @@ sidebar_position: 2
|
||||
---
|
||||
|
||||
import DocCardList from "@theme/DocCardList";
|
||||
import CodeBlock from "@theme/CodeBlock";
|
||||
|
||||
# Getting Started: Text Splitters
|
||||
|
||||
@@ -16,7 +17,18 @@ Language Models are often limited by the amount of text that you can pass to the
|
||||
|
||||
Using a Text Splitter can also help improve the results from vector store searches, since, for example, smaller chunks may sometimes be more likely to match a query. Testing different chunk sizes (and chunk overlap) is a worthwhile exercise to tailor the results to your use case.
|
||||
|
||||
## Parameters
|
||||
|
||||
- `chunkSize?: number = 1000`: The maximum number of characters in each chunk. The default value is 1000.
|
||||
- `chunkOverlap?: number = 200`: The number of overlapping characters between adjacent chunks. The default value is 200.
|
||||
|
||||
```typescript
|
||||
type TextSplitterChunkHeaderOptions = {
|
||||
chunkHeader?: string;
|
||||
chunkOverlapHeader?: string;
|
||||
appendChunkOverlapHeader?: boolean;
|
||||
};
|
||||
|
||||
interface TextSplitter {
|
||||
chunkSize: number;
|
||||
|
||||
@@ -24,15 +36,33 @@ interface TextSplitter {
|
||||
|
||||
createDocuments(
|
||||
texts: string[],
|
||||
metadatas?: Record<string, any>[]
|
||||
metadatas?: Record<string, any>[],
|
||||
chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}
|
||||
): Promise<Document[]>;
|
||||
|
||||
splitDocuments(documents: Document[]): Promise<Document[]>;
|
||||
splitDocuments(
|
||||
documents: Document[],
|
||||
chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}
|
||||
): Promise<Document[]>;
|
||||
}
|
||||
```
|
||||
|
||||
Text Splitters expose two methods, `createDocuments` and `splitDocuments`. The former takes a list of raw text strings and returns a list of documents. The latter takes a list of documents and returns a list of documents. The difference is that `createDocuments` will split the raw text strings into chunks, while `splitDocuments` will split the documents into chunks.
|
||||
|
||||
### When to use `chunkHeaderOptions`
|
||||
|
||||
Consider a scenario where you want to store a large, arbitrary collection of documents in a vector store and perform Q&A tasks on them.
|
||||
Simply splitting documents with overlapping text may not provide sufficient context for LLMs to determine if multiple chunks are referencing the same information, or how to resolve information from contradictory sources.
|
||||
|
||||
Tagging each document with metadata is a solution if you know what to filter against, but you may not know ahead of time exactly what kind of queries your vector store will be expected to handle.
|
||||
Including additional contextual information directly in each chunk in the form of headers can help deal with arbitrary queries.
|
||||
|
||||
Here's an example:
|
||||
|
||||
import ChunkHeaderExample from "@examples/indexes/text_splitter_with_chunk_header.ts";
|
||||
|
||||
<CodeBlock language="typescript">{ChunkHeaderExample}</CodeBlock>
|
||||
|
||||
## All Text Splitters
|
||||
|
||||
<DocCardList />
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
import { OpenAI } from "langchain/llms/openai";
|
||||
import { RetrievalQAChain, loadQAStuffChain } from "langchain/chains";
|
||||
import { CharacterTextSplitter } from "langchain/text_splitter";
|
||||
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
|
||||
import { HNSWLib } from "langchain/vectorstores/hnswlib";
|
||||
|
||||
const splitter = new CharacterTextSplitter({
|
||||
chunkSize: 1536,
|
||||
chunkOverlap: 200,
|
||||
});
|
||||
|
||||
const jimDocs = await splitter.createDocuments(
|
||||
[`My favorite color is blue.`],
|
||||
[],
|
||||
{
|
||||
chunkHeader: `DOCUMENT NAME: Jim Interview\n\n---\n\n`,
|
||||
appendChunkOverlapHeader: true,
|
||||
}
|
||||
);
|
||||
|
||||
const pamDocs = await splitter.createDocuments(
|
||||
[`My favorite color is red.`],
|
||||
[],
|
||||
{
|
||||
chunkHeader: `DOCUMENT NAME: Pam Interview\n\n---\n\n`,
|
||||
appendChunkOverlapHeader: true,
|
||||
}
|
||||
);
|
||||
|
||||
const vectorStore = await HNSWLib.fromDocuments(
|
||||
jimDocs.concat(pamDocs),
|
||||
new OpenAIEmbeddings()
|
||||
);
|
||||
|
||||
const model = new OpenAI({ temperature: 0 });
|
||||
|
||||
const chain = new RetrievalQAChain({
|
||||
combineDocumentsChain: loadQAStuffChain(model),
|
||||
retriever: vectorStore.asRetriever(),
|
||||
returnSourceDocuments: true,
|
||||
});
|
||||
const res = await chain.call({
|
||||
query: "What is Pam's favorite color?",
|
||||
});
|
||||
|
||||
console.log(JSON.stringify(res, null, 2));
|
||||
|
||||
/*
|
||||
{
|
||||
"text": " Red.",
|
||||
"sourceDocuments": [
|
||||
{
|
||||
"pageContent": "DOCUMENT NAME: Pam Interview\n\n---\n\nMy favorite color is red.",
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"pageContent": "DOCUMENT NAME: Jim Interview\n\n---\n\nMy favorite color is blue.",
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 1,
|
||||
"to": 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
*/
|
||||
@@ -114,6 +114,39 @@ test("Test create documents with metadata method.", async () => {
|
||||
expect(docs).toEqual(expectedDocs);
|
||||
});
|
||||
|
||||
test("Test create documents method with metadata and an added chunk header.", async () => {
|
||||
const texts = ["foo bar", "baz"];
|
||||
const splitter = new CharacterTextSplitter({
|
||||
separator: " ",
|
||||
chunkSize: 3,
|
||||
chunkOverlap: 0,
|
||||
});
|
||||
const docs = await splitter.createDocuments(
|
||||
texts,
|
||||
[{ source: "1" }, { source: "2" }],
|
||||
{
|
||||
chunkHeader: `SOURCE NAME: testing\n-----\n`,
|
||||
appendChunkOverlapHeader: true,
|
||||
}
|
||||
);
|
||||
const loc = { lines: { from: 1, to: 1 } };
|
||||
const expectedDocs = [
|
||||
new Document({
|
||||
pageContent: "SOURCE NAME: testing\n-----\nfoo",
|
||||
metadata: { source: "1", loc },
|
||||
}),
|
||||
new Document({
|
||||
pageContent: "SOURCE NAME: testing\n-----\n(cont'd) bar",
|
||||
metadata: { source: "1", loc },
|
||||
}),
|
||||
new Document({
|
||||
pageContent: "SOURCE NAME: testing\n-----\nbaz",
|
||||
metadata: { source: "2", loc },
|
||||
}),
|
||||
];
|
||||
expect(docs).toEqual(expectedDocs);
|
||||
});
|
||||
|
||||
test("Test iterative text splitter.", async () => {
|
||||
const text = `Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
|
||||
This is a weird text to write, but gotta test the splittingggg some how.\n\n
|
||||
|
||||
@@ -4,10 +4,15 @@ import { getEncoding } from "./util/tiktoken.js";
|
||||
|
||||
export interface TextSplitterParams {
|
||||
chunkSize: number;
|
||||
|
||||
chunkOverlap: number;
|
||||
}
|
||||
|
||||
export type TextSplitterChunkHeaderOptions = {
|
||||
chunkHeader?: string;
|
||||
chunkOverlapHeader?: string;
|
||||
appendChunkOverlapHeader?: boolean;
|
||||
};
|
||||
|
||||
export abstract class TextSplitter implements TextSplitterParams {
|
||||
chunkSize = 1000;
|
||||
|
||||
@@ -26,16 +31,25 @@ export abstract class TextSplitter implements TextSplitterParams {
|
||||
async createDocuments(
|
||||
texts: string[],
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
metadatas: Record<string, any>[] = []
|
||||
metadatas: Record<string, any>[] = [],
|
||||
chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}
|
||||
): Promise<Document[]> {
|
||||
// if no metadata is provided, we create an empty one for each text
|
||||
const _metadatas =
|
||||
metadatas.length > 0 ? metadatas : new Array(texts.length).fill({});
|
||||
const {
|
||||
chunkHeader = "",
|
||||
chunkOverlapHeader = "(cont'd) ",
|
||||
appendChunkOverlapHeader = false,
|
||||
} = chunkHeaderOptions;
|
||||
const documents = new Array<Document>();
|
||||
for (let i = 0; i < texts.length; i += 1) {
|
||||
const text = texts[i];
|
||||
let lineCounterIndex = 1;
|
||||
let prevChunk = null;
|
||||
for (const chunk of await this.splitText(text)) {
|
||||
let pageContent = chunkHeader;
|
||||
|
||||
// we need to count the \n that are in the text before getting removed by the splitting
|
||||
let numberOfIntermediateNewLines = 0;
|
||||
if (prevChunk) {
|
||||
@@ -48,6 +62,9 @@ export abstract class TextSplitter implements TextSplitterParams {
|
||||
numberOfIntermediateNewLines = (
|
||||
removedNewlinesFromSplittingText.match(/\n/g) || []
|
||||
).length;
|
||||
if (appendChunkOverlapHeader) {
|
||||
pageContent += chunkOverlapHeader;
|
||||
}
|
||||
}
|
||||
lineCounterIndex += numberOfIntermediateNewLines;
|
||||
const newLinesCount = (chunk.match(/\n/g) || []).length;
|
||||
@@ -64,9 +81,11 @@ export abstract class TextSplitter implements TextSplitterParams {
|
||||
..._metadatas[i],
|
||||
loc,
|
||||
};
|
||||
|
||||
pageContent += chunk;
|
||||
documents.push(
|
||||
new Document({
|
||||
pageContent: chunk,
|
||||
pageContent,
|
||||
metadata: metadataWithLinesNumber,
|
||||
})
|
||||
);
|
||||
@@ -77,13 +96,16 @@ export abstract class TextSplitter implements TextSplitterParams {
|
||||
return documents;
|
||||
}
|
||||
|
||||
async splitDocuments(documents: Document[]): Promise<Document[]> {
|
||||
async splitDocuments(
|
||||
documents: Document[],
|
||||
chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}
|
||||
): Promise<Document[]> {
|
||||
const selectedDocuments = documents.filter(
|
||||
(doc) => doc.pageContent !== undefined
|
||||
);
|
||||
const texts = selectedDocuments.map((doc) => doc.pageContent);
|
||||
const metadatas = selectedDocuments.map((doc) => doc.metadata);
|
||||
return this.createDocuments(texts, metadatas);
|
||||
return this.createDocuments(texts, metadatas, chunkHeaderOptions);
|
||||
}
|
||||
|
||||
private joinDocs(docs: string[], separator: string): string | null {
|
||||
|
||||
Reference in New Issue
Block a user