Compare commits

...

4 Commits

Author SHA1 Message Date
Alex Yang 20644eabc4 fix: ignore error 2024-07-30 09:22:15 -07:00
Alex Yang 93e86469aa Merge branch 'main' into ms/fix-splitter 2024-07-30 08:01:38 -07:00
Marcus Schiesser 2532e0e01f Create thin-pens-deliver.md 2024-07-30 20:55:12 +07:00
Marcus Schiesser b3fda249a0 fix: handling errors in splitBySentenceTokenizer 2024-07-30 15:51:58 +02:00
3 changed files with 23 additions and 3 deletions
+6
View File
@@ -0,0 +1,6 @@
---
"@llamaindex/core": patch
"@llamaindex/core-tests": patch
---
fix: handling errors in splitBySentenceTokenizer
+5 -1
View File
@@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => {
}
const tokenizer = sentenceTokenizer;
return (text: string) => {
- return tokenizer.tokenize(text);
+ try {
+ return tokenizer.tokenize(text);
+ } catch {
+ return [text];
+ }
};
};
@@ -1,7 +1,10 @@
- import { SentenceSplitter } from "@llamaindex/core/node-parser";
+ import {
+ SentenceSplitter,
+ splitBySentenceTokenizer,
+ } from "@llamaindex/core/node-parser";
import { describe, expect, test } from "vitest";
- describe("SentenceSplitter", () => {
+ describe("sentence splitter", () => {
test("initializes", () => {
const sentenceSplitter = new SentenceSplitter();
expect(sentenceSplitter).toBeDefined();
@@ -105,4 +108,11 @@ describe("SentenceSplitter", () => {
"因为他照了人类,连我都在内。",
]);
});
+ test("issue 1087 - edge case when input with brackets", () => {
+ const text =
+ "A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS).";
+ const split = splitBySentenceTokenizer();
+ expect(split(text)).toEqual([text]);
+ });
});