feat: Add support for the Solidity language (#1616)

This commit is contained in:
0xJordan
2023-06-13 11:32:03 +02:00
committed by GitHub
parent 8e6181f908
commit 85de6e16b9
3 changed files with 61 additions and 1 deletions
@@ -6,7 +6,7 @@ hide_table_of_contents: true
LangChain supports a variety of different markup and programming language-specific text splitters to split your text based on language-specific syntax.
This results in more semantically self-contained chunks that are more useful to a vector store or other retriever.
Popular languages like JavaScript, Python, and Rust are supported as well as Latex, HTML, and Markdown.
Popular languages like JavaScript, Python, Solidity, and Rust are supported, as well as LaTeX, HTML, and Markdown.
## Usage
@@ -287,3 +287,32 @@ test("Rust code splitter", async () => {
"}",
]);
});
// Checks that a RecursiveCharacterTextSplitter created with the "sol"
// language preset splits a small Solidity contract into the expected
// ordered list of chunks.
test("Solidity code splitter", async () => {
// A chunkSize of 16 with no overlap makes the short sample below break into
// many small chunks (see the expected list).
const splitter = RecursiveCharacterTextSplitter.fromLanguage("sol", {
chunkSize: 16,
chunkOverlap: 0,
});
// Sample input: a pragma line followed by a contract with a single function.
const code = `pragma solidity ^0.8.20;
contract HelloWorld {
function add(uint a, uint b) pure public returns(uint) {
return a + b;
}
}
`;
const chunks = await splitter.splitText(code);
// Strict equality: both the order and the exact text of every chunk must match.
expect(chunks).toStrictEqual([
"pragma solidity",
"^0.8.20;",
"contract",
"HelloWorld {",
"function",
"add(uint a,",
"uint b) pure",
"public",
"returns(uint) {",
"return a",
"+ b;",
// NOTE(review): this last chunk contains whitespace after the newline, so
// the indentation inside the template literal above is significant --
// confirm the literal's leading whitespace matches what this expects.
"}\n }",
]);
});
+31
View File
@@ -222,6 +222,7 @@ export const SupportedTextSplitterLanguages = [
"markdown",
"latex",
"html",
"sol",
] as const;
export type SupportedTextSplitterLanguage =
@@ -617,6 +618,36 @@ export class RecursiveCharacterTextSplitter
" ",
"",
];
} else if (language === "sol") {
return [
// Split along pragma and using directives
"\npragma ",
"\nusing ",
// Split along contract definitions
"\ncontract ",
"\ninterface ",
"\nlibrary ",
// Split along member definitions (constructors, functions, modifiers, events, errors, and user-defined types)
"\nconstructor ",
"\ntype ",
"\nfunction ",
"\nevent ",
"\nmodifier ",
"\nerror ",
"\nstruct ",
"\nenum ",
// Split along control flow statements
"\nif ",
"\nfor ",
"\nwhile ",
"\ndo while ",
"\nassembly ",
// Split by the normal type of lines
"\n\n",
"\n",
" ",
"",
];
} else {
throw new Error(`Language ${language} is not supported.`);
}