feat: improve reader interfaces (#498)

This commit is contained in:
Alex Yang
2024-02-07 11:44:01 -06:00
committed by GitHub
parent b8173e4c4e
commit 9b00d578bc
35 changed files with 1406 additions and 1416 deletions
+2
View File
@@ -0,0 +1,2 @@
examples/readers/data/** binary
examples/data/** binary
-17
View File
@@ -1,17 +0,0 @@
---
sidebar_position: 3
---
# Reader / Loader
LlamaIndex.TS supports easy loading of files from folders using the `SimpleDirectoryReader` class. Currently, `.txt`, `.pdf`, `.csv`, `.md` and `.docx` files are supported, with more planned in the future!
```typescript
import { SimpleDirectoryReader } from "llamaindex";
documents = new SimpleDirectoryReader().loadData("./data");
```
## API Reference
- [SimpleDirectoryReader](../api/classes/SimpleDirectoryReader.md)
+35
View File
@@ -0,0 +1,35 @@
---
sidebar_position: 3
---
import CodeBlock from "@theme/CodeBlock";
import CodeSource from "!raw-loader!../../../../examples/readers/src/simple-directory-reader";
import CodeSource2 from "!raw-loader!../../../../examples/readers/src/custom-simple-directory-reader";
# Loader
Before you can start indexing your documents, you need to load them into memory.
### SimpleDirectoryReader
[![Open in StackBlitz](https://developer.stackblitz.com/img/open_in_stackblitz.svg)](https://stackblitz.com/github/run-llama/LlamaIndexTS/tree/main/examples/readers?file=src/simple-directory-reader.ts&title=Simple%20Directory%20Reader)
LlamaIndex.TS supports easy loading of files from folders using the `SimpleDirectoryReader` class.
It is a simple reader that reads all files from a directory and its subdirectories.
<CodeBlock language="ts">{CodeSource}</CodeBlock>
Currently, it supports reading `.csv`, `.docx`, `.html`, `.md` and `.pdf` files,
but support for other file types is planned.
You can also provide a `defaultReader` as a fallback for files with unsupported extensions,
or pass additional readers via `fileExtToReader` to support more file types.
<CodeBlock language="ts" showLineNumbers metastring="{8-12,17-21}">
{CodeSource2}
</CodeBlock>
## API Reference
- [SimpleDirectoryReader](../api/classes/SimpleDirectoryReader.md)
+3 -59
View File
@@ -1,61 +1,5 @@
## Reader Examples
## LlamaIndex Reader Examples
These examples show how to use a specific reader class by loading a document and running a test query.
1. Make sure you are in `examples` directory
```bash
cd ./examples
```
2. Prepare `OPENAI_API_KEY` environment variable:
```bash
export OPENAI_API_KEY=your_openai_api_key
```
3. Run the following command to load documents and test query:
- MarkdownReader Example
```bash
npx ts-node readers/load-md.ts
```
- DocxReader Example
```bash
npx ts-node readers/load-docx.ts
```
- PdfReader Example
```bash
npx ts-node readers/load-pdf.ts
```
- HtmlReader Example
```bash
npx ts-node readers/load-html.ts
```
- CsvReader Example
```bash
npx ts-node readers/load-csv.ts
```
- NotionReader Example
```bash
export NOTION_TOKEN=your_notion_token
npx ts-node readers/load-notion.ts
```
- AssemblyAI Example
```bash
export ASSEMBLYAI_API_KEY=your_assemblyai_api_key
npx ts-node readers/load-assemblyai.ts
```shell
npm run start
```
+21
View File
@@ -0,0 +1,21 @@
{
"name": "llamaindex-loader-example",
"private": true,
"type": "module",
"scripts": {
"start": "node --loader ts-node/esm ./src/simple-directory-reader.ts",
"start:csv": "node --loader ts-node/esm ./src/csv.ts",
"start:docx": "node --loader ts-node/esm ./src/docx.ts",
"start:html": "node --loader ts-node/esm ./src/html.ts",
"start:markdown": "node --loader ts-node/esm ./src/markdown.ts",
"start:pdf": "node --loader ts-node/esm ./src/pdf.ts"
},
"dependencies": {
"llamaindex": "latest"
},
"devDependencies": {
"@types/node": "^20.11.14",
"ts-node": "^10.9.2",
"typescript": "^5.3.3"
}
}
@@ -2,7 +2,7 @@ import { program } from "commander";
import { TranscribeParams, VectorStoreIndex } from "llamaindex";
import { AudioTranscriptReader } from "llamaindex/readers/AssemblyAIReader";
import { stdin as input, stdout as output } from "node:process";
import readline from "node:readline/promises";
import { createInterface } from "node:readline/promises";
program
.option("-a, --audio [string]", "URL or path of the audio file to transcribe")
@@ -35,7 +35,7 @@ program
// Create query engine
const queryEngine = index.asQueryEngine();
const rl = readline.createInterface({ input, output });
const rl = createInterface({ input, output });
while (true) {
const query = await rl.question("Ask a question: ");
@@ -10,7 +10,7 @@ import { PapaCSVReader } from "llamaindex/readers/CSVReader";
async function main() {
// Load CSV
const reader = new PapaCSVReader();
const path = "data/titanic_train.csv";
const path = "../data/titanic_train.csv";
const documents = await reader.loadData(path);
const serviceContext = serviceContextFromDefaults({
@@ -0,0 +1,26 @@
import type { BaseReader, Document, Metadata } from "llamaindex";
import {
FILE_EXT_TO_READER,
SimpleDirectoryReader,
TextFileReader,
} from "llamaindex/readers/SimpleDirectoryReader";
class ZipReader implements BaseReader { // stub reader; registered below for the "zip" extension
  async loadData(...args: unknown[]): Promise<Document<Metadata>[]> { // async: failures reject the promise
    throw new Error("Implement me"); // placeholder until real .zip parsing is written
  }
}
const reader = new SimpleDirectoryReader();
const documents = await reader.loadData({
  directoryPath: "../data", // directory that is walked for input files
  defaultReader: new TextFileReader(), // fallback when an extension has no dedicated reader
  fileExtToReader: {
    ...FILE_EXT_TO_READER, // keep the built-in extension-to-reader mapping
    zip: new ZipReader(), // route "zip" files to the custom reader
  },
});
for (const doc of documents) {
  console.log(`document (${doc.id_}):`, doc.getText());
}
@@ -1,7 +1,7 @@
import { VectorStoreIndex } from "llamaindex";
import { DocxReader } from "llamaindex/readers/DocxReader";
const FILE_PATH = "./data/stars.docx";
const FILE_PATH = "../data/stars.docx";
const SAMPLE_QUERY = "Information about Zodiac";
async function main() {
@@ -4,7 +4,7 @@ import { HTMLReader } from "llamaindex/readers/HTMLReader";
async function main() {
// Load page
const reader = new HTMLReader();
const documents = await reader.loadData("data/18-1_Changelog.html");
const documents = await reader.loadData("../data/llamaindex.html");
// Split text and create embeddings. Store them in a VectorStoreIndex
const index = await VectorStoreIndex.fromDocuments(documents);
@@ -12,7 +12,7 @@ async function main() {
// Query the index
const queryEngine = index.asQueryEngine();
const response = await queryEngine.query({
query: "What were the notable changes in 18.1?",
query: "What can I do with LlamaIndex?",
});
// Output response
@@ -1,7 +1,7 @@
import { VectorStoreIndex } from "llamaindex";
import { MarkdownReader } from "llamaindex/readers/MarkdownReader";
const FILE_PATH = "./data/planets.md";
const FILE_PATH = "../data/planets.md";
const SAMPLE_QUERY = "List all planets";
async function main() {
@@ -3,7 +3,7 @@ import { program } from "commander";
import { VectorStoreIndex } from "llamaindex";
import { NotionReader } from "llamaindex/readers/NotionReader";
import { stdin as input, stdout as output } from "node:process";
import readline from "node:readline/promises";
import { createInterface } from "node:readline/promises";
program
.argument("[page]", "Notion page id (must be provided)")
@@ -70,7 +70,7 @@ program
// Create query engine
const queryEngine = index.asQueryEngine();
const rl = readline.createInterface({ input, output });
const rl = createInterface({ input, output });
while (true) {
const query = await rl.question("Query: ");
@@ -1,13 +1,10 @@
import { VectorStoreIndex } from "llamaindex";
import { PDFReader } from "llamaindex/readers/PDFReader";
import { resolve } from "node:path";
async function main() {
// Load PDF
const reader = new PDFReader();
const documents = await reader.loadData(
resolve(__dirname, "../data/brk-2022.pdf"),
);
const documents = await reader.loadData("../data/brk-2022.pdf");
// Split text and create embeddings. Store them in a VectorStoreIndex
const index = await VectorStoreIndex.fromDocuments(documents);
@@ -0,0 +1,10 @@
import { SimpleDirectoryReader } from "llamaindex/readers/SimpleDirectoryReader";
// or
// import { SimpleDirectoryReader } from 'llamaindex'
// Load the files found under ../data (subdirectories included).
const reader = new SimpleDirectoryReader();
const documents = await reader.loadData("../data");

// Print each loaded document's id followed by its text.
for (const doc of documents) {
  console.log(`document (${doc.id_}):`, doc.getText());
}
+11
View File
@@ -0,0 +1,11 @@
{
"compilerOptions": {
"target": "es2017",
"module": "node16",
"moduleResolution": "node16",
"outDir": "./dist",
"types": ["node"],
"skipLibCheck": true
},
"include": ["./src/**/*.ts"]
}
+1 -1
View File
@@ -1,6 +1,6 @@
{
"compilerOptions": {
"target": "es2016",
"target": "es2017",
"module": "esnext",
"moduleResolution": "bundler",
"esModuleInterop": true,
+5
View File
@@ -133,6 +133,11 @@
"import": "./dist/tools.mjs",
"require": "./dist/tools.js"
},
"./readers": {
"types": "./dist/readers.d.mts",
"import": "./dist/readers.mjs",
"require": "./dist/readers.js"
},
"./readers/AssemblyAIReader": {
"types": "./dist/readers/AssemblyAIReader.d.mts",
"import": "./dist/readers/AssemblyAIReader.mjs",
+1 -10
View File
@@ -21,16 +21,7 @@ export * from "./ingestion";
export * from "./llm";
export * from "./nodeParsers";
export * from "./postprocessors";
export * from "./readers/AssemblyAIReader";
export * from "./readers/CSVReader";
export * from "./readers/DocxReader";
export * from "./readers/HTMLReader";
export * from "./readers/MarkdownReader";
export * from "./readers/NotionReader";
export * from "./readers/PDFReader";
export * from "./readers/SimpleDirectoryReader";
export * from "./readers/SimpleMongoReader";
export * from "./readers/base";
export * from "./readers";
export * from "./selectors";
export * from "./storage";
export * from "./synthesizers";
@@ -1,5 +1,5 @@
import { BaseNode, Document } from "../Node";
import { BaseReader } from "../readers/base";
import { BaseReader } from "../readers/type";
import { BaseDocumentStore, VectorStore } from "../storage";
import { IngestionCache, getTransformationHash } from "./IngestionCache";
import { DocStoreStrategy, createDocStoreStrategy } from "./strategies";
@@ -7,7 +7,7 @@ import {
TranscriptSentence,
} from "assemblyai";
import { Document } from "../Node";
import { BaseReader } from "./base";
import { BaseReader } from "./type";
type AssemblyAIOptions = Partial<BaseServiceParams>;
@@ -39,7 +39,7 @@ abstract class AssemblyAIReader implements BaseReader {
this.client = new AssemblyAI(options as BaseServiceParams);
}
abstract loadData(...args: any[]): Promise<Document[]>;
abstract loadData(params: TranscribeParams | string): Promise<Document[]>;
protected async transcribeOrGetTranscript(params: TranscribeParams | string) {
if (typeof params === "string") {
+2 -2
View File
@@ -2,14 +2,14 @@ import Papa, { ParseConfig } from "papaparse";
import { Document } from "../Node";
import { defaultFS } from "../env";
import { GenericFileSystem } from "../storage/FileSystem";
import { BaseReader } from "./base";
import { FileReader } from "./type";
/**
* papaparse-based csv parser
* @class PapaCSVReader
* @implements BaseReader
*/
export class PapaCSVReader implements BaseReader {
export class PapaCSVReader implements FileReader {
private concatRows: boolean;
private colJoiner: string;
private rowJoiner: string;
+2 -2
View File
@@ -2,9 +2,9 @@ import mammoth from "mammoth";
import { Document } from "../Node";
import { defaultFS } from "../env";
import { GenericFileSystem } from "../storage/FileSystem";
import { BaseReader } from "./base";
import { FileReader } from "./type";
export class DocxReader implements BaseReader {
export class DocxReader implements FileReader {
/** DocxParser */
async loadData(
file: string,
+2 -2
View File
@@ -1,7 +1,7 @@
import { Document } from "../Node";
import { defaultFS } from "../env";
import { GenericFileSystem } from "../storage/FileSystem";
import { BaseReader } from "./base";
import { FileReader } from "./type";
/**
* Extract the significant text from an arbitrary HTML document.
@@ -10,7 +10,7 @@ import { BaseReader } from "./base";
* All other tags are removed, and the inner text is kept intact.
* Html entities (e.g., &amp;) are not decoded.
*/
export class HTMLReader implements BaseReader {
export class HTMLReader implements FileReader {
/**
* Public method for this reader.
* Required by BaseReader interface.
+2 -2
View File
@@ -1,12 +1,12 @@
import { Document, ImageDocument } from "../Node";
import { defaultFS } from "../env";
import { GenericFileSystem } from "../storage/FileSystem";
import { BaseReader } from "./base";
import { FileReader } from "./type";
/**
* Reads the content of an image file into a Document object (which stores the image file as a Blob).
*/
export class ImageReader implements BaseReader {
export class ImageReader implements FileReader {
/**
* Public method for this reader.
* Required by BaseReader interface.
+2 -2
View File
@@ -1,7 +1,7 @@
import { Document } from "../Node";
import { defaultFS } from "../env";
import { GenericFileSystem } from "../storage";
import { BaseReader } from "./base";
import { FileReader } from "./type";
type MarkdownTuple = [string | null, string];
@@ -9,7 +9,7 @@ type MarkdownTuple = [string | null, string];
* Extract text from markdown files.
* Returns dictionary with keys as headers and values as the text between headers.
*/
export class MarkdownReader implements BaseReader {
export class MarkdownReader implements FileReader {
private _removeHyperlinks: boolean;
private _removeImages: boolean;
+1 -1
View File
@@ -1,7 +1,7 @@
import { Client } from "@notionhq/client";
import { crawler, Crawler, Pages, pageToString } from "notion-md-crawler";
import { Document } from "../Node";
import { BaseReader } from "./base";
import { BaseReader } from "./type";
type OptionalSerializers = Parameters<Crawler>[number]["serializers"];
+1 -1
View File
@@ -1,7 +1,7 @@
import { Document } from "../Node";
import { createSHA256, defaultFS } from "../env";
import { GenericFileSystem } from "../storage/FileSystem";
import { BaseReader } from "./base";
import { BaseReader } from "./type";
/**
* Read the text of a PDF
@@ -1,6 +1,5 @@
import _ from "lodash";
import { Document } from "../Node";
import { defaultFS } from "../env";
import { defaultFS, path } from "../env";
import { CompleteFileSystem, walk } from "../storage/FileSystem";
import { PapaCSVReader } from "./CSVReader";
import { DocxReader } from "./DocxReader";
@@ -8,7 +7,7 @@ import { HTMLReader } from "./HTMLReader";
import { ImageReader } from "./ImageReader";
import { MarkdownReader } from "./MarkdownReader";
import { PDFReader } from "./PDFReader";
import { BaseReader } from "./base";
import { BaseReader } from "./type";
type ReaderCallback = (
category: "file" | "directory",
@@ -57,13 +56,17 @@ export type SimpleDirectoryReaderLoadDataParams = {
};
/**
* Read all of the documents in a directory.
* Read all the documents in a directory.
* By default, supports the list of file types
* in the FILE_EXT_TO_READER map.
*/
export class SimpleDirectoryReader implements BaseReader {
constructor(private observer?: ReaderCallback) {}
async loadData(
params: SimpleDirectoryReaderLoadDataParams,
): Promise<Document[]>;
async loadData(directoryPath: string): Promise<Document[]>;
async loadData(
params: SimpleDirectoryReaderLoadDataParams | string,
): Promise<Document[]> {
@@ -88,7 +91,7 @@ export class SimpleDirectoryReader implements BaseReader {
let docs: Document[] = [];
for await (const filePath of walk(fs, directoryPath)) {
try {
const fileExt = _.last(filePath.split(".")) || "";
const fileExt = path.extname(filePath).slice(1).toLowerCase();
// Observer can decide to skip each file
if (!this.doObserverCheck("file", filePath, ReaderStatus.STARTED)) {
@@ -96,11 +99,11 @@ export class SimpleDirectoryReader implements BaseReader {
continue;
}
let reader = null;
let reader: BaseReader;
if (fileExt in fileExtToReader) {
reader = fileExtToReader[fileExt];
} else if (!_.isNil(defaultReader)) {
} else if (defaultReader != null) {
reader = defaultReader;
} else {
const msg = `No reader for file extension of ${filePath}`;
@@ -1,6 +1,6 @@
import { MongoClient } from "mongodb";
import { Document, Metadata } from "../Node";
import { BaseReader } from "./base";
import { BaseReader } from "./type";
/**
* Read in from MongoDB
-8
View File
@@ -1,8 +0,0 @@
import { Document } from "../Node";
/**
 * A reader imports data into Document objects.
 *
 * `loadData` accepts implementation-defined arguments and resolves to the
 * array of Documents that were read.
 */
export interface BaseReader {
loadData(...args: any[]): Promise<Document[]>;
}
+11
View File
@@ -0,0 +1,11 @@
export * from "./AssemblyAIReader";
export * from "./CSVReader";
export * from "./DocxReader";
export * from "./HTMLReader";
export * from "./ImageReader";
export * from "./MarkdownReader";
export * from "./NotionReader";
export * from "./PDFReader";
export * from "./SimpleDirectoryReader";
export * from "./SimpleMongoReader";
export * from "./type";
+16
View File
@@ -0,0 +1,16 @@
import { Document } from "../Node";
import { CompleteFileSystem } from "../storage";
/**
 * A reader imports data into Document objects.
 */
export interface BaseReader {
/**
 * Loads data and converts it into Documents.
 *
 * @param args - Implementation-defined arguments; each reader declares its own.
 * @returns A promise resolving to the Documents that were read.
 */
loadData(...args: unknown[]): Promise<Document[]>;
}
/**
 * A reader that takes a file path and imports the file's data into Document objects.
 *
 * Narrows `BaseReader.loadData` to a file path plus an optional file system.
 */
export interface FileReader extends BaseReader {
/**
 * Loads a single file and converts it into Documents.
 *
 * @param filePath - Path of the file to read.
 * @param fs - Optional file system; presumably used to read `filePath` in place
 *   of a default one. NOTE(review): confirm against the implementing readers.
 * @returns A promise resolving to the Documents read from the file.
 */
loadData(filePath: string, fs?: CompleteFileSystem): Promise<Document[]>;
}
+1224 -1285
View File
File diff suppressed because it is too large Load Diff
+1
View File
@@ -2,3 +2,4 @@ packages:
- "apps/*"
- "packages/*"
- "examples/"
- "examples/*"
+3
View File
@@ -28,6 +28,9 @@
},
{
"path": "./examples"
},
{
"path": "./examples/readers"
}
]
}