Compare commits

...

16 Commits

Author SHA1 Message Date
Clelia (Astra) Bertelli d073d6ed30 docs: add link to example in extract.md 2025-08-07 12:16:54 +02:00
Clelia (Astra) Bertelli 848fb8973d docs: adding example 2025-08-06 17:56:08 +02:00
Clelia (Astra) Bertelli 78cf778613 fix: change agent name 2025-08-06 13:05:11 +02:00
Clelia (Astra) Bertelli 535bfb5433 fix: infer file type 2025-08-06 13:01:00 +02:00
Clelia (Astra) Bertelli c48c1cd6c4 fix: infer file type 2025-08-06 12:59:51 +02:00
Clelia (Astra) Bertelli e22c99bc62 chore: intervals are now in seconds, extractStateless -> extract, support for multiple file types 2025-08-06 12:19:00 +02:00
Clelia (Astra) Bertelli 7e9768396d Merge branch 'main' into clelia/add-extract-to-ts 2025-08-06 11:26:53 +02:00
Clelia (Astra) Bertelli 8f221541eb correct stateless extraction test 2025-08-05 17:32:37 +02:00
Clelia (Astra) Bertelli 52d8083893 correct stateless extraction test 2025-08-05 17:08:10 +02:00
Clelia (Astra) Bertelli 30f12c91d6 refactor: working LlamaExtract + tests 2025-08-05 16:56:20 +02:00
Clelia (Astra) Bertelli dd33455413 refactor: working LlamaExtract + tests 2025-08-05 16:56:08 +02:00
Clelia (Astra) Bertelli 491900f628 merge main 2025-08-05 12:49:11 +02:00
Clelia (Astra) Bertelli 2fa68bc4be feat: add stateless api support and retries mechanisms 2025-08-05 12:42:10 +02:00
Clelia (Astra) Bertelli ec9dbc0495 ci: lint 2025-08-04 17:47:10 +02:00
Clelia (Astra) Bertelli 35c4fd6478 feat: main implementation (untested) 2025-08-04 17:30:15 +02:00
Clelia (Astra) Bertelli 6a7872b9e5 wip: implementing Extract in TS 2025-08-04 16:02:45 +02:00
29 changed files with 5299 additions and 12 deletions
+122
View File
@@ -0,0 +1,122 @@
# LlamaExtract Demo
A TypeScript demo application showcasing the power of **LlamaExtract** - an agentic structured data extraction service from [LlamaCloud](https://cloud.llamaindex.ai). This demo lets you extract structured information from scientific papers and render it as nicely formatted markdown.
## Table of Contents
- [Features](#features)
- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Usage](#usage)
- [Start the Demo](#start-the-demo)
- [Development Mode](#development-mode)
- [Build the Project](#build-the-project)
- [Code Quality](#code-quality)
- [Quick Commands Reference](#quick-commands-reference)
- [How It Works](#how-it-works)
- [API Dependencies](#api-dependencies)
- [Troubleshooting](#troubleshooting)
- [Common Issues](#common-issues)
- [License](#license)
- [Contributing](#contributing)
## Features
- 📄 **Structured Data Extraction**: Extract data from your files effortlessly, and structure them the way you want!
- 🤖 **Markdown Rendering**: Generate markdown directly from your extracted data
- 🎨 **Beautiful CLI**: Styled console interface with colors and ASCII art
- ⚡ **Fast Development**: Hot reload support with watch mode
- 🛠️ **TypeScript**: Full TypeScript support with strict type checking
## Prerequisites
- Node.js (version 18 or higher)
- npm package manager (bundled with Node.js)
- LlamaCloud API key
## Installation
1. Clone the repository:
```bash
git clone https://github.com/run-llama/llama_cloud_services
cd llama_cloud_services/examples-ts/extract/
```
2. Install dependencies:
```bash
npm install
```
3. Set up your environment variables:
```bash
# Add your API key to your environment
export LLAMA_CLOUD_API_KEY="your-llamacloud-api-key"
```
## Usage
### Start the Demo
```bash
npm run start
```
The application will display a welcome screen and prompt you to enter the path to a document you'd like to process.
### Development Mode
For development with hot reload:
```bash
npm run dev
```
### Build the Project
```bash
npm run build
```
### Code Quality
Format code:
```bash
npm run format
```
Lint code:
```bash
npm run lint
```
## How It Works
1. **Document Input**: Enter the path to your document when prompted
2. **Extraction**: LlamaExtract processes the document and extracts structured data according to the schema defined [here](./src/schema.ts)
3. **Markdown Rendering**: The extracted content is rendered into beautiful markdown
4. **Results**: View the results directly in your terminal
## Troubleshooting
### Common Issues
1. **Module Resolution Errors**: Ensure you're using Node.js 18+ and have all dependencies installed
2. **API Key Issues**: Verify your LlamaCloud API key is correctly set
3. **File Path Errors**: Use absolute paths or ensure relative paths are correct from the project root
## License
MIT License - see the [LICENSE](../../LICENSE) file for details.
## Contributing
1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Run `npm run format` and `npm run lint`
5. Submit a pull request
+14
View File
@@ -0,0 +1,14 @@
import js from "@eslint/js";
import globals from "globals";
import tseslint from "typescript-eslint";
import { defineConfig } from "eslint/config";
// ESLint flat config for the demo: the core JS recommended rules for every
// JS/TS source file, followed by the typescript-eslint recommended preset.
export default defineConfig([
  {
    files: ["**/*.{js,mjs,cjs,ts,mts,cts}"],
    plugins: { js },
    extends: ["js/recommended"],
    // The demo is a Node.js CLI (it reads `process.env` and `process.stdin`),
    // so expose Node globals instead of browser ones.
    languageOptions: { globals: globals.node },
  },
  tseslint.configs.recommended,
]);
File diff suppressed because it is too large Load Diff
+37
View File
@@ -0,0 +1,37 @@
{
"name": "llama-extract-demo",
"version": "0.1.0",
"description": "Demo for LlamaExtract in TypeScript",
"main": "index.js",
"scripts": {
"test": "echo \"There are no tests\"",
"start": "npm exec tsx src/index.ts",
"lint": "eslint ./src/",
"format": "prettier --write ./src/",
"build": "tsc",
"dev": "npm exec tsx --watch src/index.ts"
},
"author": "LlamaIndex",
"license": "MIT",
"dependencies": {
"cli-markdown": "^3.5.1",
"consola": "^3.4.2",
"figlet": "^1.8.2",
"llama-cloud-services": "file:../../ts/llama_cloud_services",
"marked": "^15.0.12",
"marked-terminal": "^7.3.0",
"picocolors": "^1.1.1"
},
"devDependencies": {
"@eslint/js": "^9.32.0",
"@types/figlet": "^1.7.0",
"@types/marked-terminal": "^6.1.1",
"@types/node": "^24.2.0",
"eslint": "^9.32.0",
"globals": "^16.3.0",
"jiti": "^2.5.1",
"prettier": "^3.6.2",
"typescript": "^5.9.2",
"typescript-eslint": "^8.39.0"
}
}
+47
View File
@@ -0,0 +1,47 @@
import { LlamaExtract, ExtractConfig } from "llama-cloud-services";
import cliMarkdown from "cli-markdown";
import { logger } from "./logger";
import pc from "picocolors";
import { consoleInput, renderLogo } from "./utils";
import { dataSchema } from "./schema";
import { renderMarkdown, ResearchData } from "./markdown";
/**
 * Runs the interactive LlamaExtract demo.
 *
 * Prints the logo and a welcome banner, then repeatedly asks for a file path,
 * runs a stateless extraction against `dataSchema`, and prints the result as
 * terminal-rendered markdown. Errors for a single file are logged and the
 * loop continues, so one bad path does not end the session.
 *
 * @returns `0` once the user types `quit` (case-insensitive).
 */
export async function main(): Promise<number> {
  // The key is passed through as-is (possibly undefined); no non-null
  // assertion is needed because nothing here dereferences it.
  const extractClient = new LlamaExtract(
    process.env.LLAMA_CLOUD_API_KEY,
    "https://api.cloud.llamaindex.ai",
  );

  await renderLogo();
  logger.log(
    `Welcome to ${pc.bold(
      pc.magentaBright("LlamaExtract Demo✨"),
    )}, our demo for ${pc.bold(pc.green("LlamaExtract"))}, a ${pc.bold(
      pc.cyan("LlamaCloud☁️"),
    )} (https://cloud.llamaindex.ai) product!\nIn this demo we are going to try extracting relevant information ${pc.bold(
      pc.yellowBright("from scientific papers"),
    )}. Type the path to the paper you would like to process below👇\nIf you wish to exit, just type ${pc.bold(
      pc.gray("quit"),
    )}.\n`,
  );

  while (true) {
    // Trim so stray whitespace (e.g. from a pasted path) neither defeats the
    // `quit` check nor ends up in the file path.
    const filePath = (await consoleInput()).trim();
    if (filePath.toLowerCase() === "quit") {
      break;
    }
    if (filePath === "") {
      // Nothing typed: prompt again instead of sending an empty path.
      continue;
    }

    try {
      const generatedData = await extractClient.extract(
        dataSchema,
        {} as ExtractConfig,
        filePath,
      );
      // `extract` may resolve to undefined; rendering that would throw a
      // confusing destructuring error, so report it explicitly.
      if (generatedData?.data == null) {
        logger.error(`No data could be extracted from: ${filePath}`);
        continue;
      }
      const research = renderMarkdown(generatedData.data as ResearchData);
      logger.log(`${pc.bold(pc.cyan("Extracted information:✨"))}\n`);
      logger.log(cliMarkdown(research));
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      logger.error(`Error processing file: ${reason}`);
    }
  }
  return 0;
}
main().catch(console.error);
+8
View File
@@ -0,0 +1,8 @@
import { createConsola } from "consola";
import type { ConsolaInstance } from "consola";
// Consola options for the demo: log lines are printed without a date.
const loggerOptions = {
  formatOptions: {
    date: false,
  },
};

/** Shared consola logger used for all demo output. */
export const logger: ConsolaInstance = createConsola(loggerOptions);
+172
View File
@@ -0,0 +1,172 @@
/** A paper author; only the name is mandatory. */
type Author = {
  name: string;
  affiliation?: string;
  email?: string;
};

/** How the research was carried out. */
type Methodology = {
  approach?: string;
  participants?: string;
  methods?: string[];
};

/** A single reported result. */
type Result = {
  finding?: string;
  significance?: string;
  supportingData?: string;
};

/** A cited work. */
type Reference = {
  title: string;
  authors: string;
  year?: string;
  relevance?: string;
};

/** Discussion points, grouped by kind. */
type Discussion = {
  implications?: string[];
  limitations?: string[];
  futureWork?: string[];
};

/** Where and when the paper was published. */
type Publication = {
  journal?: string;
  year: string;
  doi?: string;
  url?: string;
};

/** Structured description of a research paper, as consumed by {@link renderMarkdown}. */
export type ResearchData = {
  title: string;
  authors: Author[];
  abstract: string;
  keywords?: string[];
  mainFindings: string[];
  methodology?: Methodology;
  results?: Result[];
  discussion?: Discussion;
  references?: Reference[];
  publication?: Publication;
};

/** Formats `items` as a Markdown bullet list, one item per line. */
function bulletList(items: readonly string[], indent = ""): string {
  return items.map((item) => `${indent}- ${item}`).join("\n");
}

/**
 * Renders a {@link ResearchData} record as a Markdown document.
 *
 * Sections appear in a fixed order: title, authors, abstract, keywords, main
 * findings, methodology, results, discussion, references, publication.
 * Optional sections are omitted when they have nothing to show, so no heading
 * is ever emitted without content beneath it.
 *
 * @param data The paper to render.
 * @returns The Markdown source, with sections separated by blank lines.
 */
export function renderMarkdown(data: ResearchData): string {
  const {
    title,
    abstract,
    keywords,
    methodology,
    results,
    discussion,
    references,
    publication,
  } = data;
  // Callers may hand in loosely-typed data (e.g. a cast API payload), so the
  // "required" arrays are defaulted instead of trusted blindly.
  const authors = data.authors ?? [];
  const mainFindings = data.mainFindings ?? [];

  const md: string[] = [];

  md.push(`# ${title}\n`);

  // Authors
  md.push(`## Authors`);
  md.push(
    authors
      .map(
        (author) =>
          `- **${author.name}**${
            author.affiliation ? `, *${author.affiliation}*` : ""
          }${author.email ? ` (${author.email})` : ""}`,
      )
      .join("\n"),
  );

  // Abstract
  md.push(`\n## Abstract\n${abstract}`);

  // Keywords
  if (keywords?.length) {
    md.push(`\n## Keywords\n${bulletList(keywords)}`);
  }

  // Main Findings
  md.push(`\n## Main Findings\n${bulletList(mainFindings)}`);

  // Methodology: each field is its own list item. Bare adjacent lines would
  // be merged into a single paragraph by Markdown renderers.
  if (methodology) {
    const lines: string[] = [];
    if (methodology.approach) {
      lines.push(`- **Approach:** ${methodology.approach}`);
    }
    if (methodology.participants) {
      lines.push(`- **Participants:** ${methodology.participants}`);
    }
    if (methodology.methods?.length) {
      lines.push(`- **Methods:**\n${bulletList(methodology.methods, "  ")}`);
    }
    if (lines.length > 0) {
      md.push(`\n## Methodology`, ...lines);
    }
  }

  // Results
  if (results?.length) {
    md.push(`\n## Results`);
    results.forEach((result, i) => {
      md.push(`\n### Result ${i + 1}`);
      if (result.finding) md.push(`- **Finding:** ${result.finding}`);
      if (result.significance)
        md.push(`- **Significance:** ${result.significance}`);
      if (result.supportingData)
        md.push(`- **Supporting Data:** ${result.supportingData}`);
    });
  }

  // Discussion
  if (discussion) {
    const subsections: string[] = [];
    const addSubsection = (heading: string, items?: string[]): void => {
      if (items?.length) {
        subsections.push(`### ${heading}\n${bulletList(items)}`);
      }
    };
    addSubsection("Implications", discussion.implications);
    addSubsection("Limitations", discussion.limitations);
    addSubsection("Future Work", discussion.futureWork);
    if (subsections.length > 0) {
      md.push(`\n## Discussion`, ...subsections);
    }
  }

  // References
  if (references?.length) {
    md.push(`\n## References`);
    references.forEach((ref, i) => {
      md.push(
        `\n**[${i + 1}]** ${ref.title} — *${ref.authors}*${
          ref.year ? ` (${ref.year})` : ""
        }`,
      );
      if (ref.relevance) md.push(`> ${ref.relevance}`);
    });
  }

  // Publication Info
  if (publication) {
    const lines: string[] = [];
    if (publication.journal) lines.push(`- **Journal:** ${publication.journal}`);
    if (publication.year) lines.push(`- **Year:** ${publication.year}`);
    if (publication.doi) lines.push(`- **DOI:** ${publication.doi}`);
    if (publication.url)
      lines.push(`- **URL:** [${publication.url}](${publication.url})`);
    if (lines.length > 0) {
      md.push(`\n## Publication`, ...lines);
    }
  }

  return md.join("\n");
}
+169
View File
@@ -0,0 +1,169 @@
/**
 * JSON schema describing the fields to extract from a research paper.
 *
 * The property names mirror the `ResearchData` shape that the markdown
 * renderer consumes. Top-level `required` lists the fields a result must
 * always contain; nested objects declare their own `required` keys.
 */
export const dataSchema = {
  type: "object",
  required: ["title", "authors", "abstract", "mainFindings"],
  properties: {
    title: {
      type: "string",
      description: "The full title of the research paper",
    },
    authors: {
      type: "array",
      description: "List of all authors of the paper",
      items: {
        type: "object",
        properties: {
          name: {
            type: "string",
            description: "Full name of the author",
          },
          affiliation: {
            type: "string",
            description:
              "Institution or organization the author is affiliated with",
          },
          email: {
            type: "string",
            description: "Contact email of the author if provided",
          },
        },
        // An author entry without a name cannot be rendered; this matches the
        // `required` declarations on `references` and `publication` below.
        required: ["name"],
      },
    },
    abstract: {
      type: "string",
      description: "Complete abstract or summary of the paper",
    },
    keywords: {
      type: "array",
      description:
        "Key terms and phrases that describe the paper's main topics",
      items: {
        type: "string",
      },
    },
    mainFindings: {
      type: "array",
      description: "Key findings, conclusions, or contributions of the paper",
      items: {
        type: "string",
      },
    },
    methodology: {
      type: "object",
      description: "Research methods and approaches used",
      properties: {
        approach: {
          type: "string",
          description: "Overall research approach or study design",
        },
        participants: {
          type: "string",
          description: "Description of study participants or data sources",
        },
        methods: {
          type: "array",
          description: "Specific methods, techniques, or tools used",
          items: {
            type: "string",
          },
        },
      },
    },
    results: {
      type: "array",
      description: "Main results and outcomes of the research",
      items: {
        type: "object",
        properties: {
          finding: {
            type: "string",
            description: "Description of the specific result or finding",
          },
          significance: {
            type: "string",
            description:
              "Statistical significance or importance of the finding",
          },
          supportingData: {
            type: "string",
            description: "Relevant statistics, measurements, or data points",
          },
        },
      },
    },
    discussion: {
      type: "object",
      properties: {
        implications: {
          type: "array",
          description: "Theoretical or practical implications of the findings",
          items: {
            type: "string",
          },
        },
        limitations: {
          type: "array",
          description: "Study limitations or constraints",
          items: {
            type: "string",
          },
        },
        futureWork: {
          type: "array",
          description: "Suggested future research directions",
          items: {
            type: "string",
          },
        },
      },
    },
    references: {
      type: "array",
      description:
        "Key papers cited that are crucial to understanding this work",
      items: {
        type: "object",
        properties: {
          title: {
            type: "string",
            description: "Title of the cited paper",
          },
          authors: {
            type: "string",
            description: "Authors of the cited paper",
          },
          year: {
            type: "string",
            description: "Publication year",
          },
          relevance: {
            type: "string",
            description: "Why this reference is important to the current paper",
          },
        },
        required: ["title", "authors"],
      },
    },
    publication: {
      type: "object",
      properties: {
        journal: {
          type: "string",
          description: "Name of the journal or conference",
        },
        year: {
          type: "string",
          description: "Year of publication",
        },
        doi: {
          type: "string",
          description: "Digital Object Identifier (DOI) of the paper",
        },
        url: {
          type: "string",
          description: "URL where the paper can be accessed",
        },
      },
      required: ["year"],
    },
  },
};
+4
View File
@@ -0,0 +1,4 @@
// Minimal ambient typings for the `cli-markdown` package, covering only the
// default export that the demo calls.
declare module "cli-markdown" {
  /** Takes markdown source and returns a string (presumably terminal-formatted — confirm against the package). */
  export default function cliMarkdown(input: string): string;
}
+33
View File
@@ -0,0 +1,33 @@
import * as readline from "readline/promises";
import figlet from "figlet";
import pc from "picocolors";
/**
 * Prints the "Extract Demo" ASCII-art banner to stdout.
 *
 * Output order: a blank spacer, the bold bright-red figlet logo, a gray
 * 60-character rule, and a closing blank spacer.
 */
export async function renderLogo(): Promise<void> {
  const banner = pc.bold(
    pc.redBright(
      figlet.textSync("Extract Demo", {
        font: "ANSI Shadow",
        horizontalLayout: "default",
        verticalLayout: "default",
        width: 100,
        whitespaceBreak: true,
      }),
    ),
  );
  const divider = pc.gray("─".repeat(60));

  // The "\n" entries act as top and bottom padding around the logo.
  for (const chunk of ["\n", banner, divider, "\n"]) {
    console.log(chunk);
  }
}
/**
 * Asks a single question on the terminal and resolves with the user's answer.
 *
 * A fresh readline interface is created for each call and is always closed
 * afterwards, even when the prompt is rejected, so stdin is never left held
 * open by a dangling interface.
 *
 * @param prompt Text shown before the cursor; defaults to the file-path prompt.
 * @returns The line the user typed.
 */
export async function consoleInput(
  prompt: string = "Path to your file: ",
): Promise<string> {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  try {
    return await rl.question(prompt);
  } finally {
    rl.close();
  }
}
+4 -4
View File
@@ -40,8 +40,8 @@ A TypeScript demo application showcasing the power of **LlamaCloud Index** - a f
1. Clone the repository:
```bash
git clone <repository-url>
cd llamaparse-demo
git clone https://github.com/run-llama/llama_cloud_services
cd llama_cloud_services/examples-ts/index/
```
2. Install dependencies:
@@ -120,12 +120,12 @@ pnpm run lint
## License
MIT License - see the [LICENSE](../../../LICENSE) file for details.
MIT License - see the [LICENSE](../../LICENSE) file for details.
## Contributing
1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Run `pnpm format` and `pnpm lint`
4. Run `pnpm run format` and `pnpm run lint`
5. Submit a pull request
+1 -1
View File
@@ -42,7 +42,7 @@
"consola": "^3.4.2",
"dotenv": "^17.2.1",
"figlet": "^1.8.2",
"llama-cloud-services": "link:../../../ts/llama_cloud_services",
"llama-cloud-services": "link:../../ts/llama_cloud_services",
"picocolors": "^1.1.1"
}
}
+4 -4
View File
@@ -40,8 +40,8 @@ A TypeScript demo application showcasing the power of **LlamaParse** - an intell
1. Clone the repository:
```bash
git clone <repository-url>
cd llamaparse-demo
git clone https://github.com/run-llama/llama_cloud_services
cd llama_cloud_services/examples-ts/parse/
```
2. Install dependencies:
@@ -113,12 +113,12 @@ pnpm run lint
## License
MIT License - see the [LICENSE](../../../LICENSE) file for details.
MIT License - see the [LICENSE](../../LICENSE) file for details.
## Contributing
1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Run `pnpm format` and `pnpm lint`
4. Run `pnpm run format` and `pnpm run lint`
5. Submit a pull request
+1 -1
View File
@@ -41,7 +41,7 @@
"ai": "^4.3.19",
"consola": "^3.4.2",
"figlet": "^1.8.2",
"llama-cloud-services": "link:../../../ts/llama_cloud_services",
"llama-cloud-services": "link:../../ts/llama_cloud_services",
"picocolors": "^1.1.1"
}
}
+1
View File
@@ -327,4 +327,5 @@ Another option (orthogonal to the above) is to break the document into smaller s
## Additional Resources
- [Example Notebook](docs/examples-py/extract/resume_screening.ipynb) - Detailed walkthrough of resume parsing
- [Example Application with TypeScript](./examples-ts/extract/) - End-to-end examples using LlamaExtract TypeScript client.
- [Discord Community](https://discord.com/invite/eN6D2HQ4aX) - Get help and share feedback
+63
View File
@@ -23,6 +23,9 @@ importers:
ajv:
specifier: ^8.17.1
version: 8.17.1
file-type:
specifier: ^21.0.0
version: 21.0.0
p-retry:
specifier: ^6.2.1
version: 6.2.1
@@ -92,6 +95,8 @@ importers:
ts/llama_cloud_services/beta/agent: {}
ts/llama_cloud_services/extract: {}
ts/llama_cloud_services/parse: {}
ts/llama_cloud_services/reader: {}
@@ -681,6 +686,13 @@ packages:
'@swc/types@0.1.23':
resolution: {integrity: sha512-u1iIVZV9Q0jxY+yM2vw/hZGDNudsN85bBpTqzAQ9rzkxW9D+e3aEM4Han+ow518gSewkXgjmEK0BD79ZcNVgPw==}
'@tokenizer/inflate@0.2.7':
resolution: {integrity: sha512-MADQgmZT1eKjp06jpI2yozxaU9uVs4GzzgSL+uEq7bVcJ9V1ZXQkeGNql1fsSI0gMy1vhvNTNbUqrx+pZfJVmg==}
engines: {node: '>=18'}
'@tokenizer/token@0.3.0':
resolution: {integrity: sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==}
'@types/estree@1.0.8':
resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==}
@@ -1097,6 +1109,10 @@ packages:
resolution: {integrity: sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==}
engines: {node: '>=16.0.0'}
file-type@21.0.0:
resolution: {integrity: sha512-ek5xNX2YBYlXhiUXui3D/BXa3LdqPmoLJ7rqEx2bKJ7EAUEfmXgW0Das7Dc6Nr9MvqaOnIqiPV0mZk/r/UpNAg==}
engines: {node: '>=20'}
fill-range@7.1.1:
resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==}
engines: {node: '>=8'}
@@ -1182,6 +1198,9 @@ packages:
html-escaper@2.0.2:
resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==}
ieee754@1.2.1:
resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
ignore@5.3.2:
resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==}
engines: {node: '>= 4'}
@@ -1632,6 +1651,10 @@ packages:
resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
engines: {node: '>=8'}
strtok3@10.3.4:
resolution: {integrity: sha512-KIy5nylvC5le1OdaaoCJ07L+8iQzJHGH6pWDuzS+d07Cu7n1MZ2x26P8ZKIWfbK02+XIL8Mp4RkWeqdUCrDMfg==}
engines: {node: '>=18'}
supports-color@7.2.0:
resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==}
engines: {node: '>=8'}
@@ -1674,6 +1697,10 @@ packages:
resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==}
engines: {node: '>=8.0'}
token-types@6.0.4:
resolution: {integrity: sha512-MD9MjpVNhVyH4fyd5rKphjvt/1qj+PtQUz65aFqAZA6XniWAuSFRjLk3e2VALEFlh9OwBpXUN7rfeqSnT/Fmkw==}
engines: {node: '>=14.16'}
totalist@3.0.1:
resolution: {integrity: sha512-sf4i37nQ2LBx4m3wB74y+ubopq6W/dIzXg0FDGjsYnZHVa1Da8FH853wlL2gtUhg+xJXjfk3kUZS3BRoQeoQBQ==}
engines: {node: '>=6'}
@@ -1745,6 +1772,10 @@ packages:
engines: {node: '>=0.8.0'}
hasBin: true
uint8array-extras@1.4.0:
resolution: {integrity: sha512-ZPtzy0hu4cZjv3z5NW9gfKnNLjoz4y6uv4HlelAjDK7sY/xOkKZv9xK/WQpcsBB3jEybChz9DPC2U/+cusjJVQ==}
engines: {node: '>=18'}
undici-types@6.21.0:
resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==}
@@ -2312,6 +2343,16 @@ snapshots:
dependencies:
'@swc/counter': 0.1.3
'@tokenizer/inflate@0.2.7':
dependencies:
debug: 4.4.1
fflate: 0.8.2
token-types: 6.0.4
transitivePeerDependencies:
- supports-color
'@tokenizer/token@0.3.0': {}
'@types/estree@1.0.8': {}
'@types/json-schema@7.0.15': {}
@@ -2815,6 +2856,15 @@ snapshots:
dependencies:
flat-cache: 4.0.1
file-type@21.0.0:
dependencies:
'@tokenizer/inflate': 0.2.7
strtok3: 10.3.4
token-types: 6.0.4
uint8array-extras: 1.4.0
transitivePeerDependencies:
- supports-color
fill-range@7.1.1:
dependencies:
to-regex-range: 5.0.1
@@ -2903,6 +2953,8 @@ snapshots:
html-escaper@2.0.2: {}
ieee754@1.2.1: {}
ignore@5.3.2: {}
ignore@7.0.5: {}
@@ -3328,6 +3380,10 @@ snapshots:
strip-json-comments@3.1.1: {}
strtok3@10.3.4:
dependencies:
'@tokenizer/token': 0.3.0
supports-color@7.2.0:
dependencies:
has-flag: 4.0.0
@@ -3368,6 +3424,11 @@ snapshots:
dependencies:
is-number: 7.0.0
token-types@6.0.4:
dependencies:
'@tokenizer/token': 0.3.0
ieee754: 1.2.1
totalist@3.0.1: {}
ts-api-utils@2.1.0(typescript@5.9.2):
@@ -3425,6 +3486,8 @@ snapshots:
uglify-js@3.19.3:
optional: true
uint8array-extras@1.4.0: {}
undici-types@6.21.0: {}
undici-types@7.10.0: {}
@@ -0,0 +1,8 @@
{
"type": "module",
"main": "./dist/index.cjs",
"module": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": "./dist/index.js",
"private": true
}
+124
View File
@@ -12855,6 +12855,72 @@
}
}
},
"/api/v1/extraction/run": {
"post": {
"tags": ["LlamaExtract"],
"summary": "Extract Stateless",
"description": "Stateless extraction endpoint that uses a default extraction agent in the user's default project. Requires data_schema, config, and either file_id, text, or base64 encoded file data.",
"operationId": "extract_stateless_api_v1_extraction_run_post",
"security": [
{
"HTTPBearer": []
},
{
"HTTPBearer": []
}
],
"parameters": [
{
"name": "session",
"in": "cookie",
"required": false,
"schema": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Session"
}
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/StatelessExtractionRequest"
}
}
}
},
"responses": {
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ExtractJob"
}
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HTTPValidationError"
}
}
}
}
}
}
},
"/api/v1/extraction/jobs": {
"get": {
"tags": ["LlamaExtract"],
@@ -35483,6 +35549,64 @@
"title": "WebhookConfiguration",
"description": "Allows the user to configure webhook options for notifications and callbacks."
},
"StatelessExtractionRequest": {
"type": "object",
"required": ["data_schema"],
"properties": {
"data_schema": {
"anyOf": [
{
"additionalProperties": {
"anyOf": [
{
"additionalProperties": true,
"type": "object"
},
{
"items": {},
"type": "array"
},
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
},
"type": "object"
},
{
"type": "string"
},
{
"type": "null"
}
]
},
"config": {
"$ref": "#/components/schemas/ExtractConfig",
"description": "The configuration parameters for the extraction agent."
},
"file_id": {
"type": "string",
"format": "uuid",
"title": "File Id",
"description": "ID of an uploaded file to extract from"
}
},
"title": "StatelessExtractionRequest",
"description": "Request body for stateless extraction. Must include either file_id, text, or base64."
},
"llama_index__core__base__llms__types__ChatMessage": {
"properties": {
"role": {
+15 -2
View File
@@ -1,6 +1,6 @@
{
"name": "llama-cloud-services",
"version": "0.2.0",
"version": "0.3.0",
"type": "module",
"license": "MIT",
"scripts": {
@@ -20,7 +20,8 @@
"./api",
"./reader",
"./parse",
"./beta/agent"
"./beta/agent",
"./extract"
],
"exports": {
"./openapi.json": "./openapi.json",
@@ -68,6 +69,17 @@
},
"default": "./parse/dist/index.js"
},
"./extract": {
"require": {
"types": "./extract/dist/index.d.cts",
"default": "./extract/dist/index.cjs"
},
"import": {
"types": "./extract/dist/index.d.ts",
"default": "./extract/dist/index.js"
},
"default": "./extract/dist/index.js"
},
".": {
"require": {
"types": "./dist/index.d.cts",
@@ -113,6 +125,7 @@
},
"dependencies": {
"ajv": "^8.17.1",
"file-type": "^21.0.0",
"p-retry": "^6.2.1",
"zod": "^3.25.76"
},
+225
View File
@@ -0,0 +1,225 @@
import { createClient, createConfig, type Client } from "@hey-api/client-fetch";
import { File } from "buffer";
import * as extract from "./extract";
import type { ExtractAgent, ExtractConfig } from "./extract";
import { getEnv } from "@llamaindex/env";
import type { ExtractResult } from "./type";
/** API base URL for each supported region. */
const URLS = {
  us: "https://api.cloud.llamaindex.ai",
  eu: "https://api.cloud.eu.llamaindex.ai",
  "us-staging": "https://api.staging.llamaindex.ai",
} as const;

/**
 * Resolves the API base URL to use.
 *
 * An explicit `baseUrl` always wins. Otherwise the region is looked up in
 * {@link URLS}, defaulting to `us` when no region is given.
 *
 * @throws Error when `region` is set but is not a supported region.
 */
function getUrl(baseUrl: string | undefined, region: string | undefined) {
  if (baseUrl !== undefined) {
    return baseUrl;
  }
  switch (region) {
    case undefined:
      return URLS.us;
    case "us":
    case "eu":
    case "us-staging":
      return URLS[region];
    default:
      throw new Error(`Unsupported region: ${region}`);
  }
}
/**
 * Handle for one extraction agent, bound to the API client that loaded it.
 *
 * Exposes the agent's `id`, `name` and `dataSchema`, and runs extractions
 * against that agent through {@link LlamaExtractAgent.extract}.
 */
export class LlamaExtractAgent {
  private agent: ExtractAgent;
  private client: Client;
  id: string;
  name: string;
  dataSchema: Record<
    string,
    string | number | boolean | Record<string, unknown> | unknown[] | null
  >;

  /**
   * @param agent The agent record; its `id`, `name` and `data_schema` are copied onto this instance.
   * @param client The API client used for every request made by this agent.
   */
  constructor(agent: ExtractAgent, client: Client) {
    const { id, name, data_schema: schema } = agent;
    this.agent = agent;
    this.client = client;
    this.id = id;
    this.name = name;
    this.dataSchema = schema;
  }

  /**
   * Runs an extraction with this agent by delegating to `extract.extract`,
   * passing the agent id and bound client along with every argument unchanged.
   *
   * The input is given either as `filePath` or as `fileContent` (optionally
   * with a `fileName`); how the two are prioritised is decided by the
   * delegate, which is not visible here.
   *
   * @returns The extraction result, or `undefined` if the delegate yields none.
   */
  async extract(
    filePath: string | undefined = undefined,
    fileContent:
      | Buffer<ArrayBufferLike>
      | Uint8Array<ArrayBuffer>
      | string
      | File
      | undefined = undefined,
    fileName: string | undefined = undefined,
    project_id: string | null = null,
    organization_id: string | null = null,
    fromUi: boolean | undefined = undefined,
    pollingInterval: number = 1,
    maxPollingIterations: number = 1800,
    maxRetriesOnError: number = 10,
    retryInterval: number = 0.5,
  ): Promise<ExtractResult | undefined> {
    const outcome = await extract.extract(
      this.agent.id,
      filePath,
      fileContent,
      fileName,
      project_id,
      organization_id,
      this.client,
      fromUi,
      pollingInterval,
      maxPollingIterations,
      maxRetriesOnError,
      retryInterval,
    );
    return outcome;
  }
}
/**
 * Entry point for LlamaExtract: manages extraction agents and runs stateless
 * (agent-less) extractions through a single authenticated API client.
 */
export class LlamaExtract {
  private client: Client;

  /**
   * @param apiKey API key; falls back to the `LLAMA_CLOUD_API_KEY` environment variable.
   * @param baseUrl Explicit API base URL; takes precedence over `region`.
   * @param region Region used to pick the base URL when `baseUrl` is not given.
   * @throws Error when no API key is passed and none is found in the environment.
   */
  constructor(
    apiKey: string | undefined = undefined,
    baseUrl: string | undefined = undefined,
    region: string | undefined = undefined,
  ) {
    const key = apiKey ?? getEnv("LLAMA_CLOUD_API_KEY");
    if (typeof key === "undefined") {
      throw new Error(
        "No API key provided and no API key found in environment. Please pass the API key or set `LLAMA_CLOUD_API_KEY` as an environment variable.",
      );
    }
    const url = getUrl(baseUrl, region);
    this.client = createClient(
      createConfig({
        baseUrl: url,
        headers: {
          Authorization: `Bearer ${key}`,
        },
      }),
    );
  }

  /**
   * Creates an extraction agent by delegating to `extract.createAgent`.
   *
   * @param name Name of the new agent.
   * @param dataSchema Schema of the data to extract, as an object or a string.
   * @returns A handle for the new agent, or `undefined` if the delegate returns none.
   */
  async createAgent(
    name: string,
    dataSchema:
      | {
          [key: string]:
            | { [key: string]: unknown }
            | Array<unknown>
            | string
            | number
            | number
            | boolean
            | null;
        }
      | string,
    config: ExtractConfig | undefined = undefined,
    project_id: string | null = null,
    organization_id: string | null = null,
    maxRetriesOnError: number = 10,
    retryInterval: number = 0.5,
  ): Promise<LlamaExtractAgent | undefined> {
    const agent = await extract.createAgent(
      name,
      dataSchema,
      config,
      project_id,
      organization_id,
      this.client,
      maxRetriesOnError,
      retryInterval,
    );
    if (typeof agent !== "undefined") {
      return new LlamaExtractAgent(agent, this.client);
    }
    return undefined;
  }

  /**
   * Looks up an existing agent by name and/or id via `extract.getAgent`.
   *
   * @returns A handle for the agent, or `undefined` if the delegate returns none.
   */
  async getAgent(
    name: string | undefined = undefined,
    id: string | undefined = undefined,
    project_id: string | null = null,
    organization_id: string | null = null,
    maxRetriesOnError: number = 10,
    retryInterval: number = 0.5,
  ): Promise<LlamaExtractAgent | undefined> {
    // NOTE(review): the delegate receives (id, name, ...), the reverse of this
    // method's (name, id, ...) order — confirm it matches its signature.
    const agent = await extract.getAgent(
      id,
      name,
      project_id,
      organization_id,
      this.client,
      maxRetriesOnError,
      retryInterval,
    );
    if (typeof agent !== "undefined") {
      return new LlamaExtractAgent(agent, this.client);
    }
    return undefined;
  }

  /**
   * Deletes the agent with the given id via `extract.deleteAgent`.
   *
   * @param retryInterval Delay between retries. The default is 0.5, the same
   *   as every other method of this class; it was previously 500, which looks
   *   like a leftover from a different unit (presumably milliseconds — confirm
   *   against `extract.deleteAgent`).
   * @returns Whatever the delegate reports (`boolean` or `undefined`).
   */
  async deleteAgent(
    id: string,
    maxRetriesOnError: number = 10,
    retryInterval: number = 0.5,
  ): Promise<boolean | undefined> {
    return await extract.deleteAgent(
      id,
      this.client,
      maxRetriesOnError,
      retryInterval,
    );
  }

  /**
   * Runs a stateless extraction (no pre-created agent) by delegating to
   * `extract.extractStateless` with every argument passed through unchanged.
   *
   * @param dataSchema Schema of the data to extract, as an object or a string.
   * @param config Extraction configuration.
   * @param filePath Path of the file to extract from.
   * @param fileContent In-memory file content, as an alternative to `filePath`.
   * @param fileName Name to associate with `fileContent`.
   * @returns The extraction result, or `undefined` if the delegate yields none.
   */
  async extract(
    dataSchema:
      | {
          [key: string]:
            | { [key: string]: unknown }
            | Array<unknown>
            | string
            | number
            | number
            | boolean
            | null;
        }
      | string,
    config: ExtractConfig | undefined = undefined,
    filePath: string | undefined = undefined,
    fileContent:
      | Buffer<ArrayBufferLike>
      | Uint8Array<ArrayBuffer>
      | string
      | File
      | undefined = undefined,
    fileName: string | undefined = undefined,
    project_id: string | null = null,
    organization_id: string | null = null,
    pollingInterval: number = 1,
    maxPollingIterations: number = 1800,
    maxRetriesOnError: number = 10,
    retryInterval: number = 0.5,
  ): Promise<ExtractResult | undefined> {
    return await extract.extractStateless(
      dataSchema,
      config,
      filePath,
      fileContent,
      fileName,
      project_id,
      organization_id,
      this.client,
      pollingInterval,
      maxPollingIterations,
      maxRetriesOnError,
      retryInterval,
    );
  }
}
@@ -18644,6 +18644,66 @@ export const WebhookConfigurationSchema = {
"Allows the user to configure webhook options for notifications and callbacks.",
} as const;
/**
 * JSON-schema description of the stateless extraction request body.
 *
 * Only `data_schema` is required; `config` and `file_id` are optional.
 * NOTE(review): the `description` text mentions `text` and base64 inputs, but
 * only `file_id` is declared under `properties` here — confirm against the API.
 */
export const StatelessExtractionRequestSchema = {
type: "object",
required: ["data_schema"],
properties: {
// Either an object whose values are arbitrary JSON values, a string, or null.
data_schema: {
anyOf: [
{
additionalProperties: {
anyOf: [
{
additionalProperties: true,
type: "object",
},
{
items: {},
type: "array",
},
{
type: "string",
},
{
type: "integer",
},
{
type: "number",
},
{
type: "boolean",
},
{
type: "null",
},
],
},
type: "object",
},
{
type: "string",
},
{
type: "null",
},
],
},
config: {
$ref: "#/components/schemas/ExtractConfig",
description: "The configuration parameters for the extraction agent.",
},
file_id: {
type: "string",
format: "uuid",
title: "File Id",
description: "ID of an uploaded file to extract from",
},
},
title: "StatelessExtractionRequest",
description:
"Request body for stateless extraction. Must include either file_id, text, or base64.",
} as const;
export const llama_index__core__base__llms__types__ChatMessageSchema = {
properties: {
role: {
@@ -452,6 +452,9 @@ import type {
UpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdPutData,
UpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdPutResponse,
UpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdPutError,
ExtractStatelessApiV1ExtractionRunPostData,
ExtractStatelessApiV1ExtractionRunPostResponse,
ExtractStatelessApiV1ExtractionRunPostError,
ListJobsApiV1ExtractionJobsGetData,
ListJobsApiV1ExtractionJobsGetResponse,
ListJobsApiV1ExtractionJobsGetError,
@@ -5701,6 +5704,39 @@ export const updateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgent
});
};
/**
* Extract Stateless
* Stateless extraction endpoint that uses a default extraction agent in the user's default project. Requires data_schema, config, and either file_id, text, or base64 encoded file data.
*
* Sends a POST to `/api/v1/extraction/run` using `options.client` when given,
* otherwise the module-level `_heyApiClient`.
*/
export const extractStatelessApiV1ExtractionRunPost = <
ThrowOnError extends boolean = false,
>(
options: Options<ExtractStatelessApiV1ExtractionRunPostData, ThrowOnError>,
) => {
return (options.client ?? _heyApiClient).post<
ExtractStatelessApiV1ExtractionRunPostResponse,
ExtractStatelessApiV1ExtractionRunPostError,
ThrowOnError
>({
security: [
{
scheme: "bearer",
type: "http",
},
{
scheme: "bearer",
type: "http",
},
],
url: "/api/v1/extraction/run",
// Caller options are spread after the defaults above, so they can override them.
...options,
// `headers` is rebuilt last: JSON content type by default, with any
// caller-supplied headers layered on top (and able to override it).
headers: {
"Content-Type": "application/json",
...options?.headers,
},
});
};
/**
* List Jobs
*/
@@ -8272,6 +8272,35 @@ export type WebhookConfiguration = {
> | null;
};
/**
* Request body for stateless extraction. Must include either file_id, text, or base64.
*
* NOTE(review): only `file_id` is declared on this type; `text` and `base64`
* are mentioned above but have no corresponding field here — confirm against
* the upstream API specification.
*/
export type StatelessExtractionRequest = {
/**
* Extraction schema: an object mapping keys to JSON-like values, a string, or null.
* The repeated `number` member is redundant at the type level.
*/
data_schema:
| {
[key: string]:
| {
[key: string]: unknown;
}
| Array<unknown>
| string
| number
| number
| boolean
| null;
}
| string
| null;
/**
* The configuration parameters for the extraction agent.
*/
config?: ExtractConfig;
/**
* ID of an uploaded file to extract from
*/
file_id?: string;
};
/**
* Chat message.
*/
@@ -13078,6 +13107,33 @@ export type UpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentI
export type UpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdPutResponse =
UpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdPutResponses[keyof UpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdPutResponses];
/**
* Request shape for the stateless extraction endpoint: a JSON body and no
* path or query parameters.
*/
export type ExtractStatelessApiV1ExtractionRunPostData = {
body: StatelessExtractionRequest;
path?: never;
query?: never;
url: "/api/v1/extraction/run";
};
/**
* Error payloads of the stateless extraction endpoint, keyed by HTTP status.
*/
export type ExtractStatelessApiV1ExtractionRunPostErrors = {
/**
* Validation Error
*/
422: HttpValidationError;
};
/**
* Union of every error payload declared for the stateless extraction endpoint.
*/
export type ExtractStatelessApiV1ExtractionRunPostError =
ExtractStatelessApiV1ExtractionRunPostErrors[keyof ExtractStatelessApiV1ExtractionRunPostErrors];
/**
* Success payloads of the stateless extraction endpoint, keyed by HTTP status.
*/
export type ExtractStatelessApiV1ExtractionRunPostResponses = {
/**
* Successful Response
*/
200: ExtractJob;
};
/**
* Union of every success payload declared for the stateless extraction endpoint.
*/
export type ExtractStatelessApiV1ExtractionRunPostResponse =
ExtractStatelessApiV1ExtractionRunPostResponses[keyof ExtractStatelessApiV1ExtractionRunPostResponses];
export type ListJobsApiV1ExtractionJobsGetData = {
body?: never;
path?: never;
@@ -3472,6 +3472,12 @@ export const zUserOrganizationRoleCreate = z.object({
role_id: z.string().uuid(),
});
/**
 * Runtime validator for the stateless extraction request body.
 *
 * `data_schema` accepts an arbitrary string-keyed object, a string, or null.
 * A record schema is used for the object case on purpose: `z.object({})`
 * strips unrecognized keys while parsing, which would reduce every
 * user-supplied schema object to `{}`.
 */
export const zStatelessExtractionRequest = z.object({
  data_schema: z.union([
    z.record(z.string(), z.unknown()),
    z.string(),
    z.null(),
  ]),
  config: zExtractConfig.optional(),
  file_id: z.string().uuid().optional(),
});
export const zListKeysApiV1ApiKeysGetResponse = z.array(zApiKey);
export const zGenerateKeyApiV1ApiKeysPostResponse = zApiKey;
@@ -3829,6 +3835,8 @@ export const zGetExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentId
export const zUpdateExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdPutResponse =
zExtractAgent;
// Response validator for the stateless extraction endpoint: the same schema as an extract job.
export const zExtractStatelessApiV1ExtractionRunPostResponse = zExtractJob;
export const zListJobsApiV1ExtractionJobsGetResponse = z.array(zExtractJob);
export const zRunJobApiV1ExtractionJobsPostResponse = zExtractJob;
+651
View File
@@ -0,0 +1,651 @@
import { emitWarning } from "process";
import fs from "fs/promises";
import { Blob } from "buffer";
import * as path from "path";
import type { ExtractResult } from "./type";
import { randomUUID } from "@llamaindex/env";
import { File } from "buffer";
import {
type Options,
type ExtractAgentCreate,
type ExtractConfig,
type ExtractJobCreate,
type ExtractAgent,
type ExtractJob,
type CreateExtractionAgentApiV1ExtractionExtractionAgentsPostData,
type GetExtractionAgentByNameApiV1ExtractionExtractionAgentsByNameNameGetData,
type GetExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdGetData,
type RunJobApiV1ExtractionJobsPostData,
type GetJobApiV1ExtractionJobsJobIdGetData,
type GetJobResultApiV1ExtractionJobsJobIdResultGetData,
StatusEnum,
type UploadFileApiV1FilesPostData,
type StatelessExtractionRequest,
type ExtractStatelessApiV1ExtractionRunPostData,
type DeleteExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdDeleteData,
createExtractionAgentApiV1ExtractionExtractionAgentsPost,
getExtractionAgentByNameApiV1ExtractionExtractionAgentsByNameNameGet,
getExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdGet,
runJobApiV1ExtractionJobsPost,
getJobApiV1ExtractionJobsJobIdGet,
getJobResultApiV1ExtractionJobsJobIdResultGet,
uploadFileApiV1FilesPost,
extractStatelessApiV1ExtractionRunPost,
deleteExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdDelete,
} from "./api";
import type { Client } from "@hey-api/client-fetch";
import { sleep } from "./utils";
import { fileTypeFromBuffer } from "file-type";
/**
* Body of a file-upload request as built in this module: the file to upload
* is carried under the `upload_file` key.
*/
type BodyUploadFileApiV1FilesPost = {
upload_file: Blob | File;
};
/**
 * Creates an extraction agent, retrying for as long as the API answers with a
 * non-OK response.
 *
 * @param name - Name given to the new agent.
 * @param dataSchema - Extraction schema, sent as `data_schema`.
 * @param config - Extraction configuration, sent as `config`.
 * @param project_id - Forwarded as the `project_id` query parameter.
 * @param organization_id - Forwarded as the `organization_id` query parameter.
 * @param client - Optional client used instead of the default one.
 * @param maxRetriesOnError - Number of failed attempts tolerated before giving up.
 * @param retryInterval - Pause after a failed attempt, in seconds.
 * @returns The data of the first OK response.
 * @throws Error once more than `maxRetriesOnError` attempts have failed.
 */
export async function createAgent(
  name: string,
  dataSchema:
    | {
        [key: string]:
          | { [key: string]: unknown }
          | Array<unknown>
          | string
          | number
          | number
          | boolean
          | null;
      }
    | string,
  config: ExtractConfig = {} as ExtractConfig,
  project_id: string | null = null,
  organization_id: string | null = null,
  client: Client | undefined = undefined,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<ExtractAgent | undefined> {
  const payload = {
    name,
    data_schema: dataSchema,
    config,
  } as ExtractAgentCreate;
  const requestOptions = {
    body: payload,
    query: { project_id, organization_id },
  } as CreateExtractionAgentApiV1ExtractionExtractionAgentsPostData as Options<CreateExtractionAgentApiV1ExtractionExtractionAgentsPostData>;
  if (client !== undefined) {
    requestOptions.client = client;
  }

  for (let failedAttempts = 0; ; failedAttempts++) {
    if (failedAttempts > maxRetriesOnError) {
      throw new Error(
        "Error while creating the agent: Exceeded maximum number of retries, the API keeps returning errors.",
      );
    }
    const result =
      await createExtractionAgentApiV1ExtractionExtractionAgentsPost(
        requestOptions,
      );
    if (result.response.ok) {
      return result.data as ExtractAgent;
    }
    if ("error" in result) {
      console.log(
        `An error occurred while creating the extraction agent.\nDetails:\n\n${JSON.stringify(
          result.error,
        )}\n\nRetrying...`,
      );
    }
    // Back off before the next attempt (interval is expressed in seconds).
    await sleep(retryInterval * 1000);
  }
}
/**
 * Runs `requestAgent` until it yields an OK response, logging each failure
 * and pausing `retryInterval` seconds between attempts.
 *
 * @param requestAgent - Performs one lookup request.
 * @param lookupKind - Which lookup is being performed; only used in the log message.
 * @param maxRetriesOnError - Number of failed attempts tolerated before giving up.
 * @param retryInterval - Pause after a failed attempt, in seconds.
 * @throws Error once more than `maxRetriesOnError` attempts have failed.
 */
async function fetchAgentWithRetries(
  requestAgent: () => Promise<{
    data?: unknown;
    error?: unknown;
    response: Response;
  }>,
  lookupKind: "ID" | "name",
  maxRetriesOnError: number,
  retryInterval: number,
): Promise<ExtractAgent> {
  for (let retries = 0; ; retries++) {
    if (retries > maxRetriesOnError) {
      throw new Error(
        "Error while getting the agent: Exceeded maximum number of retries, the API keeps returning errors.",
      );
    }
    const result = await requestAgent();
    if (result.response.ok) {
      return result.data as ExtractAgent;
    }
    if ("error" in result) {
      console.log(
        `An error occurred while getting the extraction agent by ${lookupKind}.\nDetails:\n\n${JSON.stringify(
          result.error,
        )}\n\nRetrying...`,
      );
    }
    await sleep(retryInterval * 1000);
  }
}

/**
 * Fetches an extraction agent by ID or by name.
 *
 * When both `id` and `name` are given, a warning is emitted and the lookup is
 * done by ID. `project_id` and `organization_id` are only sent for lookups by
 * name.
 *
 * @param id - Agent ID; takes precedence over `name`.
 * @param name - Agent name; used only when `id` is not provided.
 * @param project_id - Forwarded as the `project_id` query parameter (name lookup).
 * @param organization_id - Forwarded as the `organization_id` query parameter (name lookup).
 * @param client - Optional client used instead of the default one.
 * @param maxRetriesOnError - Number of failed attempts tolerated before giving up.
 * @param retryInterval - Pause after a failed attempt, in seconds.
 * @returns The data of the first OK response.
 * @throws Error when neither `id` nor `name` is given, or when retries are exhausted.
 */
export async function getAgent(
  id: string | undefined = undefined,
  name: string | undefined = undefined,
  project_id: string | null = null,
  organization_id: string | null = null,
  client: Client | undefined = undefined,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<ExtractAgent | undefined> {
  if (typeof id === "undefined" && typeof name === "undefined") {
    throw new Error("One of `id` and `name` must be passed.");
  }

  if (typeof id !== "undefined") {
    if (typeof name !== "undefined") {
      emitWarning("You passed both `id` and `name`, using only id...");
    }
    const byIdOptions = {
      path: { extraction_agent_id: id },
    } as GetExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdGetData as Options<GetExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdGetData>;
    if (typeof client !== "undefined") {
      byIdOptions.client = client;
    }
    return fetchAgentWithRetries(
      () =>
        getExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdGet(
          byIdOptions,
        ),
      "ID",
      maxRetriesOnError,
      retryInterval,
    );
  }

  // Only `name` was provided.
  const byNameOptions = {
    path: { name: name },
    query: { organization_id: organization_id, project_id: project_id },
  } as GetExtractionAgentByNameApiV1ExtractionExtractionAgentsByNameNameGetData as Options<GetExtractionAgentByNameApiV1ExtractionExtractionAgentsByNameNameGetData>;
  if (typeof client !== "undefined") {
    byNameOptions.client = client;
  }
  return fetchAgentWithRetries(
    () =>
      getExtractionAgentByNameApiV1ExtractionExtractionAgentsByNameNameGet(
        byNameOptions,
      ),
    "name",
    maxRetriesOnError,
    retryInterval,
  );
}
/**
 * Wraps a piece of text in a `File`.
 *
 * @param text - Content of the file.
 * @param fileName - Name of the file; when null, a name of the form
 *   `uploadedFile_<uuid with underscores>.txt` is generated.
 * @returns A `File` holding `text`.
 */
function textToFile(text: string, fileName: string | null = null) {
  if (fileName !== null) {
    return new File([text], fileName);
  }
  const uniqueSuffix = randomUUID().replaceAll("-", "_");
  return new File([text], "uploadedFile_" + uniqueSuffix + ".txt");
}
/**
 * Uploads a file and returns the ID assigned to it by the API.
 *
 * The file is taken from `filePath` when given (read from disk), otherwise
 * from `fileContent`. Binary content without a `fileName` gets a generated
 * name whose extension is sniffed from the bytes, falling back to `pdf`.
 *
 * @param filePath - Path of a file to read and upload; wins over `fileContent`.
 * @param fileContent - In-memory content: a `File`, raw bytes, or text.
 * @param fileName - Name for the uploaded file; defaults to the base name of
 *   `filePath` or to a generated name.
 * @param project_id - Forwarded as the `project_id` query parameter.
 * @param organization_id - Forwarded as the `organization_id` query parameter.
 * @param client - Optional client used instead of the default one.
 * @param maxRetriesOnError - Number of failed attempts tolerated before giving up.
 * @param retryInterval - Pause after a failed attempt, in seconds.
 * @returns The `id` of the uploaded file.
 * @throws Error when no input is provided, the content type is unsupported,
 *   or retries are exhausted.
 */
async function uploadFile(
  filePath: string | undefined = undefined,
  fileContent:
    | Buffer<ArrayBufferLike>
    | File
    | Uint8Array<ArrayBuffer>
    | string
    | undefined = undefined,
  fileName: string | undefined = undefined,
  project_id: string | null = null,
  organization_id: string | null = null,
  client: Client | undefined = undefined,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<string | undefined> {
  let file: File;
  if (typeof filePath !== "undefined") {
    const buffer = await fs.readFile(filePath);
    file = new File(
      [new Uint8Array(buffer)],
      fileName ?? path.basename(filePath),
    );
  } else if (typeof fileContent === "undefined") {
    throw new Error(
      "One between filePath and fileContent needs to be provided",
    );
  } else if (fileContent instanceof File) {
    file = fileContent;
  } else if (fileContent instanceof Uint8Array) {
    // Node's Buffer is a Uint8Array subclass, so this branch covers both.
    const fileType = await fileTypeFromBuffer(fileContent);
    const ext = fileType?.ext ?? "pdf";
    file = new File(
      [new Uint8Array(fileContent)],
      fileName ??
        "uploadedFile_" + randomUUID().replaceAll("-", "_") + "." + ext,
    );
  } else if (typeof fileContent === "string") {
    file = textToFile(fileContent, fileName);
  } else {
    throw new Error("Unsupported fileContent type");
  }

  const fileToUpload = {
    upload_file: file,
  } as BodyUploadFileApiV1FilesPost;
  const uploadOptions = {
    body: fileToUpload,
    query: { organization_id: organization_id, project_id: project_id },
  } as UploadFileApiV1FilesPostData as Options<UploadFileApiV1FilesPostData>;
  if (typeof client !== "undefined") {
    uploadOptions.client = client;
  }

  for (let retries = 0; ; retries++) {
    if (retries > maxRetriesOnError) {
      throw new Error(
        "Error while processing your file: Exceeded maximum number of retries, the API keeps returning errors.",
      );
    }
    const uploadResponse = await uploadFileApiV1FilesPost(uploadOptions);
    if (typeof uploadResponse.data !== "undefined") {
      return uploadResponse.data.id as string;
    }
    // No payload: count it as a failed attempt even when the HTTP status was
    // OK, so the loop always sleeps and always terminates.
    if ("error" in uploadResponse) {
      console.log(
        `An error occurred while uploading the file.\nDetails:\n\n${JSON.stringify(
          uploadResponse.error,
        )}\n\nRetrying...`,
      );
    }
    await sleep(retryInterval * 1000);
  }
}
/**
 * Submits an extraction job and returns its ID, retrying on API errors and
 * when the job comes back already cancelled or errored.
 *
 * @param options - Request options for either the agent-based job endpoint or
 *   the stateless extraction endpoint.
 * @param stateless - When true, the stateless endpoint is called; otherwise
 *   the agent-based job endpoint is called.
 * @param maxRetriesOnError - Number of failed attempts tolerated before giving up.
 * @param retryInterval - Pause after a failed attempt, in seconds.
 * @returns The `id` of the created job.
 * @throws Error once more than `maxRetriesOnError` attempts have failed.
 */
async function createExtractJob(
  options:
    | Options<RunJobApiV1ExtractionJobsPostData>
    | Options<ExtractStatelessApiV1ExtractionRunPostData>,
  stateless: boolean = false,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<string | undefined> {
  type JobResponse = {
    data: ExtractJob | undefined;
    request: Request;
    response: Response;
  };

  for (let retries = 0; ; retries++) {
    if (retries > maxRetriesOnError) {
      throw new Error(
        "Error while creating the extraction job: Exceeded maximum number of retries, the API keeps returning errors.",
      );
    }

    const response: JobResponse = stateless
      ? ((await extractStatelessApiV1ExtractionRunPost(
          options as Options<ExtractStatelessApiV1ExtractionRunPostData>,
        )) as JobResponse)
      : ((await runJobApiV1ExtractionJobsPost(
          options as Options<RunJobApiV1ExtractionJobsPostData>,
        )) as JobResponse);

    const job = response.data;
    if (typeof job !== "undefined") {
      const jobStatus = job.status as StatusEnum;
      if (
        jobStatus !== StatusEnum.CANCELLED &&
        jobStatus !== StatusEnum.ERROR
      ) {
        return job.id as string;
      }
      console.log(
        `The extraction job was created with status ${jobStatus}.\nRetrying...`,
      );
    } else if ("error" in response) {
      console.log(
        "An error occurred: ",
        JSON.stringify(response.error),
        "\nRetrying...",
      );
    }
    // Every path that did not return is a failed attempt: it is counted by
    // the loop header and followed by a pause, so the loop cannot spin.
    await sleep(retryInterval * 1000);
  }
}
/**
 * Polls an extraction job until it succeeds, fails, or the polling budget is
 * spent.
 *
 * Every poll that does not observe a terminal status — including polls that
 * fail at the HTTP level or return no payload — counts as one iteration and
 * is followed by a pause, so failures never turn into a tight request loop.
 *
 * @param jobId - ID of the job to poll.
 * @param interval - Pause between polls, in seconds.
 * @param maxIterations - Number of non-terminal polls tolerated.
 * @param client - Optional client used instead of the default one.
 * @returns `true` when the job reached `SUCCESS`, `false` when the budget ran out.
 * @throws Error when the job reports `CANCELLED` or `ERROR`.
 */
async function pollForJobCompletion(
  jobId: string,
  interval: number = 1,
  maxIterations: number = 1800,
  client: Client | undefined = undefined,
): Promise<boolean> {
  const jobOptions = {
    path: { job_id: jobId },
  } as GetJobApiV1ExtractionJobsJobIdGetData as Options<GetJobApiV1ExtractionJobsJobIdGetData>;
  if (typeof client !== "undefined") {
    jobOptions.client = client;
  }

  for (let numIterations = 0; ; numIterations++) {
    if (numIterations > maxIterations) {
      return false;
    }
    const response = await getJobApiV1ExtractionJobsJobIdGet(jobOptions);
    const status: StatusEnum | undefined =
      typeof response.data !== "undefined"
        ? (response.data.status as StatusEnum)
        : undefined;

    if (status === StatusEnum.SUCCESS) {
      return true;
    }
    if (status === StatusEnum.CANCELLED || status === StatusEnum.ERROR) {
      throw new Error("There was an error extracting data from your file.");
    }
    // Still running, or the poll itself failed: wait before asking again.
    await sleep(interval * 1000);
  }
}
/**
 * Fetches the result of an extraction job, retrying while no result payload
 * is returned.
 *
 * @param jobId - ID of the job whose result is requested.
 * @param client - Optional client used instead of the default one.
 * @param project_id - Forwarded as the `project_id` query parameter.
 * @param organization_id - Forwarded as the `organization_id` query parameter.
 * @param maxRetriesOnError - Number of failed attempts tolerated before giving up.
 * @param retryInterval - Pause after a failed attempt, in seconds.
 * @returns The job's `data` together with its `extraction_metadata`
 *   (exposed as `extractionMetadata`).
 * @throws Error once more than `maxRetriesOnError` attempts have failed.
 */
async function getJobResult(
  jobId: string,
  client: Client | undefined = undefined,
  project_id: string | null = null,
  organization_id: string | null = null,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<ExtractResult | undefined> {
  const jobOptions = {
    path: { job_id: jobId },
    query: { organization_id: organization_id, project_id: project_id },
  } as GetJobResultApiV1ExtractionJobsJobIdResultGetData as Options<GetJobResultApiV1ExtractionJobsJobIdResultGetData>;
  if (typeof client !== "undefined") {
    jobOptions.client = client;
  }

  for (let retries = 0; ; retries++) {
    if (retries > maxRetriesOnError) {
      throw new Error(
        "Error while getting the result of the extraction job: Exceeded maximum number of retries, the API keeps returning errors.",
      );
    }
    const response =
      await getJobResultApiV1ExtractionJobsJobIdResultGet(jobOptions);
    if (typeof response.data !== "undefined") {
      return {
        data: response.data.data,
        extractionMetadata: response.data.extraction_metadata,
      } as ExtractResult;
    }
    // No payload: count it as a failed attempt even when the HTTP status was
    // OK, so the loop always sleeps and always terminates.
    if ("error" in response) {
      console.log(
        "An error occurred: ",
        JSON.stringify(response.error),
        "\nRetrying...",
      );
    }
    await sleep(retryInterval * 1000);
  }
}
/**
 * Runs an extraction with an existing agent: uploads the file, creates the
 * job, polls it to completion and returns its result.
 *
 * @param agentId - ID of the extraction agent to run.
 * @param filePath - Path of the file to upload.
 * @param fileContent - In-memory content to upload when no path is given.
 * @param fileName - Name for the uploaded file.
 * @param project_id - Forwarded to the upload and result requests.
 * @param organization_id - Forwarded to the upload and result requests.
 * @param client - Optional client used instead of the default one.
 * @param fromUi - Forwarded as the `from_ui` query parameter of the job request.
 * @param pollingInterval - Pause between status polls, in seconds.
 * @param maxPollingIterations - Number of status polls tolerated before timing out.
 * @param maxRetriesOnError - Failed attempts tolerated per API call.
 * @param retryInterval - Pause after a failed API call, in seconds.
 * @returns The extracted data and its metadata.
 * @throws Error when the job fails, when the polling budget is exhausted, or
 *   when an API call keeps failing.
 */
export async function extract(
  agentId: string,
  filePath: string | undefined = undefined,
  fileContent:
    | Buffer<ArrayBufferLike>
    | File
    | Uint8Array<ArrayBuffer>
    | string
    | undefined = undefined,
  fileName: string | undefined = undefined,
  project_id: string | null = null,
  organization_id: string | null = null,
  client: Client | undefined = undefined,
  fromUi: boolean | undefined = undefined,
  pollingInterval: number = 1,
  maxPollingIterations: number = 1800,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<ExtractResult | undefined> {
  const fileId = (await uploadFile(
    filePath,
    fileContent,
    fileName,
    project_id,
    organization_id,
    client,
    maxRetriesOnError,
    retryInterval,
  )) as string;

  const extractJobCreate = {
    extraction_agent_id: agentId,
    file_id: fileId,
  } as ExtractJobCreate;
  const extractOptions = {
    body: extractJobCreate,
    query: { from_ui: fromUi },
  } as RunJobApiV1ExtractionJobsPostData as Options<RunJobApiV1ExtractionJobsPostData>;
  if (typeof client !== "undefined") {
    extractOptions.client = client;
  }

  const jobId = (await createExtractJob(
    extractOptions,
    false,
    maxRetriesOnError,
    retryInterval,
  )) as string;

  const success = await pollForJobCompletion(
    jobId,
    pollingInterval,
    maxPollingIterations,
    client,
  );
  if (!success) {
    // Report the budget actually configured instead of a fixed duration.
    const budgetSeconds = pollingInterval * maxPollingIterations;
    throw new Error(
      `Your job is taking longer than ${budgetSeconds} seconds, timing out...`,
    );
  }

  return (await getJobResult(
    jobId,
    client,
    project_id,
    organization_id,
    maxRetriesOnError,
    retryInterval,
  )) as ExtractResult;
}
/**
 * Runs a stateless extraction (no pre-created agent): uploads the file,
 * submits the schema and config to the stateless endpoint, polls the job to
 * completion and returns its result.
 *
 * @param dataSchema - Extraction schema, sent as `data_schema`.
 * @param config - Extraction configuration, sent as `config`.
 * @param filePath - Path of the file to upload.
 * @param fileContent - In-memory content to upload when no path is given.
 * @param fileName - Name for the uploaded file.
 * @param project_id - Forwarded to the upload and result requests.
 * @param organization_id - Forwarded to the upload and result requests.
 * @param client - Optional client used instead of the default one.
 * @param pollingInterval - Pause between status polls, in seconds.
 * @param maxPollingIterations - Number of status polls tolerated before timing out.
 * @param maxRetriesOnError - Failed attempts tolerated per API call.
 * @param retryInterval - Pause after a failed API call, in seconds.
 * @returns The extracted data and its metadata.
 * @throws Error when the job fails, when the polling budget is exhausted, or
 *   when an API call keeps failing.
 */
export async function extractStateless(
  dataSchema:
    | {
        [key: string]:
          | { [key: string]: unknown }
          | Array<unknown>
          | string
          | number
          | number
          | boolean
          | null;
      }
    | string,
  config: ExtractConfig = {} as ExtractConfig,
  filePath: string | undefined = undefined,
  fileContent:
    | Buffer<ArrayBufferLike>
    | File
    | Uint8Array<ArrayBuffer>
    | string
    | undefined = undefined,
  fileName: string | undefined = undefined,
  project_id: string | null = null,
  organization_id: string | null = null,
  client: Client | undefined = undefined,
  pollingInterval: number = 1,
  maxPollingIterations: number = 1800,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<ExtractResult | undefined> {
  const fileId = (await uploadFile(
    filePath,
    fileContent,
    fileName,
    project_id,
    organization_id,
    client,
    maxRetriesOnError,
    retryInterval,
  )) as string;

  const statelessRequest = {
    data_schema: dataSchema,
    file_id: fileId,
    config: config,
  } as StatelessExtractionRequest;
  const extractOptions = {
    body: statelessRequest,
  } as ExtractStatelessApiV1ExtractionRunPostData as Options<ExtractStatelessApiV1ExtractionRunPostData>;
  if (typeof client !== "undefined") {
    extractOptions.client = client;
  }

  const jobId = (await createExtractJob(
    extractOptions,
    true,
    maxRetriesOnError,
    retryInterval,
  )) as string;

  const success = await pollForJobCompletion(
    jobId,
    pollingInterval,
    maxPollingIterations,
    client,
  );
  if (!success) {
    // Report the budget actually configured instead of a fixed duration.
    const budgetSeconds = pollingInterval * maxPollingIterations;
    throw new Error(
      `Your job is taking longer than ${budgetSeconds} seconds, timing out...`,
    );
  }

  return (await getJobResult(
    jobId,
    client,
    project_id,
    organization_id,
    maxRetriesOnError,
    retryInterval,
  )) as ExtractResult;
}
/**
 * Deletes an extraction agent, retrying for as long as the API answers with a
 * non-OK response.
 *
 * @param id - ID of the agent to delete.
 * @param client - Optional client used instead of the default one.
 * @param maxRetriesOnError - Number of failed attempts tolerated before giving up.
 * @param retryInterval - Pause after a failed attempt, in seconds.
 * @returns `true` once the API answers with an OK response.
 * @throws Error once more than `maxRetriesOnError` attempts have failed.
 */
export async function deleteAgent(
  id: string,
  client: Client | undefined = undefined,
  maxRetriesOnError: number = 10,
  retryInterval: number = 0.5,
): Promise<boolean | undefined> {
  const requestOptions = {
    path: { extraction_agent_id: id },
  } as DeleteExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdDeleteData as Options<DeleteExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdDeleteData>;
  if (client !== undefined) {
    requestOptions.client = client;
  }

  for (let failedAttempts = 0; ; failedAttempts++) {
    if (failedAttempts > maxRetriesOnError) {
      throw new Error(
        `Maximum number of attempts for deleting agent ${id} reached, but the API continues to return errors.`,
      );
    }
    const result =
      await deleteExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdDelete(
        requestOptions,
      );
    if (result.response.ok) {
      return true;
    }
    if ("error" in result) {
      console.log(
        `An error occurred while deleting the agent: ${JSON.stringify(
          result.error,
        )}\nRetrying...`,
      );
    }
    // Back off before the next attempt (interval is expressed in seconds).
    await sleep(retryInterval * 1000);
  }
}
export { type ExtractAgent, type ExtractConfig };
+2
View File
@@ -6,3 +6,5 @@ export {
} from "./LlamaCloudRetriever.js";
export type { CloudConstructorParams } from "./type.js";
export { LlamaParseReader } from "./reader.js";
export { LlamaExtract, LlamaExtractAgent } from "./LlamaExtract.js";
export type { ExtractConfig } from "./extract.js";
+41
View File
@@ -8,3 +8,44 @@ export type CloudConstructorParams = {
projectName: string;
organizationId?: string | undefined;
} & ClientParams;
/** A single JSON-like value found in an extraction payload. */
type ExtractedValue =
  | { [key: string]: unknown }
  | Array<unknown>
  | string
  | number
  | boolean
  | null;

/** A string-keyed record of JSON-like values. */
type ExtractedRecord = { [key: string]: ExtractedValue };

/**
 * Outcome of an extraction job.
 *
 * `data` is a single record, a list of records, or null;
 * `extractionMetadata` is a record of metadata values.
 */
export type ExtractResult = {
  data: ExtractedRecord | Array<ExtractedRecord> | null;
  extractionMetadata: ExtractedRecord;
};
@@ -1,8 +1,10 @@
import { describe, it, expect, beforeEach, beforeAll } from "vitest";
import { LlamaParseReader } from "../src/reader.js";
import { LlamaCloudIndex } from "../src/LlamaCloudIndex.js";
import { LlamaExtract, LlamaExtractAgent } from "../src/LlamaExtract.js";
import { Document } from "@llamaindex/core/schema";
import { fs } from "@llamaindex/env";
import { ExtractConfig } from "../src/api.js";
// Integration tests that require actual API keys and files
describe("Integration Tests", () => {
@@ -414,6 +416,121 @@ describe("Integration Tests", () => {
);
});
describe("LlamaExtract Integration", () => {
  // Fixtures shared by the tests below.
  const baseUrl = "https://api.cloud.llamaindex.ai";
  const agentName = "ExtractTestAgent";
  const sampleText =
    "**Text to extract**: Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.";
  const textSchema = {
    properties: {
      text: {
        description: "Text from the file",
        type: "string",
      },
    },
    required: ["text"],
    type: "object",
  };

  // Builds a fresh client per test; the API key is read at call time.
  const newExtractClient = () =>
    new LlamaExtract(process.env.LLAMA_CLOUD_API_KEY!, baseUrl);

  // Checks that an extraction result exposes both expected keys.
  const expectExtractResult = (result: object | undefined) => {
    expect("data" in result!).toBeTruthy();
    expect("extractionMetadata" in result!).toBeTruthy();
  };

  it.skipIf(skipIfNoApiKey)(
    "should create agents correctly",
    async () => {
      const agent = await newExtractClient().createAgent(agentName, textSchema);
      expect(agent).instanceOf(LlamaExtractAgent);
    },
    60000,
  );

  // Looks the agent up by the name used in the creation test above.
  it.skipIf(skipIfNoApiKey)(
    "should fetch agents correctly",
    async () => {
      const agent = await newExtractClient().getAgent(agentName);
      expect(agent).instanceOf(LlamaExtractAgent);
    },
    60000,
  );

  it.skipIf(skipIfNoApiKey)(
    "should extract data correctly (file paths and file contents) with an agent and delete that agent",
    async () => {
      const extractClient = newExtractClient();
      const agent = await extractClient.getAgent(agentName);
      const testFilePath = "test-extract-agent.md";
      await fs.writeFile(testFilePath, new TextEncoder().encode(sampleText));

      // Extraction from a path on disk.
      expectExtractResult(await agent!.extract(testFilePath));

      // Extraction from in-memory content with an explicit file name.
      const buffer = await fs.readFile(testFilePath);
      expectExtractResult(
        await agent!.extract(undefined, buffer, testFilePath),
      );

      const success = await extractClient.deleteAgent(agent!.id);
      expect(success).toBeTruthy();
    },
    60000,
  );

  it.skipIf(skipIfNoApiKey)(
    "should extract statelessly file paths and file contents",
    async () => {
      const testFilePath = "test-extract.md";
      await fs.writeFile(testFilePath, new TextEncoder().encode(sampleText));
      const extractClient = newExtractClient();

      // Extraction from a path on disk.
      expectExtractResult(
        await extractClient.extract(
          textSchema,
          {} as ExtractConfig,
          testFilePath,
        ),
      );

      // Extraction from in-memory content, testing without passing a file name.
      const buffer = await fs.readFile(testFilePath);
      expectExtractResult(
        await extractClient.extract(
          textSchema,
          {} as ExtractConfig,
          undefined,
          buffer,
        ),
      );
    },
    60000,
  );
});
describe("Error Handling Integration", () => {
it.skipIf(skipIfNoApiKey)(
"should handle malformed files gracefully",