mirror of
https://github.com/run-llama/llama_cloud_services.git
synced 2026-07-01 21:44:37 -04:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| bb8aeca14f | |||
| f385e96ab8 | |||
| c3e4696b5f | |||
| 1e40c9cf94 | |||
| 802bc2a9f8 | |||
| 5ea758b853 | |||
| 208b6f2fa5 | |||
| e1b9143f79 | |||
| 232c55bd6a | |||
| ab6f2f8da5 |
@@ -149,7 +149,7 @@ jobs:
|
||||
- name: Post to Extract Slack channel
|
||||
id: slack
|
||||
if: (failure() || cancelled()) && steps.runtime.outputs.notify_slack == 'true'
|
||||
uses: slackapi/slack-github-action@v1.27.0
|
||||
uses: slackapi/slack-github-action@v2.1.1
|
||||
with:
|
||||
channel-id: ${{ env.SLACK_CHANNEL_ID }}
|
||||
slack-message: |
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
[Discord](https://discord.gg/dGcwcsnxhU)
|
||||
|
||||
# Llama Cloud Services
|
||||
|
||||
> **⚠️ DEPRECATION NOTICE**
|
||||
>
|
||||
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
|
||||
@@ -12,79 +13,3 @@
|
||||
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
|
||||
>
|
||||
> The new packages provide the same functionality with improved performance, better support, and active development.
|
||||
|
||||
|
||||
This repository contains the code for hand-written SDKs and clients for interacting with LlamaCloud.
|
||||
|
||||
This includes:
|
||||
|
||||
- [LlamaParse](./parse.md) - A GenAI-native document parser that can parse complex document data for any downstream LLM use case (Agents, RAG, data processing, etc.).
|
||||
- [LlamaExtract](./extract.md) - A prebuilt agentic data extractor that can be used to transform data into a structured JSON representation.
|
||||
- [LlamaCloud Index](./index.md) - A widely customizable and fully automated document ingestion pipeline that also serves retrieval purposes.
|
||||
|
||||
## Getting Started
|
||||
|
||||
Install the package:
|
||||
|
||||
```bash
|
||||
pip install llama-cloud-services
|
||||
```
|
||||
|
||||
Then, get your API key from [LlamaCloud](https://cloud.llamaindex.ai/).
|
||||
|
||||
Then, you can use the services in your code:
|
||||
|
||||
```python
|
||||
from llama_cloud_services import (
|
||||
LlamaParse,
|
||||
LlamaExtract,
|
||||
LlamaCloudIndex,
|
||||
)
|
||||
|
||||
parser = LlamaParse(api_key="YOUR_API_KEY")
|
||||
extract = LlamaExtract(api_key="YOUR_API_KEY")
|
||||
index = LlamaCloudIndex(
|
||||
"my_first_index", project_name="default", api_key="YOUR_API_KEY"
|
||||
)
|
||||
```
|
||||
|
||||
See the quickstart guides for each service for more information:
|
||||
|
||||
- [LlamaParse](./parse.md)
|
||||
- [LlamaExtract](./extract.md)
|
||||
- [LlamaCloud Index](./index.md)
|
||||
|
||||
## Switch to EU SaaS 🇪🇺
|
||||
|
||||
If you are interested in using LlamaCloud services in the EU, you can adjust your base URL to `https://api.cloud.eu.llamaindex.ai`.
|
||||
|
||||
You can also create your API key in the EU region [here](https://cloud.eu.llamaindex.ai).
|
||||
|
||||
```python
|
||||
from llama_cloud_services import (
|
||||
LlamaParse,
|
||||
LlamaExtract,
|
||||
EU_BASE_URL,
|
||||
)
|
||||
|
||||
parser = LlamaParse(api_key="YOUR_API_KEY", base_url=EU_BASE_URL)
|
||||
extract = LlamaExtract(api_key="YOUR_API_KEY", base_url=EU_BASE_URL)
|
||||
index = LlamaCloudIndex(
|
||||
"my_first_index",
|
||||
project_name="default",
|
||||
api_key="YOUR_API_KEY",
|
||||
base_url=EU_BASE_URL,
|
||||
)
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
You can see complete SDK and API documentation for each service on [our official docs](https://docs.cloud.llamaindex.ai/).
|
||||
|
||||
## Terms of Service
|
||||
|
||||
See the [Terms of Service Here](./TOS.pdf).
|
||||
|
||||
## Get in Touch (LlamaCloud)
|
||||
|
||||
You can get in touch with us by following our [contact link](https://www.llamaindex.ai/contact).
|
||||
|
||||
-403
@@ -1,403 +0,0 @@
|
||||
# LlamaExtract
|
||||
|
||||
LlamaExtract provides a simple API for extracting structured data from unstructured documents like PDFs, text files and images.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Quick Start](#quick-start)
|
||||
- [Supported File Types](#supported-file-types)
|
||||
- [Different Input Types](#different-input-types)
|
||||
- [Async Extraction](#async-extraction)
|
||||
- [Core Concepts](#core-concepts)
|
||||
- [Defining Schemas](#defining-schemas)
|
||||
- [Using Pydantic (Recommended)](#using-pydantic-recommended)
|
||||
- [Using JSON Schema](#using-json-schema)
|
||||
- [Important restrictions on JSON/Pydantic Schema](#important-restrictions-on-jsonpydantic-schema)
|
||||
- [Extraction Configuration](#extraction-configuration)
|
||||
- [Key Configuration Options](#key-configuration-options)
|
||||
- [Extraction Agents (Advanced)](#extraction-agents-advanced)
|
||||
- [Creating Agents](#creating-agents)
|
||||
- [Agent Batch Processing](#agent-batch-processing)
|
||||
- [Updating Agent Schemas](#updating-agent-schemas)
|
||||
- [Managing Agents](#managing-agents)
|
||||
- [When to Use Agents vs Direct Extraction](#when-to-use-agents-vs-direct-extraction)
|
||||
- [Installation](#installation)
|
||||
- [Tips & Best Practices](#tips--best-practices)
|
||||
- [Additional Resources](#additional-resources)
|
||||
|
||||
## Quick Start
|
||||
|
||||
The simplest way to get started is to use the stateless API with the extraction configuration and the file/text to extract from:
|
||||
|
||||
```python
|
||||
from llama_cloud_services import LlamaExtract
|
||||
from llama_cloud import ExtractConfig, ExtractMode
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Initialize client
|
||||
extractor = LlamaExtract(api_key="YOUR_API_KEY")
|
||||
|
||||
|
||||
# Define schema using Pydantic
|
||||
class Resume(BaseModel):
|
||||
name: str = Field(description="Full name of candidate")
|
||||
email: str = Field(description="Email address")
|
||||
skills: list[str] = Field(description="Technical skills and technologies")
|
||||
|
||||
|
||||
# Configure extraction settings
|
||||
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
|
||||
|
||||
# Extract data directly from document - no agent needed!
|
||||
result = extractor.extract(Resume, config, "resume.pdf")
|
||||
print(result.data)
|
||||
```
|
||||
|
||||
### Supported File Types
|
||||
|
||||
LlamaExtract supports the following file formats:
|
||||
|
||||
- **Documents**: PDF (.pdf), Word (.docx)
|
||||
- **Text files**: Plain text (.txt), CSV (.csv), JSON (.json), HTML (.html, .htm), Markdown (.md)
|
||||
- **Images**: PNG (.png), JPEG (.jpg, .jpeg)
|
||||
|
||||
### Different Input Types
|
||||
|
||||
```python
|
||||
# From file path (string or Path)
|
||||
result = extractor.extract(Resume, config, "resume.pdf")
|
||||
|
||||
# From file handle
|
||||
with open("resume.pdf", "rb") as f:
|
||||
result = extractor.extract(Resume, config, f)
|
||||
|
||||
# From bytes with filename
|
||||
with open("resume.pdf", "rb") as f:
|
||||
file_bytes = f.read()
|
||||
from llama_cloud_services.extract import SourceText
|
||||
|
||||
result = extractor.extract(
|
||||
Resume, config, SourceText(file=file_bytes, filename="resume.pdf")
|
||||
)
|
||||
|
||||
# From text content
|
||||
text = "Name: John Doe\nEmail: john@example.com\nSkills: Python, AI"
|
||||
result = extractor.extract(Resume, config, SourceText(text_content=text))
|
||||
```
|
||||
|
||||
### Async Extraction
|
||||
|
||||
Use async extraction for better performance with multiple files or when integrating with async applications.
|
||||
Here `queue_extraction` will enqueue the extraction jobs and exit. Alternatively, you
|
||||
can use `aextract` to poll for the job and return the extraction results.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
|
||||
|
||||
async def extract_resumes():
|
||||
# Async extraction
|
||||
result = await extractor.aextract(Resume, config, "resume.pdf")
|
||||
print(result.data)
|
||||
|
||||
# Queue extraction jobs (returns immediately)
|
||||
jobs = await extractor.queue_extraction(
|
||||
Resume, config, ["resume1.pdf", "resume2.pdf"]
|
||||
)
|
||||
print(f"Queued {len(jobs)} extraction jobs")
|
||||
return jobs
|
||||
|
||||
|
||||
# Run async function
|
||||
jobs = asyncio.run(extract_resumes())
|
||||
# Check job status
|
||||
for job in jobs:
|
||||
status = agent.get_extraction_job(job.id).status
|
||||
print(f"Job {job.id}: {status}")
|
||||
|
||||
# Get results when complete
|
||||
results = [agent.get_extraction_run_for_job(job.id) for job in jobs]
|
||||
```
|
||||
|
||||
## Core Concepts
|
||||
|
||||
- **Data Schema**: Structure definition for the data you want to extract in the form of a JSON schema or a Pydantic model.
|
||||
- **Extraction Config**: Settings that control how extraction is performed (e.g., speed vs accuracy trade-offs).
|
||||
- **Extraction Jobs**: Asynchronous extraction tasks that can be monitored.
|
||||
- **Extraction Agents** (Advanced): Reusable extractors configured with a specific schema and extraction settings.
|
||||
|
||||
## Defining Schemas
|
||||
|
||||
Schemas define the structure of data you want to extract. You can use either Pydantic models or JSON Schema:
|
||||
|
||||
### Using Pydantic (Recommended)
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional
|
||||
from llama_cloud import ExtractConfig, ExtractMode
|
||||
|
||||
|
||||
class Experience(BaseModel):
|
||||
company: str = Field(description="Company name")
|
||||
title: str = Field(description="Job title")
|
||||
start_date: Optional[str] = Field(description="Start date of employment")
|
||||
end_date: Optional[str] = Field(description="End date of employment")
|
||||
|
||||
|
||||
class Resume(BaseModel):
|
||||
name: str = Field(description="Candidate name")
|
||||
experience: List[Experience] = Field(description="Work history")
|
||||
|
||||
|
||||
# Use the schema for extraction
|
||||
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
|
||||
result = extractor.extract(Resume, config, "resume.pdf")
|
||||
```
|
||||
|
||||
### Using JSON Schema
|
||||
|
||||
```python
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "description": "Candidate name"},
|
||||
"experience": {
|
||||
"type": "array",
|
||||
"description": "Work history",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {
|
||||
"type": "string",
|
||||
"description": "Company name",
|
||||
},
|
||||
"title": {"type": "string", "description": "Job title"},
|
||||
"start_date": {
|
||||
"anyOf": [{"type": "string"}, {"type": "null"}],
|
||||
"description": "Start date of employment",
|
||||
},
|
||||
"end_date": {
|
||||
"anyOf": [{"type": "string"}, {"type": "null"}],
|
||||
"description": "End date of employment",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Use the schema for extraction
|
||||
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
|
||||
result = extractor.extract(schema, config, "resume.pdf")
|
||||
```
|
||||
|
||||
### Important restrictions on JSON/Pydantic Schema
|
||||
|
||||
_LlamaExtract only supports a subset of the JSON Schema specification._ While limited, it should
|
||||
be sufficient for a wide variety of use-cases.
|
||||
|
||||
- All fields are required by default. Nullable fields must be explicitly marked as such,
|
||||
using `anyOf` with a `null` type. See `"start_date"` field above.
|
||||
- Root node must be of type `object`.
|
||||
- Schema nesting must be limited to within 5 levels.
|
||||
- The important fields are key names/titles, type and description. Fields for
|
||||
formatting, default values, etc. are **not supported**. If you need these, you can add the
|
||||
restrictions to your field description and/or use a post-processing step. e.g. default values can be supported by making a field optional and then setting `"null"` values from the extraction result to the default value.
|
||||
- There are other restrictions on number of keys, size of the schema, etc. that you may
|
||||
hit for complex extraction use cases. In such cases, it is worth thinking how to restructure
|
||||
your extraction workflow to fit within these constraints, e.g. by extracting subset of fields
|
||||
and later merging them together.
|
||||
|
||||
## Extraction Configuration
|
||||
|
||||
Configure how extraction is performed using `ExtractConfig`. The schema is the most important part, but several configuration options can significantly impact the extraction process.
|
||||
|
||||
```python
|
||||
from llama_cloud import ExtractConfig, ExtractMode, ChunkMode, ExtractTarget
|
||||
|
||||
# Basic configuration
|
||||
config = ExtractConfig(
|
||||
extraction_mode=ExtractMode.BALANCED, # FAST, BALANCED, MULTIMODAL, PREMIUM
|
||||
extraction_target=ExtractTarget.PER_DOC, # PER_DOC, PER_PAGE
|
||||
system_prompt="Focus on the most recent data",
|
||||
page_range="1-5,10-15", # Extract from specific pages
|
||||
)
|
||||
|
||||
# Advanced configuration
|
||||
advanced_config = ExtractConfig(
|
||||
extraction_mode=ExtractMode.MULTIMODAL,
|
||||
chunk_mode=ChunkMode.PAGE, # PAGE, SECTION
|
||||
high_resolution_mode=True, # Better OCR accuracy
|
||||
invalidate_cache=False, # Set to True to bypass cached results
|
||||
cite_sources=True, # Enable source citations
|
||||
use_reasoning=True, # Enable reasoning (not in FAST mode)
|
||||
confidence_scores=True, # MULTIMODAL/PREMIUM only
|
||||
)
|
||||
```
|
||||
|
||||
### Key Configuration Options
|
||||
|
||||
**Extraction Mode**: Controls processing quality and speed
|
||||
|
||||
- `FAST`: Fastest processing, suitable for simple documents with no OCR
|
||||
- `BALANCED`: Good speed/accuracy tradeoff for text-rich documents
|
||||
- `MULTIMODAL`: For visually rich documents with text, tables, and images (recommended)
|
||||
- `PREMIUM`: Highest accuracy with OCR, complex table/header detection
|
||||
|
||||
**Extraction Target**: Defines extraction scope
|
||||
|
||||
- `PER_DOC`: Apply schema to entire document (default)
|
||||
- `PER_PAGE`: Apply schema to each page, returns array of results
|
||||
|
||||
**Advanced Options**:
|
||||
|
||||
- `system_prompt`: Additional system-level instructions
|
||||
- `page_range`: Specific pages to extract (e.g., "1,3,5-7,9")
|
||||
- `chunk_mode`: Document splitting strategy (`PAGE` or `SECTION`)
|
||||
- `high_resolution_mode`: Better OCR for small text (slower processing)
|
||||
|
||||
**Extensions** (return additional metadata):
|
||||
|
||||
- `cite_sources`: Source tracing for extracted fields
|
||||
- `use_reasoning`: Explanations for extraction decisions
|
||||
- `confidence_scores`: Quantitative confidence measures (MULTIMODAL/PREMIUM only)
|
||||
|
||||
For complete configuration options, advanced settings, and detailed examples, see the [LlamaExtract Configuration Documentation](https://docs.cloud.llamaindex.ai/llamaextract/features/options).
|
||||
|
||||
## Extraction Agents (Advanced)
|
||||
|
||||
For reusable extraction workflows, you can create extraction agents that encapsulate both schema and configuration:
|
||||
|
||||
### Creating Agents
|
||||
|
||||
```python
|
||||
from llama_cloud_services import LlamaExtract
|
||||
from llama_cloud import ExtractConfig, ExtractMode
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Initialize client
|
||||
extractor = LlamaExtract()
|
||||
|
||||
|
||||
# Define schema
|
||||
class Resume(BaseModel):
|
||||
name: str = Field(description="Full name of candidate")
|
||||
email: str = Field(description="Email address")
|
||||
skills: list[str] = Field(description="Technical skills and technologies")
|
||||
|
||||
|
||||
# Configure extraction settings
|
||||
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
|
||||
|
||||
# Create extraction agent
|
||||
agent = extractor.create_agent(
|
||||
name="resume-parser", data_schema=Resume, config=config
|
||||
)
|
||||
|
||||
# Use the agent
|
||||
result = agent.extract("resume.pdf")
|
||||
print(result.data)
|
||||
```
|
||||
|
||||
### Agent Batch Processing
|
||||
|
||||
Process multiple files with an agent:
|
||||
|
||||
```python
|
||||
# Queue multiple files for extraction
|
||||
jobs = await agent.queue_extraction(["resume1.pdf", "resume2.pdf"])
|
||||
|
||||
# Check job status
|
||||
for job in jobs:
|
||||
status = agent.get_extraction_job(job.id).status
|
||||
print(f"Job {job.id}: {status}")
|
||||
|
||||
# Get results when complete
|
||||
results = [agent.get_extraction_run_for_job(job.id) for job in jobs]
|
||||
```
|
||||
|
||||
### Updating Agent Schemas
|
||||
|
||||
Schemas can be modified and updated after creation:
|
||||
|
||||
```python
|
||||
# Update schema
|
||||
agent.data_schema = new_schema
|
||||
|
||||
# Save changes
|
||||
agent.save()
|
||||
```
|
||||
|
||||
### Managing Agents
|
||||
|
||||
```python
|
||||
# List all agents
|
||||
agents = extractor.list_agents()
|
||||
|
||||
# Get specific agent
|
||||
agent = extractor.get_agent(name="resume-parser")
|
||||
|
||||
# Delete agent
|
||||
extractor.delete_agent(agent.id)
|
||||
```
|
||||
|
||||
### When to Use Agents vs Direct Extraction
|
||||
|
||||
**Use Direct Extraction When:**
|
||||
|
||||
- One-off extractions
|
||||
- Different schemas for different documents
|
||||
- Simple workflows
|
||||
- Getting started quickly
|
||||
|
||||
**Use Extraction Agents When:**
|
||||
|
||||
- Repeated extractions with the same schema
|
||||
- Team collaboration (shared, named extractors)
|
||||
- Complex workflows requiring state management
|
||||
- Production systems with consistent extraction patterns
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install llama-cloud-services
|
||||
```
|
||||
|
||||
## Tips & Best Practices
|
||||
|
||||
At the core of LlamaExtract is the schema, which defines the structure of the data you want to extract from your documents.
|
||||
|
||||
1. **Schema Design**:
|
||||
|
||||
- Try to limit schema nesting to 3-4 levels.
|
||||
- Make fields optional when data might not always be present. Having required fields may force the model
|
||||
to hallucinate when these fields are not present in the documents.
|
||||
- When you want to extract a variable number of entities, use an `array` type. However, note that you cannot use
|
||||
an `array` type for the root node.
|
||||
- Use descriptive field names and detailed descriptions. Use descriptions to pass formatting
|
||||
instructions or few-shot examples.
|
||||
- Above all, start simple and iteratively build your schema to incorporate requirements.
|
||||
|
||||
2. **Running Extractions**:
|
||||
- Note that resetting `agent.data_schema` will not save the schema to the database,
|
||||
until you call `agent.save`, but it will be used for running extractions.
|
||||
- Check extraction results for any errors. Error information is available in the `result.error` field for debugging.
|
||||
- Consider async operations (`aextract` or `queue_extraction`) for large-scale extraction or when processing multiple files.
|
||||
- For repeated extractions with the same schema, consider creating an extraction agent to avoid redefining the schema each time.
|
||||
|
||||
### Hitting "The response was too long to be processed" Error
|
||||
|
||||
This implies that the extraction response is hitting output token limits of the LLM. In such cases, it is worth rethinking the design of your schema to enable a more efficient/scalable extraction. e.g.
|
||||
|
||||
- Instead of one field that extracts a complex object, you can use multiple fields to distribute the extraction logic.
|
||||
- You can also use multiple schemas to extract different subsets of fields from the same document and merge them later.
|
||||
|
||||
Another option (orthogonal to the above) is to break the document into smaller sections and extract from each section individually, when possible. LlamaExtract will in most cases be able to handle both document and schema chunking automatically, but there are cases where you may need to do this manually.
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Extract Documentation](https://docs.cloud.llamaindex.ai/llamaextract/getting_started) - Details on Extract features, API and examples.
|
||||
- [Example Notebook](examples/extract/resume_screening.ipynb) - Detailed walkthrough of resume parsing
|
||||
- [Example Application with TypeScript](./examples-ts/extract/) - End-to-end examples using LlamaExtract TypeScript client.
|
||||
- [Discord Community](https://discord.com/invite/eN6D2HQ4aX) - Get help and share feedback
|
||||
@@ -1,86 +0,0 @@
|
||||
# LlamaCloud Index + Retriever
|
||||
|
||||
LlamaCloud is a new generation of managed parsing, ingestion, and retrieval services, designed to bring production-grade context-augmentation to your LLM and RAG applications.
|
||||
|
||||
Currently, LlamaCloud supports
|
||||
|
||||
- Managed Ingestion API, handling parsing and document management
|
||||
- Managed Retrieval API, configuring optimal retrieval for your RAG system
|
||||
|
||||
## Access
|
||||
|
||||
We are opening up a private beta to a limited set of enterprise partners for the managed ingestion and retrieval API. If you’re interested in centralizing your data pipelines and spending more time working on your actual RAG use cases, come [talk to us.](https://www.llamaindex.ai/contact)
|
||||
|
||||
If you have access to LlamaCloud, you can visit [LlamaCloud](https://cloud.llamaindex.ai) to sign in and get an API key.
|
||||
|
||||
## Setup
|
||||
|
||||
First, make sure you have the latest LlamaIndex version installed.
|
||||
|
||||
```
|
||||
pip uninstall llama-index # run this if upgrading from v0.9.x or older
|
||||
pip install -U llama-index --upgrade --no-cache-dir --force-reinstall
|
||||
```
|
||||
|
||||
The `llama-index-indices-managed-llama-cloud` package is included with the above install, but you can also install it directly:
|
||||
|
||||
```
|
||||
pip install -U llama-index-indices-managed-llama-cloud
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
You can create an index on LlamaCloud using the following code. By default, new indexes use managed embeddings (OpenAI text-embedding-3-small, 1536 dimensions, 1 credit/page):
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ[
|
||||
"LLAMA_CLOUD_API_KEY"
|
||||
] = "llx-..." # can provide API-key in env or in the constructor later on
|
||||
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_cloud_services import LlamaCloudIndex
|
||||
|
||||
# create a new index (uses managed embeddings by default)
|
||||
index = LlamaCloudIndex.from_documents(
|
||||
documents,
|
||||
"my_first_index",
|
||||
project_name="default",
|
||||
api_key="llx-...",
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# connect to an existing index
|
||||
index = LlamaCloudIndex("my_first_index", project_name="default")
|
||||
```
|
||||
|
||||
You can also configure a retriever for managed retrieval:
|
||||
|
||||
```python
|
||||
# from the existing index
|
||||
index.as_retriever()
|
||||
|
||||
# from scratch
|
||||
from llama_index.indices.managed.llama_cloud import LlamaCloudRetriever
|
||||
|
||||
retriever = LlamaCloudRetriever("my_first_index", project_name="default")
|
||||
```
|
||||
|
||||
And of course, you can use other index shortcuts to get use out of your new managed index:
|
||||
|
||||
```python
|
||||
query_engine = index.as_query_engine(llm=llm)
|
||||
|
||||
chat_engine = index.as_chat_engine(llm=llm)
|
||||
```
|
||||
|
||||
## Retriever Settings
|
||||
|
||||
A full list of retriever settings/kwargs is below:
|
||||
|
||||
- `dense_similarity_top_k`: Optional[int] -- If greater than 0, retrieve `k` nodes using dense retrieval
|
||||
- `sparse_similarity_top_k`: Optional[int] -- If greater than 0, retrieve `k` nodes using sparse retrieval
|
||||
- `enable_reranking`: Optional[bool] -- Whether to enable reranking or not. Sacrifices some speed for accuracy
|
||||
- `rerank_top_n`: Optional[int] -- The number of nodes to return after reranking initial retrieval results
|
||||
- `alpha`: Optional[float] -- The weighting between dense and sparse retrieval. 1 = Full dense retrieval, 0 = Full sparse retrieval.
|
||||
@@ -1,163 +0,0 @@
|
||||
# LlamaParse
|
||||
|
||||
LlamaParse is a **GenAI-native document parser** that can parse complex document data for any downstream LLM use case (RAG, agents).
|
||||
|
||||
It is really good at the following:
|
||||
|
||||
- ✅ **Broad file type support**: Parsing a variety of unstructured file types (.pdf, .pptx, .docx, .xlsx, .html) with text, tables, visual elements, weird layouts, and more.
|
||||
- ✅ **Table recognition**: Parsing embedded tables accurately into text and semi-structured representations.
|
||||
- ✅ **Multimodal parsing and chunking**: Extracting visual elements (images/diagrams) into structured formats and returning image chunks using the latest multimodal models.
|
||||
- ✅ **Custom parsing**: Input custom prompt instructions to customize the output the way you want it.
|
||||
|
||||
LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index).
|
||||
|
||||
The free plan is up to 1000 pages a day. The paid plan includes 7k free pages per week, plus 0.3c per additional page by default. There is a sandbox available to test the API [**https://cloud.llamaindex.ai/parse ↗**](https://cloud.llamaindex.ai/parse).
|
||||
|
||||
Read below for some quickstart information, or see the [full documentation](https://docs.cloud.llamaindex.ai/).
|
||||
|
||||
If you're a company interested in enterprise RAG solutions, and/or high volume/on-prem usage of LlamaParse, come [talk to us](https://www.llamaindex.ai/contact).
|
||||
|
||||
## Getting Started
|
||||
|
||||
First, login and get an api-key from [**https://cloud.llamaindex.ai/api-key ↗**](https://cloud.llamaindex.ai/api-key).
|
||||
|
||||
Then, install the package:
|
||||
|
||||
`pip install llama-cloud-services`
|
||||
|
||||
## CLI Usage
|
||||
|
||||
Now you can parse your first PDF file using the command line interface. Use the command `llama-parse [file_paths]`. See the help text with `llama-parse --help`.
|
||||
|
||||
```bash
|
||||
export LLAMA_CLOUD_API_KEY='llx-...'
|
||||
|
||||
# output as text
|
||||
llama-parse my_file.pdf --result-type text --output-file output.txt
|
||||
|
||||
# output as markdown
|
||||
llama-parse my_file.pdf --result-type markdown --output-file output.md
|
||||
|
||||
# output as raw json
|
||||
llama-parse my_file.pdf --output-raw-json --output-file output.json
|
||||
```
|
||||
|
||||
## Python Usage
|
||||
|
||||
You can also create simple scripts:
|
||||
|
||||
```python
|
||||
from llama_cloud_services import LlamaParse
|
||||
|
||||
parser = LlamaParse(
|
||||
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
|
||||
num_workers=4, # if multiple files passed, split in `num_workers` API calls
|
||||
verbose=True,
|
||||
language="en", # Optionally you can define a language, default=en
|
||||
)
|
||||
|
||||
# sync
|
||||
result = parser.parse("./my_file.pdf")
|
||||
|
||||
# sync batch
|
||||
results = parser.parse(["./my_file1.pdf", "./my_file2.pdf"])
|
||||
|
||||
# async
|
||||
result = await parser.aparse("./my_file.pdf")
|
||||
|
||||
# async batch
|
||||
results = await parser.aparse(["./my_file1.pdf", "./my_file2.pdf"])
|
||||
```
|
||||
|
||||
The result object is a fully typed `JobResult` object, and you can interact with it to parse and transform various parts of the result:
|
||||
|
||||
```python
|
||||
# get the llama-index markdown documents
|
||||
markdown_documents = result.get_markdown_documents(split_by_page=True)
|
||||
|
||||
# get the llama-index text documents
|
||||
text_documents = result.get_text_documents(split_by_page=False)
|
||||
|
||||
# get the image documents
|
||||
image_documents = result.get_image_documents(
|
||||
include_screenshot_images=True,
|
||||
include_object_images=False,
|
||||
# Optional: download the images to a directory
|
||||
# (default is to return the image bytes in ImageDocument objects)
|
||||
image_download_dir="./images",
|
||||
)
|
||||
|
||||
# access the raw job result
|
||||
# Items will vary based on the parser configuration
|
||||
for page in result.pages:
|
||||
print(page.text)
|
||||
print(page.md)
|
||||
print(page.images)
|
||||
print(page.layout)
|
||||
print(page.structuredData)
|
||||
```
|
||||
|
||||
See more details about the result object in the [example notebook](./examples/parse/demo_json_tour.ipynb).
|
||||
|
||||
### Using with file object / bytes
|
||||
|
||||
You can parse a file object directly:
|
||||
|
||||
```python
|
||||
from llama_cloud_services import LlamaParse
|
||||
|
||||
parser = LlamaParse(
|
||||
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
|
||||
num_workers=4, # if multiple files passed, split in `num_workers` API calls
|
||||
verbose=True,
|
||||
language="en", # Optionally you can define a language, default=en
|
||||
)
|
||||
|
||||
file_name = "my_file1.pdf"
|
||||
extra_info = {"file_name": file_name}
|
||||
|
||||
with open(f"./{file_name}", "rb") as f:
|
||||
# must provide extra_info with file_name key when passing a file object
|
||||
result = parser.parse(f, extra_info=extra_info)
|
||||
|
||||
# you can also pass file bytes directly
|
||||
with open(f"./{file_name}", "rb") as f:
|
||||
file_bytes = f.read()
|
||||
# must provide extra_info with file_name key when passing file bytes
|
||||
result = parser.parse(file_bytes, extra_info=extra_info)
|
||||
```
|
||||
|
||||
### Using with `SimpleDirectoryReader`
|
||||
|
||||
You can also integrate the parser as the default PDF loader in `SimpleDirectoryReader`:
|
||||
|
||||
```python
|
||||
from llama_cloud_services import LlamaParse
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
|
||||
parser = LlamaParse(
|
||||
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
|
||||
result_type="markdown", # "markdown" and "text" are available
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
file_extractor = {".pdf": parser}
|
||||
documents = SimpleDirectoryReader(
|
||||
"./data", file_extractor=file_extractor
|
||||
).load_data()
|
||||
```
|
||||
|
||||
Full documentation for `SimpleDirectoryReader` can be found on the [LlamaIndex Documentation](https://developers.llamaindex.ai/python/framework/module_guides/loading/simpledirectoryreader/).
|
||||
|
||||
## Examples
|
||||
|
||||
Several end-to-end indexing examples can be found in the examples folder
|
||||
|
||||
- [Getting Started](examples/parse/demo_basic.ipynb)
|
||||
- [Advanced RAG Example](examples/parse/demo_advanced.ipynb)
|
||||
- [Raw API Usage](examples/parse/demo_api.ipynb)
|
||||
- [Result Object Tour](examples/parse/demo_json_tour.ipynb)
|
||||
|
||||
## Documentation
|
||||
|
||||
[https://docs.cloud.llamaindex.ai/](https://docs.cloud.llamaindex.ai/)
|
||||
@@ -1,5 +1,11 @@
|
||||
# llama-cloud-services-py
|
||||
|
||||
## 0.6.94
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 232c55b: Include xlsx files in extract input
|
||||
|
||||
## 0.6.93
|
||||
|
||||
### Patch Changes
|
||||
|
||||
@@ -485,20 +485,25 @@ class ExtractionAgent:
|
||||
self._run_in_thread(_delete())
|
||||
|
||||
def list_extraction_runs(
|
||||
self, page: int = 0, limit: int = 100
|
||||
self, page: int = 0, page_size: int = 1000
|
||||
) -> PaginatedExtractRunsResponse:
|
||||
"""List extraction runs for the extraction agent.
|
||||
|
||||
Args:
|
||||
page: The page number (0-indexed)
|
||||
page_size: Number of results per page (default 1000, max 1000)
|
||||
|
||||
Returns:
|
||||
PaginatedExtractRunsResponse: Paginated list of extraction runs
|
||||
"""
|
||||
page_size = min(page_size, 1000)
|
||||
|
||||
@_async_retry()
|
||||
async def _list() -> PaginatedExtractRunsResponse:
|
||||
return await self._client.llama_extract.list_extract_runs(
|
||||
extraction_agent_id=self.id,
|
||||
skip=page * limit,
|
||||
limit=limit,
|
||||
skip=page * page_size,
|
||||
limit=page_size,
|
||||
)
|
||||
|
||||
return self._run_in_thread(_list())
|
||||
@@ -806,6 +811,7 @@ class LlamaExtract(BaseComponent):
|
||||
# Document files
|
||||
".pdf": "application/pdf",
|
||||
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
# Image files
|
||||
".png": "image/png",
|
||||
".jpg": "image/jpeg",
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
# llama_parse
|
||||
|
||||
## 0.6.94
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 232c55b: Include xlsx files in extract input
|
||||
- Updated dependencies [232c55b]
|
||||
- llama-cloud-services-py@0.6.94
|
||||
|
||||
## 0.6.93
|
||||
|
||||
### Patch Changes
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "llama_parse",
|
||||
"version": "0.6.93",
|
||||
"version": "0.6.94",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"private": false,
|
||||
|
||||
@@ -11,13 +11,13 @@ dev = [
|
||||
|
||||
[project]
|
||||
name = "llama-parse"
|
||||
version = "0.6.93"
|
||||
version = "0.6.94"
|
||||
description = "Parse files into RAG-Optimized formats."
|
||||
authors = [{name = "Logan Markewich", email = "logan@llamaindex.ai"}]
|
||||
requires-python = ">=3.9,<4.0"
|
||||
readme = "README.md"
|
||||
license = "MIT"
|
||||
dependencies = ["llama-cloud-services>=0.6.93"]
|
||||
dependencies = ["llama-cloud-services>=0.6.94"]
|
||||
|
||||
[project.scripts]
|
||||
llama-parse = "llama_parse.cli.main:parse"
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "llama-cloud-services-py",
|
||||
"version": "0.6.93",
|
||||
"version": "0.6.94",
|
||||
"private": false,
|
||||
"license": "MIT",
|
||||
"scripts": {},
|
||||
|
||||
+1
-1
@@ -23,7 +23,7 @@ dev = [
|
||||
|
||||
[project]
|
||||
name = "llama-cloud-services"
|
||||
version = "0.6.93"
|
||||
version = "0.6.94"
|
||||
description = "Tailored SDK clients for LlamaCloud services."
|
||||
authors = [{name = "Logan Markewich", email = "logan@runllama.ai"}]
|
||||
requires-python = ">=3.9,<4.0"
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
from llama_cloud.core.api_error import ApiError
|
||||
from llama_cloud.types import ExtractConfig
|
||||
@@ -13,9 +12,6 @@ from tenacity import (
|
||||
|
||||
from llama_cloud_services.extract import ExtractionAgent, LlamaExtract
|
||||
|
||||
# Global storage for agents to cleanup
|
||||
_TEST_AGENTS_TO_CLEANUP: List[str] = []
|
||||
|
||||
|
||||
def _is_rate_limit_error(exception: BaseException) -> bool:
|
||||
"""Check if the exception is a rate limit error (429)."""
|
||||
@@ -42,38 +38,3 @@ def pytest_configure(config):
|
||||
"""Register custom markers for extract tests."""
|
||||
config.addinivalue_line("markers", "agent_name: custom agent name for test")
|
||||
config.addinivalue_line("markers", "agent_schema: custom agent schema for test")
|
||||
|
||||
|
||||
def pytest_sessionfinish(session, exitstatus):
|
||||
"""Hook that runs after all tests complete - cleanup agents here"""
|
||||
print(
|
||||
f"pytest_sessionfinish hook called! Agents to cleanup: {_TEST_AGENTS_TO_CLEANUP}"
|
||||
)
|
||||
|
||||
if _TEST_AGENTS_TO_CLEANUP:
|
||||
print("Creating cleanup client...")
|
||||
# Create a fresh client just for cleanup
|
||||
cleanup_client = LlamaExtract(
|
||||
api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
|
||||
base_url=os.getenv("LLAMA_CLOUD_BASE_URL"),
|
||||
project_id=os.getenv("LLAMA_CLOUD_PROJECT_ID"),
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
for agent_id in _TEST_AGENTS_TO_CLEANUP:
|
||||
try:
|
||||
print(f"Deleting agent {agent_id}...")
|
||||
cleanup_client.delete_agent(agent_id)
|
||||
print(f"Cleaned up agent {agent_id}")
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to delete agent {agent_id}: {e}")
|
||||
|
||||
_TEST_AGENTS_TO_CLEANUP.clear()
|
||||
print("Agent cleanup completed")
|
||||
else:
|
||||
print("No agents to cleanup")
|
||||
|
||||
|
||||
def register_agent_for_cleanup(agent_id: str):
|
||||
"""Register an agent ID for cleanup at the end of the test session"""
|
||||
_TEST_AGENTS_TO_CLEANUP.append(agent_id)
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from pydantic import BaseModel
|
||||
@@ -6,7 +8,7 @@ from pydantic import BaseModel
|
||||
from llama_cloud_services.extract import LlamaExtract, ExtractionAgent, SourceText
|
||||
from llama_cloud.types import ExtractConfig, ExtractMode, ExtractRun
|
||||
from tests.extract.util import load_test_dotenv
|
||||
from .conftest import register_agent_for_cleanup, create_agent_with_retry
|
||||
from .conftest import create_agent_with_retry
|
||||
|
||||
load_test_dotenv()
|
||||
|
||||
@@ -59,17 +61,27 @@ def test_schema_dict():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
|
||||
"""Creates a test agent and collects it for cleanup at the end of all tests"""
|
||||
test_id = request.node.nodeid
|
||||
test_hash = hex(hash(test_id))[-8:]
|
||||
base_name = test_agent_name
|
||||
def unique_test_pdf(tmp_path):
|
||||
"""Copy test PDF to a unique path to avoid file deduplication across parallel tests.
|
||||
|
||||
Uses a UUID in the filename so that external_file_id is unique regardless of
|
||||
whether the full path or just the filename is sent to the backend.
|
||||
"""
|
||||
unique_name = f"{TEST_PDF.stem}-{uuid.uuid4().hex[:8]}{TEST_PDF.suffix}"
|
||||
unique_pdf = tmp_path / unique_name
|
||||
shutil.copy2(TEST_PDF, unique_pdf)
|
||||
return unique_pdf
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
|
||||
"""Creates a test agent with a unique name and cleans it up after the test."""
|
||||
unique_id = uuid.uuid4().hex[:8]
|
||||
base_name = next(
|
||||
(marker.args[0] for marker in request.node.iter_markers("agent_name")),
|
||||
base_name,
|
||||
test_agent_name,
|
||||
)
|
||||
name = f"{base_name}_{test_hash}"
|
||||
name = f"{base_name}_{unique_id}"
|
||||
|
||||
schema = next(
|
||||
(
|
||||
@@ -79,25 +91,20 @@ def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
|
||||
test_schema_dict,
|
||||
)
|
||||
|
||||
# Cleanup existing agent
|
||||
try:
|
||||
for agent in llama_extract.list_agents():
|
||||
if agent.name == name:
|
||||
llama_extract.delete_agent(agent.id)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to cleanup existing agent: {e}")
|
||||
|
||||
# Use config with cache invalidation to ensure fresh results in tests
|
||||
config = ExtractConfig(invalidate_cache=True)
|
||||
agent = create_agent_with_retry(
|
||||
llama_extract, name=name, data_schema=schema, config=config
|
||||
)
|
||||
|
||||
# Add agent to cleanup list via conftest helper
|
||||
register_agent_for_cleanup(agent.id)
|
||||
|
||||
yield agent
|
||||
|
||||
# Inline cleanup -- each worker cleans up its own agents
|
||||
try:
|
||||
llama_extract.delete_agent(agent.id)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to cleanup agent {agent.id}: {e}")
|
||||
|
||||
|
||||
class TestLlamaExtract:
|
||||
def test_init_without_api_key(self):
|
||||
@@ -138,34 +145,38 @@ class TestLlamaExtract:
|
||||
|
||||
class TestExtractionAgent:
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_single_file(self, test_agent):
|
||||
result = await test_agent.aextract(TEST_PDF)
|
||||
async def test_extract_single_file(self, test_agent, unique_test_pdf):
|
||||
result = await test_agent.aextract(unique_test_pdf)
|
||||
assert result.status == "SUCCESS"
|
||||
assert result.data is not None
|
||||
assert isinstance(result.data, dict)
|
||||
assert "title" in result.data
|
||||
assert "summary" in result.data
|
||||
|
||||
def test_sync_extract_single_file(self, test_agent):
|
||||
result = test_agent.extract(TEST_PDF)
|
||||
def test_sync_extract_single_file(self, test_agent, unique_test_pdf):
|
||||
result = test_agent.extract(unique_test_pdf)
|
||||
assert result.status == "SUCCESS"
|
||||
assert result.data is not None
|
||||
assert isinstance(result.data, dict)
|
||||
assert "title" in result.data
|
||||
assert "summary" in result.data
|
||||
|
||||
def test_extract_file_from_buffered_io(self, test_agent):
|
||||
result = test_agent.extract(SourceText(file=open(TEST_PDF, "rb")))
|
||||
def test_extract_file_from_buffered_io(self, test_agent, unique_test_pdf):
|
||||
result = test_agent.extract(
|
||||
SourceText(file=open(unique_test_pdf, "rb"), filename=unique_test_pdf.name)
|
||||
)
|
||||
assert result.status == "SUCCESS"
|
||||
assert result.data is not None
|
||||
assert isinstance(result.data, dict)
|
||||
assert "title" in result.data
|
||||
assert "summary" in result.data
|
||||
|
||||
def test_extract_file_from_bytes(self, test_agent):
|
||||
with open(TEST_PDF, "rb") as f:
|
||||
def test_extract_file_from_bytes(self, test_agent, unique_test_pdf):
|
||||
with open(unique_test_pdf, "rb") as f:
|
||||
file_bytes = f.read()
|
||||
result = test_agent.extract(SourceText(file=file_bytes, filename=TEST_PDF.name))
|
||||
result = test_agent.extract(
|
||||
SourceText(file=file_bytes, filename=unique_test_pdf.name)
|
||||
)
|
||||
assert result.status == "SUCCESS"
|
||||
assert result.data is not None
|
||||
assert isinstance(result.data, dict)
|
||||
@@ -181,7 +192,10 @@ class TestExtractionAgent:
|
||||
weight for 8 to 13 km (5–8 miles).[3] The name llama (also historically spelled
|
||||
"glama") was adopted by European settlers from native Peruvians.
|
||||
"""
|
||||
result = test_agent.extract(SourceText(text_content=TEST_TEXT))
|
||||
unique_name = f"text-{uuid.uuid4().hex[:8]}.txt"
|
||||
result = test_agent.extract(
|
||||
SourceText(text_content=TEST_TEXT, filename=unique_name)
|
||||
)
|
||||
assert result.status == "SUCCESS"
|
||||
assert result.data is not None
|
||||
assert isinstance(result.data, dict)
|
||||
@@ -189,8 +203,8 @@ class TestExtractionAgent:
|
||||
assert "summary" in result.data
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_multiple_files(self, test_agent):
|
||||
files = [TEST_PDF, TEST_PDF] # Using same file twice for testing
|
||||
async def test_extract_multiple_files(self, test_agent, unique_test_pdf):
|
||||
files = [unique_test_pdf, unique_test_pdf] # Using same file twice for testing
|
||||
response = await test_agent.aextract(files)
|
||||
|
||||
assert len(response) == 2
|
||||
@@ -219,15 +233,23 @@ class TestExtractionAgent:
|
||||
updated_agent = llama_extract.get_agent(name=test_agent.name)
|
||||
assert "new_field" in updated_agent.data_schema["properties"]
|
||||
|
||||
def test_list_extraction_runs(self, test_agent: ExtractionAgent):
|
||||
def test_list_extraction_runs(self, test_agent: ExtractionAgent, unique_test_pdf):
|
||||
assert test_agent.list_extraction_runs().total == 0
|
||||
test_agent.extract(TEST_PDF)
|
||||
test_agent.extract(unique_test_pdf)
|
||||
runs = test_agent.list_extraction_runs()
|
||||
assert runs.total > 0
|
||||
|
||||
def test_delete_extraction_run(self, test_agent: ExtractionAgent):
|
||||
def test_list_extraction_runs_with_page_size(
|
||||
self, test_agent: ExtractionAgent, unique_test_pdf
|
||||
):
|
||||
test_agent.extract(unique_test_pdf)
|
||||
runs = test_agent.list_extraction_runs(page=0, page_size=500)
|
||||
assert runs.total > 0
|
||||
assert runs.limit <= 500
|
||||
|
||||
def test_delete_extraction_run(self, test_agent: ExtractionAgent, unique_test_pdf):
|
||||
assert test_agent.list_extraction_runs().total == 0
|
||||
run: ExtractRun = test_agent.extract(TEST_PDF)
|
||||
run: ExtractRun = test_agent.extract(unique_test_pdf)
|
||||
test_agent.delete_extraction_run(run.id)
|
||||
runs = test_agent.list_extraction_runs()
|
||||
assert runs.total == 0
|
||||
|
||||
@@ -10,7 +10,7 @@ import uuid
|
||||
from llama_cloud.types import ExtractConfig, ExtractMode
|
||||
from deepdiff import DeepDiff
|
||||
from tests.extract.util import json_subset_match_score, load_test_dotenv
|
||||
from .conftest import register_agent_for_cleanup, create_agent_with_retry
|
||||
from .conftest import create_agent_with_retry
|
||||
|
||||
load_test_dotenv()
|
||||
|
||||
@@ -109,32 +109,24 @@ def extractor():
|
||||
@pytest.fixture
|
||||
def extraction_agent(test_case: ExtractionTestCase, extractor: LlamaExtract):
|
||||
"""Fixture to create and cleanup extraction agent for each test."""
|
||||
# Create unique name with random UUID (important for CI to avoid conflicts)
|
||||
unique_id = uuid.uuid4().hex[:8]
|
||||
agent_name = f"{test_case.name}_{unique_id}"
|
||||
|
||||
with open(test_case.schema_path, "r") as f:
|
||||
schema = json.load(f)
|
||||
|
||||
# Clean up any existing agents with this name
|
||||
try:
|
||||
agents = extractor.list_agents()
|
||||
for agent in agents:
|
||||
if agent.name == agent_name:
|
||||
extractor.delete_agent(agent.id)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to cleanup existing agent: {str(e)}")
|
||||
|
||||
# Create new agent with retry logic for rate limiting
|
||||
agent = create_agent_with_retry(
|
||||
extractor, name=agent_name, data_schema=schema, config=test_case.config
|
||||
)
|
||||
|
||||
# Register agent for cleanup at the end of the test session
|
||||
register_agent_for_cleanup(agent.id)
|
||||
|
||||
yield agent
|
||||
|
||||
# Inline cleanup -- each worker cleans up its own agents
|
||||
try:
|
||||
extractor.delete_agent(agent.id)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to cleanup agent {agent.id}: {e}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("LLAMA_CLOUD_API_KEY", "") == "",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import { createClient, createConfig, type Client } from "@hey-api/client-fetch";
|
||||
import { File } from "buffer";
|
||||
import * as extract from "./extract";
|
||||
import type { ExtractAgent, ExtractConfig } from "./extract";
|
||||
import type { ExtractAgent, ExtractConfig, PaginatedExtractRunsResponse } from "./extract";
|
||||
import { getEnv } from "@llamaindex/env";
|
||||
import type { ExtractResult } from "./type";
|
||||
import { getUrl } from "./utils";
|
||||
@@ -63,6 +63,22 @@ export class LlamaExtractAgent {
|
||||
retryInterval,
|
||||
);
|
||||
}
|
||||
|
||||
async listExtractionRuns(
|
||||
page: number = 0,
|
||||
pageSize: number = 1000,
|
||||
maxRetriesOnError: number = 10,
|
||||
retryInterval: number = 0.5,
|
||||
): Promise<PaginatedExtractRunsResponse | undefined> {
|
||||
return await extract.listExtractionRuns(
|
||||
this.agent.id,
|
||||
page,
|
||||
pageSize,
|
||||
this.client,
|
||||
maxRetriesOnError,
|
||||
retryInterval,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
export class LlamaExtract {
|
||||
|
||||
@@ -8,12 +8,14 @@ import {
|
||||
type ExtractJobCreate,
|
||||
type ExtractAgent,
|
||||
type ExtractJob,
|
||||
type PaginatedExtractRunsResponse,
|
||||
type CreateExtractionAgentApiV1ExtractionExtractionAgentsPostData,
|
||||
type GetExtractionAgentByNameApiV1ExtractionExtractionAgentsByNameNameGetData,
|
||||
type GetExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdGetData,
|
||||
type RunJobApiV1ExtractionJobsPostData,
|
||||
type GetJobApiV1ExtractionJobsJobIdGetData,
|
||||
type GetJobResultApiV1ExtractionJobsJobIdResultGetData,
|
||||
type ListExtractRunsApiV1ExtractionRunsGetData,
|
||||
StatusEnum,
|
||||
type StatelessExtractionRequest,
|
||||
type ExtractStatelessApiV1ExtractionRunPostData,
|
||||
@@ -26,6 +28,7 @@ import {
|
||||
getJobResultApiV1ExtractionJobsJobIdResultGet,
|
||||
extractStatelessApiV1ExtractionRunPost,
|
||||
deleteExtractionAgentApiV1ExtractionExtractionAgentsExtractionAgentIdDelete,
|
||||
listExtractRunsApiV1ExtractionRunsGet,
|
||||
} from "./api";
|
||||
import type { Client } from "@hey-api/client-fetch";
|
||||
import { sleep } from "./utils";
|
||||
@@ -545,4 +548,51 @@ export async function deleteAgent(
|
||||
}
|
||||
}
|
||||
|
||||
export { type ExtractAgent, type ExtractConfig };
|
||||
const MAX_PAGE_SIZE = 1000;
|
||||
const DEFAULT_PAGE_SIZE = 1000;
|
||||
|
||||
export async function listExtractionRuns(
|
||||
extractionAgentId: string,
|
||||
page: number = 0,
|
||||
pageSize: number = DEFAULT_PAGE_SIZE,
|
||||
client: Client | undefined = undefined,
|
||||
maxRetriesOnError: number = 10,
|
||||
retryInterval: number = 0.5,
|
||||
): Promise<PaginatedExtractRunsResponse | undefined> {
|
||||
const effectivePageSize = Math.min(pageSize, MAX_PAGE_SIZE);
|
||||
const data = {
|
||||
query: {
|
||||
extraction_agent_id: extractionAgentId,
|
||||
skip: page * effectivePageSize,
|
||||
limit: effectivePageSize,
|
||||
},
|
||||
} as ListExtractRunsApiV1ExtractionRunsGetData;
|
||||
const options = data as Options<ListExtractRunsApiV1ExtractionRunsGetData>;
|
||||
if (typeof client != "undefined") {
|
||||
options.client = client;
|
||||
}
|
||||
let retries: number = 0;
|
||||
while (true) {
|
||||
if (retries > maxRetriesOnError) {
|
||||
throw new Error(
|
||||
"Error while listing extraction runs: Exceeded maximum number of retries, the API keeps returning errors.",
|
||||
);
|
||||
}
|
||||
const response = await listExtractRunsApiV1ExtractionRunsGet(options);
|
||||
if (!response.response.ok) {
|
||||
if ("error" in response) {
|
||||
console.log(
|
||||
`An error occurred while listing extraction runs.\nDetails:\n\n${JSON.stringify(
|
||||
response.error,
|
||||
)}\n\nRetrying...`,
|
||||
);
|
||||
}
|
||||
retries++;
|
||||
await sleep(retryInterval * 1000);
|
||||
} else {
|
||||
return response.data as PaginatedExtractRunsResponse;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export { type ExtractAgent, type ExtractConfig, type PaginatedExtractRunsResponse };
|
||||
|
||||
@@ -17,7 +17,7 @@ export {
|
||||
export type { CloudConstructorParams } from "./type.js";
|
||||
export { LlamaParseReader } from "./reader.js";
|
||||
export { LlamaExtract, LlamaExtractAgent } from "./LlamaExtract.js";
|
||||
export type { ExtractConfig } from "./extract.js";
|
||||
export type { ExtractConfig, PaginatedExtractRunsResponse } from "./extract.js";
|
||||
export { LlamaClassify } from "./LlamaClassify.js";
|
||||
export type {
|
||||
ClassifierRule,
|
||||
|
||||
Reference in New Issue
Block a user