Compare commits

...

27 Commits

Author SHA1 Message Date
Logan f385e96ab8 Delete parse.md 2026-03-24 19:27:52 -06:00
Logan c3e4696b5f Delete index.md 2026-03-24 19:27:41 -06:00
Logan 1e40c9cf94 Delete extract.md 2026-03-24 19:27:25 -06:00
Logan 802bc2a9f8 Add deprecation notice and clean up README
Added deprecation notice and removed outdated content.
2026-03-24 19:26:59 -06:00
Neeraj Pradhan 5ea758b853 More robust extract tests with pytest xdist (#1117) 2026-02-16 16:16:15 -08:00
dependabot[bot] 208b6f2fa5 build(deps): bump slackapi/slack-github-action from 1.27.0 to 2.1.1 (#1092)
Bumps [slackapi/slack-github-action](https://github.com/slackapi/slack-github-action) from 1.27.0 to 2.1.1.
- [Release notes](https://github.com/slackapi/slack-github-action/releases)
- [Commits](https://github.com/slackapi/slack-github-action/compare/v1.27.0...v2.1.1)

---
updated-dependencies:
- dependency-name: slackapi/slack-github-action
  dependency-version: 2.1.1
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-02-14 21:03:05 -06:00
github-actions[bot] e1b9143f79 chore: version packages (#1116)
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2026-02-13 15:29:09 -08:00
Neeraj Pradhan 232c55bd6a Bump up patch version (#1115) 2026-02-13 15:20:52 -08:00
Neeraj Pradhan ab6f2f8da5 Allows xlsx files in the sdk for extract (#1114) 2026-02-13 14:44:25 -08:00
github-actions[bot] 66c2639ec8 chore: version packages (#1112) 2026-02-11 15:18:43 -06:00
Logan da1916c69f more loudly deprecate ancient llama-parse package (#1111) 2026-02-11 15:16:01 -06:00
Neeraj Pradhan 345e272573 Lower frequency for e2e tests (#1110) 2026-02-11 09:07:15 -08:00
github-actions[bot] d70fbac1ce chore: version packages (#1103) 2026-02-02 11:46:39 -06:00
Logan 2358df10c6 add notice (don't merge until ready) (#1065) 2026-02-02 11:42:47 -06:00
Neeraj Pradhan 829628cc86 Use unique filenames when running dist tests (#1101) 2026-01-30 14:00:27 -08:00
Neeraj Pradhan 42b7bbd1ae Use sonnet when testing premium mode in extract e2e (#1098)
* Use sonnet when testing premium mode in extract e2e

* fix parse model
2026-01-27 16:16:48 -08:00
Neeraj Pradhan 38da9a52d7 Invalidate cache when running extract tests (#1097) 2026-01-26 17:33:23 -08:00
Neeraj Pradhan 1e7ec40ee7 Fix verbose logging on slack channel (#1096) 2026-01-26 17:12:50 -08:00
Neeraj Pradhan dd83c1a9d0 Add retries to all extract sdk functions uniformly (#1095) 2026-01-26 12:05:16 -08:00
Neeraj Pradhan 7cb83f5cd3 Change cron schedule for hourly extract tests (#1094) 2026-01-26 10:15:34 -08:00
Neeraj Pradhan b05266be6d Try to reparse scheduled workflow (#1093) 2026-01-26 09:56:22 -08:00
Neeraj Pradhan eab4798165 Force github reparse of the workflow (#1090) 2026-01-23 11:36:28 -08:00
Neeraj Pradhan b174fa8fab Run hourly extract tests to catch SDK schema drifts (#1089)
* Run hourly extract tests to catch SDK schema drifts

* fix url

* fix prod/staging env
2026-01-22 18:18:45 -08:00
github-actions[bot] b12ffef916 chore: version packages (#1087)
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2026-01-21 12:44:43 -08:00
Neeraj Pradhan 07ec282257 Bump up patch version for python packages (#1086) 2026-01-21 12:30:23 -08:00
Neeraj Pradhan 013b689812 Bump up minor version for python packages (#1085) 2026-01-21 12:13:13 -08:00
Adrian Lyjak 3040951cb8 Use error description in invalid extraction error (#1081)
* fix: display extraction job error in InvalidExtractionData exception

Refactored InvalidExtractionData to read the `error` field from
ExtractRun and prominently display it in the exception message.
The job-level error is now stored in the `extraction_error` attribute
and included in the invalid_item's metadata as `job_error`.

* Create three-yaks-beg.md

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-01-18 17:43:21 -05:00
87 changed files with 1046 additions and 974 deletions
+162
View File
@@ -0,0 +1,162 @@
name: Extract E2E Tests (every 4 hours)
on:
schedule:
- cron: "0 */4 * * *"
workflow_dispatch:
# Allows manual triggering
inputs:
environment:
description: "Environment to run the tests in"
required: false
default: staging
type: choice
options:
- staging
- production
notify_slack:
description: "Notify Slack"
required: false
default: false
type: boolean
workflow_call:
env:
UV_VERSION: "0.7.20"
PYTHON_VERSION: "3.12"
SLACK_CHANNEL_ID: C078PHNTF44 # Extract channel ID
API_E2E_LOG_PATH: ${{ github.workspace }}/extract-e2e.log
jobs:
extract-e2e:
name: "Extract E2E Tests (${{ matrix.environment }})"
runs-on: ubuntu-latest
timeout-minutes: 30
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.environment }}
cancel-in-progress: true
strategy:
fail-fast: false
matrix:
environment: ${{ github.event_name == 'schedule' && fromJson('["staging", "production"]') || fromJson(format('["{0}"]', github.event.inputs.environment || 'staging')) }}
steps:
- name: Set runtime inputs
id: runtime
run: |
environment=${{ matrix.environment }}
notify_slack=${{ github.event.inputs.notify_slack || github.event_name == 'schedule' }}
echo "environment=${environment}" >> $GITHUB_OUTPUT
echo "notify_slack=${notify_slack}" >> $GITHUB_OUTPUT
if [ "${environment}" = "production" ]; then
echo "LLAMA_CLOUD_BASE_URL=https://api.cloud.llamaindex.ai" >> $GITHUB_ENV
api_key_secret="${{ secrets.LLAMA_CLOUD_API_KEY }}"
project_id_secret="${{ secrets.LLAMA_CLOUD_PROJECT_ID }}"
else
echo "LLAMA_CLOUD_BASE_URL=https://api.staging.llamaindex.ai" >> $GITHUB_ENV
api_key_secret="${{ secrets.LLAMA_CLOUD_API_KEY_STAGING }}"
project_id_secret="${{ secrets.LLAMA_CLOUD_PROJECT_ID_STAGING }}"
fi
if [ -n "$api_key_secret" ]; then
echo "LLAMA_CLOUD_API_KEY=$api_key_secret" >> $GITHUB_ENV
fi
if [ -n "$project_id_secret" ]; then
echo "LLAMA_CLOUD_PROJECT_ID=$project_id_secret" >> $GITHUB_ENV
fi
- uses: actions/checkout@v5
with:
fetch-depth: 0
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
version: ${{ env.UV_VERSION }}
- name: Set up Python
run: uv python install ${{ env.PYTHON_VERSION }} && uv python pin ${{ env.PYTHON_VERSION }}
- name: Run Extract E2E tests
id: extract-tests
continue-on-error: true
working-directory: py
run: |
set -o pipefail
rm -f "$API_E2E_LOG_PATH"
uv run pytest -v -n 8 --timeout=300 --session-timeout=1740 tests/extract/ 2>&1 | tee "$API_E2E_LOG_PATH"
- name: Extract pytest failure summary
id: failed-tests
if: steps.extract-tests.outcome == 'failure' || cancelled()
run: |
summary="$(python3 - <<'PY'
# Build a short failure summary from the pytest log for the Slack notification.
# Reads the log at $API_E2E_LOG_PATH, finds the "short test summary info"
# section, and prints one "<STATUS>: <test id>" line per FAILED/ERROR entry.
import os
import re
from pathlib import Path

# Cap on reported entries so the Slack message stays short.
MAX_REPORTED = 20

log_path = Path(os.environ["API_E2E_LOG_PATH"])
if not log_path.exists():
    print("Test log not found.")
    raise SystemExit(0)

lines = log_path.read_text(errors="ignore").splitlines()

# Find the "short test summary info" section
start = None
for i, line in enumerate(lines):
    if line.startswith("=") and "short test summary info" in line:
        start = i + 1
        break
if start is None:
    print("No test summary found.")
    raise SystemExit(0)

# Summary lines look like "FAILED <test id> - <reason>". Capture everything up
# to the first " - " so parametrized ids containing spaces, commas, "=" and
# similar characters are kept whole instead of being cut mid-name; the reason
# text after the separator is dropped.
entry_re = re.compile(r"(FAILED|ERROR) (.+?)(?: - .*)?$")
failed_tests = []
for line in lines[start:]:
    if line.startswith("="):
        break  # End of section
    match = entry_re.match(line)
    if match:
        failed_tests.append(f"{match.group(1)}: {match.group(2)}")

if failed_tests:
    shown = failed_tests[:MAX_REPORTED]
    hidden = len(failed_tests) - len(shown)
    if hidden > 0:
        # Make the truncation visible instead of silently dropping entries.
        shown.append(f"... and {hidden} more")
    print("\n".join(shown))
else:
    print("No failed tests found in summary.")
PY
)"
if [ -z "$summary" ]; then
summary="Failed test summary not available. Review the full run logs."
fi
{
printf 'summary<<EOF\n%s\nEOF\n' "$summary"
} >> "$GITHUB_OUTPUT"
- name: Check test results
if: always()
run: |
if [ "${{ steps.extract-tests.outcome }}" == "failure" ]; then
echo "Extract E2E tests failed"
exit 1
fi
# Notify the Extract Slack channel when the run failed or was cancelled.
# slack-github-action v2 removed the v1 `channel-id` / `slack-message` inputs and
# no longer reads SLACK_BOT_TOKEN from the environment: it requires `method`,
# `token`, and a `payload` (YAML or JSON) describing the API call.
- name: Post to Extract Slack channel
  id: slack
  if: (failure() || cancelled()) && steps.runtime.outputs.notify_slack == 'true'
  uses: slackapi/slack-github-action@v2.1.1
  with:
    method: chat.postMessage
    token: ${{ secrets.SLACK_BOT_TOKEN }}
    # toJSON() turns the (possibly multi-line) message into one quoted scalar so
    # newlines and quotes in the test summary cannot break the payload document.
    payload: |
      channel: ${{ env.SLACK_CHANNEL_ID }}
      text: ${{ toJSON(format(':red_circle: *Extract E2E Failed* ({0}) ```{1}``` <{2}/{3}/actions/runs/{4}|View Run>', steps.runtime.outputs.environment, steps.failed-tests.outputs.summary || 'Failed test summary not available. Review the full run logs.', github.server_url, github.repository, github.run_id)) }}
+1 -1
View File
@@ -22,7 +22,7 @@ repos:
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
exclude: ".*uv.lock"
exclude: ".*uv.lock|examples/"
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.10.1
hooks:
+9 -74
View File
@@ -4,77 +4,12 @@
# Llama Cloud Services
This repository contains the code for hand-written SDKs and clients for interacting with LlamaCloud.
This includes:
- [LlamaParse](./parse.md) - A GenAI-native document parser that can parse complex document data for any downstream LLM use case (Agents, RAG, data processing, etc.).
- [LlamaExtract](./extract.md) - A prebuilt agentic data extractor that can be used to transform data into a structured JSON representation.
- [LlamaCloud Index](./index.md) - A widely customizable and fully automated document ingestion pipeline that also serves retrieval purposes.
## Getting Started
Install the package:
```bash
pip install llama-cloud-services
```
Then, get your API key from [LlamaCloud](https://cloud.llamaindex.ai/).
Then, you can use the services in your code:
```python
from llama_cloud_services import (
LlamaParse,
LlamaExtract,
LlamaCloudIndex,
)
parser = LlamaParse(api_key="YOUR_API_KEY")
extract = LlamaExtract(api_key="YOUR_API_KEY")
index = LlamaCloudIndex(
"my_first_index", project_name="default", api_key="YOUR_API_KEY"
)
```
See the quickstart guides for each service for more information:
- [LlamaParse](./parse.md)
- [LlamaExtract](./extract.md)
- [LlamaCloud Index](./index.md)
## Switch to EU SaaS 🇪🇺
If you are interested in using LlamaCloud services in the EU, you can adjust your base URL to `https://api.cloud.eu.llamaindex.ai`.
You can also create your API key in the EU region [here](https://cloud.eu.llamaindex.ai).
```python
from llama_cloud_services import (
LlamaParse,
LlamaExtract,
EU_BASE_URL,
)
parser = LlamaParse(api_key="YOUR_API_KEY", base_url=EU_BASE_URL)
extract = LlamaExtract(api_key="YOUR_API_KEY", base_url=EU_BASE_URL)
index = LlamaCloudIndex(
"my_first_index",
project_name="default",
api_key="YOUR_API_KEY",
base_url=EU_BASE_URL,
)
```
## Documentation
You can see complete SDK and API documentation for each service on [our official docs](https://docs.cloud.llamaindex.ai/).
## Terms of Service
See the [Terms of Service Here](./TOS.pdf).
## Get in Touch (LlamaCloud)
You can get in touch with us by following our [contact link](https://www.llamaindex.ai/contact).
> **⚠️ DEPRECATION NOTICE**
>
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
>
> **Please migrate to the new packages:**
> - **Python**: `pip install "llama-cloud>=1.0"` ([GitHub](https://github.com/run-llama/llama-cloud-py))
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
>
> The new packages provide the same functionality with improved performance, better support, and active development.
+10
View File
@@ -1,4 +1,14 @@
# LlamaCloud Services Examples - Python
> **⚠️ DEPRECATION NOTICE**
>
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
>
> **Please migrate to the new packages:**
> - **Python**: `pip install "llama-cloud>=1.0"` ([GitHub](https://github.com/run-llama/llama-cloud-py))
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
>
> The new packages provide the same functionality with improved performance, better support, and active development.
In this folder you will find several python notebooks that contain examples regarding:
@@ -17,6 +17,14 @@
"This is useful when you need to parse many documents at once, as the batch API handles the orchestration and provides progress tracking."
]
},
{
"cell_type": "markdown",
"id": "0c2b5e1a",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "cell-1",
@@ -16,6 +16,14 @@
"![](asset_manager_fund_analysis.png)\n"
]
},
{
"cell_type": "markdown",
"id": "cbafd7ee",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "cda2e5e9-fe9d-42d9-9387-f529d970ff7b",
@@ -20,6 +20,14 @@
"This workflow is designed for equity research analysts and investment professionals."
]
},
{
"cell_type": "markdown",
"id": "e7979faf",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -19,6 +19,13 @@
"The example we go through below is also replicable within Llama Cloud as well, where you will also be able to pick between a number of pre-defined schemas, instead of building your own."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+7
View File
@@ -15,6 +15,13 @@
"Dow Jones Industrial Average (DJIA) is a stock market index that consists of 30 large companies listed on the New York Stock Exchange and the NASDAQ and is considered a good proxy for the overall US stock market. For this exercise, we will extract the insider transactions for all the companies in the DJIA. Let's first get the list of tickers in the Dow Jones Industrial Average using Wikipedia."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -16,6 +16,14 @@
"This approach reduces manual data entry, improves extraction accuracy and standardization, and provides traceability for each technical detail."
]
},
{
"cell_type": "markdown",
"id": "8d1efe6e",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "a3b8c8d5-ff3e-48ce-b0b8-29b6b1f517f8",
+7
View File
@@ -11,6 +11,13 @@
"Take a look at one of the resumes in the `data/resumes` directory. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+8
View File
@@ -20,6 +20,14 @@
"> **Note:** This principle of what fields generalize across your target documents and what might be optional is an important one to keep in mind when designing your schema. \n"
]
},
{
"cell_type": "markdown",
"id": "355adfd4",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -21,6 +21,14 @@
"The following notebook uses the eventdriven syntax (with custom events, steps, and a workflow class) adapted from the technical datasheet and contract review examples."
]
},
{
"cell_type": "markdown",
"id": "ab7be988",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "36d8e34e-ed98-46ac-b744-1642f6e253d5",
@@ -35,6 +35,14 @@
"📖 For more details, see the [Extraction Target documentation](https://developers.llamaindex.ai/python/cloud/llamaextract/features/concepts/#extraction-target)."
]
},
{
"cell_type": "markdown",
"id": "cb760594",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+7
View File
@@ -31,6 +31,13 @@
"| Sep-02-2025 | 0.6.62 | Active |\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -19,6 +19,13 @@
"The workflow is implemented as a proper LlamaIndex Workflow with separate steps for parsing, classification, and extraction, connected by typed events. This provides modularity, observability, and type safety."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -27,6 +27,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "e2b422f5",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "2e4f707a-c7b5-473f-b4a6-881e2245e82d",
@@ -14,6 +14,13 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
+5
View File
@@ -1,4 +1,9 @@
# ⚠️ DEPRECATION NOTICE:
# This example uses the deprecated llama-cloud-services package, which will be
# maintained until May 1, 2026.
# Please migrate to: pip install "llama-cloud>=1.0" (https://github.com/run-llama/llama-cloud-py)
# (Kept as comments so the docstring below remains the module docstring.)
"""
Example: Batch Processing a Folder of PDFs with LlamaParse
This script demonstrates how to process multiple PDFs from a folder
@@ -17,6 +17,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "0cb82ca8",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "ef115dbe-b834-4639-828e-e2c11aef710b",
+7
View File
@@ -18,6 +18,13 @@
"| Aug-18-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+7
View File
@@ -14,6 +14,13 @@
"| Aug-18-2025 | N/A | Maintained |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+7
View File
@@ -14,6 +14,13 @@
"| Aug-18-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+7
View File
@@ -18,6 +18,13 @@
"| Aug-18-2025 | 0.6.61 | Maintained |\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+8
View File
@@ -19,6 +19,14 @@
"| Aug-18-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "bb595498",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "a004db48-8d3f-421c-915a-477692f71b90",
+7
View File
@@ -16,6 +16,13 @@
"| Aug-19-2025 | 0.6.61 | Deprecated |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
+8
View File
@@ -19,6 +19,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "8b937443",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "a004db48-8d3f-421c-915a-477692f71b90",
+8
View File
@@ -19,6 +19,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "037cc6d9",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "a004db48-8d3f-421c-915a-477692f71b90",
+8
View File
@@ -19,6 +19,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "7aa3be47",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
+7
View File
@@ -21,6 +21,13 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -8,6 +8,14 @@
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/demo_starter_multimodal.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "da52cfa3",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "4e081457",
@@ -7,6 +7,13 @@
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/demo_starter_parse_selected_pages.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -14,6 +14,13 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
+8
View File
@@ -17,6 +17,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |\n"
]
},
{
"cell_type": "markdown",
"id": "a3636937",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "5f7d99ad-6ebd-47d0-92a7-566630b0c22a",
+7
View File
@@ -7,6 +7,13 @@
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/excel/o1_excel_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -17,6 +17,14 @@
"| Before Feb 2025 | N/A | Deprecated |"
]
},
{
"cell_type": "markdown",
"id": "0facb0b9",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "e8db8ac2-5221-44de-a53e-cb5ab37ac8f5",
@@ -19,6 +19,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |\n"
]
},
{
"cell_type": "markdown",
"id": "bb943339",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -19,6 +19,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |\n"
]
},
{
"cell_type": "markdown",
"id": "17e62444",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -19,6 +19,14 @@
"| Aug-19-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "fe7e837a",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "15e60ecf-519c-41fc-911b-765adaf8bad4",
@@ -9,6 +9,13 @@
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/multimodal/insurance_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -23,6 +23,13 @@
"- [US Immigration Case](https://github.com/user-attachments/files/16536446/us_immigration_case.pdf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -27,6 +27,14 @@
"![mm_rag_diagram](./multimodal_contextual_retrieval_rag_img.png)"
]
},
{
"cell_type": "markdown",
"id": "93d4f9ab",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
@@ -27,6 +27,14 @@
"![mm_rag_diagram](./multimodal_rag_slide_deck_img.png)"
]
},
{
"cell_type": "markdown",
"id": "fc1b5803",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
@@ -19,6 +19,14 @@
"| Aug-20-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"id": "7dafd458",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
@@ -21,6 +21,14 @@
"We use our workflow abstraction to define an agentic system that contains two main phases: a research phase that pulls in relevant files through chunk-level or file-level retrieval, and then a blog generation phase that synthesizes the final report."
]
},
{
"cell_type": "markdown",
"id": "8c881021",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
@@ -9,6 +9,13 @@
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/multimodal/product_manual_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -19,6 +19,14 @@
"| Prior to Feb-2025 | N/A | Deprecated |"
]
},
{
"cell_type": "markdown",
"id": "b27f0e78",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -14,6 +14,13 @@
"| Prior to Feb-2025 | N/A | Deprecated |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -29,6 +29,13 @@
"In this demonstration, we showcase how parsing instructions can be used to extract specific information from unstructured documents. Using a McDonald's Receipt, we show how to ignore parts of the document and only parse the price of each order and the final amount to be paid."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -18,6 +18,13 @@
"Many documents can have varying complexity across pages - some pages have text, and other pages have images. The text-only pages only require cheap parsing modes, whereas the image-based pages require more advanced modes. In this notebook we show you how to take advantage of \"auto mode\" in LlamaParse which adaptively parses different pages according to different modes, which lets you get optimal performance at the cheapest cost.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -37,6 +37,13 @@
"With visual references, you can build applications that preserve document structure and provide users with trustworthy, traceable visual citations. We will now leverage this feature to build our query engine."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -24,6 +24,13 @@
"| Aug-18-2025 | 0.6.61 | Maintained |"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -26,6 +26,14 @@
"We use LlamaParse to parse the context documents as well as the RFP document itself."
]
},
{
"cell_type": "markdown",
"id": "ad140aef",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -22,6 +22,14 @@
"**NOTE**: The pricing for LlamaParse + gpt4o is an order of magnitude more expensive than using LlamaParse by default. Currently, every page parsed with gpt4o counts for 10 pages in the LlamaParse usage tracker.\n"
]
},
{
"cell_type": "markdown",
"id": "211c52fe",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -1,4 +1,9 @@
"""
⚠️ DEPRECATION NOTICE:
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
Please migrate to: pip install "llama-cloud>=1.0" (https://github.com/run-llama/llama-cloud-py)
"""
"""
Generate sample spreadsheets for LlamaSheets + Claude workflows.
This script creates example Excel files that demonstrate different use cases:
@@ -1,3 +1,8 @@
"""
⚠️ DEPRECATION NOTICE:
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
Please migrate to: pip install "llama-cloud>=1.0" (https://github.com/run-llama/llama-cloud-py)
"""
"""Helper script to extract spreadsheets using LlamaSheets."""
import asyncio
@@ -1,4 +1,9 @@
"""
⚠️ DEPRECATION NOTICE:
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
Please migrate to: pip install "llama-cloud>=1.0" (https://github.com/run-llama/llama-cloud-py)
"""
"""
Generate sample spreadsheets for LlamaSheets + LlamaIndex Agent workflows.
This script creates example Excel files that demonstrate different use cases:
@@ -1,4 +1,9 @@
"""
⚠️ DEPRECATION NOTICE:
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
Please migrate to: pip install "llama-cloud>=1.0" (https://github.com/run-llama/llama-cloud-py)
"""
"""
LlamaSheets Agent with LlamaIndex
This example shows how to build an agent that can work with spreadsheet data
@@ -1,3 +1,8 @@
"""
⚠️ DEPRECATION NOTICE:
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
Please migrate to: pip install "llama-cloud>=1.0" (https://github.com/run-llama/llama-cloud-py)
"""
"""Helper script to extract spreadsheets using LlamaSheets."""
import asyncio
@@ -26,6 +26,13 @@
"We'll split this into segments categorized as either `essay` or `research_paper`.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
]
},
{
"cell_type": "markdown",
"metadata": {},
-403
View File
@@ -1,403 +0,0 @@
# LlamaExtract
LlamaExtract provides a simple API for extracting structured data from unstructured documents like PDFs, text files and images.
## Table of Contents
- [Quick Start](#quick-start)
- [Supported File Types](#supported-file-types)
- [Different Input Types](#different-input-types)
- [Async Extraction](#async-extraction)
- [Core Concepts](#core-concepts)
- [Defining Schemas](#defining-schemas)
- [Using Pydantic (Recommended)](#using-pydantic-recommended)
- [Using JSON Schema](#using-json-schema)
- [Important restrictions on JSON/Pydantic Schema](#important-restrictions-on-jsonpydantic-schema)
- [Extraction Configuration](#extraction-configuration)
- [Key Configuration Options](#key-configuration-options)
- [Extraction Agents (Advanced)](#extraction-agents-advanced)
- [Creating Agents](#creating-agents)
- [Agent Batch Processing](#agent-batch-processing)
- [Updating Agent Schemas](#updating-agent-schemas)
- [Managing Agents](#managing-agents)
- [When to Use Agents vs Direct Extraction](#when-to-use-agents-vs-direct-extraction)
- [Installation](#installation)
- [Tips & Best Practices](#tips--best-practices)
- [Additional Resources](#additional-resources)
## Quick Start
The simplest way to get started is to use the stateless API with the extraction configuration and the file/text to extract from:
```python
from llama_cloud_services import LlamaExtract
from llama_cloud import ExtractConfig, ExtractMode
from pydantic import BaseModel, Field
# Initialize client
extractor = LlamaExtract(api_key="YOUR_API_KEY")
# Define schema using Pydantic
class Resume(BaseModel):
name: str = Field(description="Full name of candidate")
email: str = Field(description="Email address")
skills: list[str] = Field(description="Technical skills and technologies")
# Configure extraction settings
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
# Extract data directly from document - no agent needed!
result = extractor.extract(Resume, config, "resume.pdf")
print(result.data)
```
### Supported File Types
LlamaExtract supports the following file formats:
- **Documents**: PDF (.pdf), Word (.docx)
- **Text files**: Plain text (.txt), CSV (.csv), JSON (.json), HTML (.html, .htm), Markdown (.md)
- **Images**: PNG (.png), JPEG (.jpg, .jpeg)
### Different Input Types
```python
# From file path (string or Path)
result = extractor.extract(Resume, config, "resume.pdf")
# From file handle
with open("resume.pdf", "rb") as f:
result = extractor.extract(Resume, config, f)
# From bytes with filename
with open("resume.pdf", "rb") as f:
file_bytes = f.read()
from llama_cloud_services.extract import SourceText
result = extractor.extract(
Resume, config, SourceText(file=file_bytes, filename="resume.pdf")
)
# From text content
text = "Name: John Doe\nEmail: john@example.com\nSkills: Python, AI"
result = extractor.extract(Resume, config, SourceText(text_content=text))
```
### Async Extraction
Use the async API for better performance with multiple files or when integrating with async applications.
`queue_extraction` enqueues the extraction jobs and returns immediately. Alternatively, you
can use `aextract`, which polls the job until it finishes and returns the extraction results.
```python
import asyncio
async def extract_resumes():
# Async extraction
result = await extractor.aextract(Resume, config, "resume.pdf")
print(result.data)
# Queue extraction jobs (returns immediately)
jobs = await extractor.queue_extraction(
Resume, config, ["resume1.pdf", "resume2.pdf"]
)
print(f"Queued {len(jobs)} extraction jobs")
return jobs
# Run async function
jobs = asyncio.run(extract_resumes())
# Check job status
for job in jobs:
status = agent.get_extraction_job(job.id).status
print(f"Job {job.id}: {status}")
# Get results when complete
results = [agent.get_extraction_run_for_job(job.id) for job in jobs]
```
## Core Concepts
- **Data Schema**: Structure definition for the data you want to extract in the form of a JSON schema or a Pydantic model.
- **Extraction Config**: Settings that control how extraction is performed (e.g., speed vs accuracy trade-offs).
- **Extraction Jobs**: Asynchronous extraction tasks that can be monitored.
- **Extraction Agents** (Advanced): Reusable extractors configured with a specific schema and extraction settings.
## Defining Schemas
Schemas define the structure of data you want to extract. You can use either Pydantic models or JSON Schema:
### Using Pydantic (Recommended)
```python
from pydantic import BaseModel, Field
from typing import List, Optional
from llama_cloud import ExtractConfig, ExtractMode
class Experience(BaseModel):
company: str = Field(description="Company name")
title: str = Field(description="Job title")
start_date: Optional[str] = Field(description="Start date of employment")
end_date: Optional[str] = Field(description="End date of employment")
class Resume(BaseModel):
name: str = Field(description="Candidate name")
experience: List[Experience] = Field(description="Work history")
# Use the schema for extraction
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
result = extractor.extract(Resume, config, "resume.pdf")
```
### Using JSON Schema
```python
schema = {
"type": "object",
"properties": {
"name": {"type": "string", "description": "Candidate name"},
"experience": {
"type": "array",
"description": "Work history",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string",
"description": "Company name",
},
"title": {"type": "string", "description": "Job title"},
"start_date": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"description": "Start date of employment",
},
"end_date": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"description": "End date of employment",
},
},
},
},
},
}
# Use the schema for extraction
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
result = extractor.extract(schema, config, "resume.pdf")
```
### Important restrictions on JSON/Pydantic Schema
_LlamaExtract only supports a subset of the JSON Schema specification._ While limited, it should
be sufficient for a wide variety of use-cases.
- All fields are required by default. Nullable fields must be explicitly marked as such,
using `anyOf` with a `null` type. See `"start_date"` field above.
- Root node must be of type `object`.
- Schema nesting must be limited to within 5 levels.
- The important fields are key names/titles, type and description. Fields for
formatting, default values, etc. are **not supported**. If you need these, you can add the
restrictions to your field description and/or use a post-processing step. For example, default values can be supported by making a field optional and then replacing `null` values in the extraction result with the default value.
- There are other restrictions on number of keys, size of the schema, etc. that you may
hit for complex extraction use cases. In such cases, it is worth thinking how to restructure
your extraction workflow to fit within these constraints, e.g. by extracting subset of fields
and later merging them together.
## Extraction Configuration
Configure how extraction is performed using `ExtractConfig`. The schema is the most important part, but several configuration options can significantly impact the extraction process.
```python
from llama_cloud import ExtractConfig, ExtractMode, ChunkMode, ExtractTarget
# Basic configuration
config = ExtractConfig(
extraction_mode=ExtractMode.BALANCED, # FAST, BALANCED, MULTIMODAL, PREMIUM
extraction_target=ExtractTarget.PER_DOC, # PER_DOC, PER_PAGE
system_prompt="Focus on the most recent data",
page_range="1-5,10-15", # Extract from specific pages
)
# Advanced configuration
advanced_config = ExtractConfig(
extraction_mode=ExtractMode.MULTIMODAL,
chunk_mode=ChunkMode.PAGE, # PAGE, SECTION
high_resolution_mode=True, # Better OCR accuracy
invalidate_cache=False, # Set to True to bypass cached results
cite_sources=True, # Enable source citations
use_reasoning=True, # Enable reasoning (not in FAST mode)
confidence_scores=True, # MULTIMODAL/PREMIUM only
)
```
### Key Configuration Options
**Extraction Mode**: Controls processing quality and speed
- `FAST`: Fastest processing, suitable for simple documents with no OCR
- `BALANCED`: Good speed/accuracy tradeoff for text-rich documents
- `MULTIMODAL`: For visually rich documents with text, tables, and images (recommended)
- `PREMIUM`: Highest accuracy with OCR, complex table/header detection
**Extraction Target**: Defines extraction scope
- `PER_DOC`: Apply schema to entire document (default)
- `PER_PAGE`: Apply schema to each page, returns array of results
**Advanced Options**:
- `system_prompt`: Additional system-level instructions
- `page_range`: Specific pages to extract (e.g., "1,3,5-7,9")
- `chunk_mode`: Document splitting strategy (`PAGE` or `SECTION`)
- `high_resolution_mode`: Better OCR for small text (slower processing)
**Extensions** (return additional metadata):
- `cite_sources`: Source tracing for extracted fields
- `use_reasoning`: Explanations for extraction decisions
- `confidence_scores`: Quantitative confidence measures (MULTIMODAL/PREMIUM only)
For complete configuration options, advanced settings, and detailed examples, see the [LlamaExtract Configuration Documentation](https://docs.cloud.llamaindex.ai/llamaextract/features/options).
## Extraction Agents (Advanced)
For reusable extraction workflows, you can create extraction agents that encapsulate both schema and configuration:
### Creating Agents
```python
from llama_cloud_services import LlamaExtract
from llama_cloud import ExtractConfig, ExtractMode
from pydantic import BaseModel, Field
# Initialize client
extractor = LlamaExtract()
# Define schema
class Resume(BaseModel):
name: str = Field(description="Full name of candidate")
email: str = Field(description="Email address")
skills: list[str] = Field(description="Technical skills and technologies")
# Configure extraction settings
config = ExtractConfig(extraction_mode=ExtractMode.FAST)
# Create extraction agent
agent = extractor.create_agent(
name="resume-parser", data_schema=Resume, config=config
)
# Use the agent
result = agent.extract("resume.pdf")
print(result.data)
```
### Agent Batch Processing
Process multiple files with an agent:
```python
# Queue multiple files for extraction
jobs = await agent.queue_extraction(["resume1.pdf", "resume2.pdf"])
# Check job status
for job in jobs:
status = agent.get_extraction_job(job.id).status
print(f"Job {job.id}: {status}")
# Get results when complete
results = [agent.get_extraction_run_for_job(job.id) for job in jobs]
```
### Updating Agent Schemas
Schemas can be modified and updated after creation:
```python
# Update schema
agent.data_schema = new_schema
# Save changes
agent.save()
```
### Managing Agents
```python
# List all agents
agents = extractor.list_agents()
# Get specific agent
agent = extractor.get_agent(name="resume-parser")
# Delete agent
extractor.delete_agent(agent.id)
```
### When to Use Agents vs Direct Extraction
**Use Direct Extraction When:**
- One-off extractions
- Different schemas for different documents
- Simple workflows
- Getting started quickly
**Use Extraction Agents When:**
- Repeated extractions with the same schema
- Team collaboration (shared, named extractors)
- Complex workflows requiring state management
- Production systems with consistent extraction patterns
## Installation
```bash
pip install llama-cloud-services
```
## Tips & Best Practices
At the core of LlamaExtract is the schema, which defines the structure of the data you want to extract from your documents.
1. **Schema Design**:
- Try to limit schema nesting to 3-4 levels.
- Make fields optional when data might not always be present. Having required fields may force the model
to hallucinate when these fields are not present in the documents.
- When you want to extract a variable number of entities, use an `array` type. However, note that you cannot use
an `array` type for the root node.
- Use descriptive field names and detailed descriptions. Use descriptions to pass formatting
instructions or few-shot examples.
- Above all, start simple and iteratively build your schema to incorporate requirements.
2. **Running Extractions**:
- Note that assigning a new `agent.data_schema` does not save the schema to the database
until you call `agent.save()`, but the new schema is used for running extractions in the meantime.
- Check extraction results for any errors. Error information is available in the `result.error` field for debugging.
- Consider async operations (`aextract` or `queue_extraction`) for large-scale extraction or when processing multiple files.
- For repeated extractions with the same schema, consider creating an extraction agent to avoid redefining the schema each time.
### Hitting "The response was too long to be processed" Error
This implies that the extraction response is hitting the output token limits of the LLM. In such cases, it is worth rethinking the design of your schema to enable a more efficient and scalable extraction. For example:
- Instead of one field that extracts a complex object, you can use multiple fields to distribute the extraction logic.
- You can also use multiple schemas to extract different subsets of fields from the same document and merge them later.
Another option (orthogonal to the above) is to break the document into smaller sections and extract from each section individually, when possible. LlamaExtract will in most cases be able to handle both document and schema chunking automatically, but there are cases where you may need to do this manually.
## Additional Resources
- [Extract Documentation](https://docs.cloud.llamaindex.ai/llamaextract/getting_started) - Details on Extract features, API and examples.
- [Example Notebook](examples/extract/resume_screening.ipynb) - Detailed walkthrough of resume parsing
- [Example Application with TypeScript](./examples-ts/extract/) - End-to-end examples using LlamaExtract TypeScript client.
- [Discord Community](https://discord.com/invite/eN6D2HQ4aX) - Get help and share feedback
-86
View File
@@ -1,86 +0,0 @@
# LlamaCloud Index + Retriever
LlamaCloud is a new generation of managed parsing, ingestion, and retrieval services, designed to bring production-grade context-augmentation to your LLM and RAG applications.
Currently, LlamaCloud supports
- Managed Ingestion API, handling parsing and document management
- Managed Retrieval API, configuring optimal retrieval for your RAG system
## Access
We are opening up a private beta to a limited set of enterprise partners for the managed ingestion and retrieval API. If you're interested in centralizing your data pipelines and spending more time working on your actual RAG use cases, come [talk to us.](https://www.llamaindex.ai/contact)
If you have access to LlamaCloud, you can visit [LlamaCloud](https://cloud.llamaindex.ai) to sign in and get an API key.
## Setup
First, make sure you have the latest LlamaIndex version installed.
```
pip uninstall llama-index # run this if upgrading from v0.9.x or older
pip install -U llama-index --upgrade --no-cache-dir --force-reinstall
```
The `llama-index-indices-managed-llama-cloud` package is included with the above install, but you can also install it directly:
```
pip install -U llama-index-indices-managed-llama-cloud
```
## Usage
You can create an index on LlamaCloud using the following code. By default, new indexes use managed embeddings (OpenAI text-embedding-3-small, 1536 dimensions, 1 credit/page):
```python
import os
os.environ[
"LLAMA_CLOUD_API_KEY"
] = "llx-..." # can provide API-key in env or in the constructor later on
from llama_index.core import SimpleDirectoryReader
from llama_cloud_services import LlamaCloudIndex
# create a new index (uses managed embeddings by default)
index = LlamaCloudIndex.from_documents(
documents,
"my_first_index",
project_name="default",
api_key="llx-...",
verbose=True,
)
# connect to an existing index
index = LlamaCloudIndex("my_first_index", project_name="default")
```
You can also configure a retriever for managed retrieval:
```python
# from the existing index
index.as_retriever()
# from scratch
from llama_index.indices.managed.llama_cloud import LlamaCloudRetriever
retriever = LlamaCloudRetriever("my_first_index", project_name="default")
```
And of course, you can use other index shortcuts to get use out of your new managed index:
```python
query_engine = index.as_query_engine(llm=llm)
chat_engine = index.as_chat_engine(llm=llm)
```
## Retriever Settings
A full list of retriever settings/kwargs is below:
- `dense_similarity_top_k`: Optional[int] -- If greater than 0, retrieve `k` nodes using dense retrieval
- `sparse_similarity_top_k`: Optional[int] -- If greater than 0, retrieve `k` nodes using sparse retrieval
- `enable_reranking`: Optional[bool] -- Whether to enable reranking or not. Sacrifices some speed for accuracy
- `rerank_top_n`: Optional[int] -- The number of nodes to return after reranking initial retrieval results
- `alpha`: Optional[float] -- The weighting between dense and sparse retrieval. 1 = Full dense retrieval, 0 = Full sparse retrieval.
-163
View File
@@ -1,163 +0,0 @@
# LlamaParse
LlamaParse is a **GenAI-native document parser** that can parse complex document data for any downstream LLM use case (RAG, agents).
It is really good at the following:
- **Broad file type support**: Parsing a variety of unstructured file types (.pdf, .pptx, .docx, .xlsx, .html) with text, tables, visual elements, weird layouts, and more.
- **Table recognition**: Parsing embedded tables accurately into text and semi-structured representations.
- **Multimodal parsing and chunking**: Extracting visual elements (images/diagrams) into structured formats and returning image chunks using the latest multimodal models.
- **Custom parsing**: Input custom prompt instructions to customize the output the way you want it.
LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index).
The free plan is up to 1000 pages a day. The paid plan includes 7k free pages per week, plus 0.3c per additional page by default. There is a sandbox available to test the API [**https://cloud.llamaindex.ai/parse ↗**](https://cloud.llamaindex.ai/parse).
Read below for some quickstart information, or see the [full documentation](https://docs.cloud.llamaindex.ai/).
If you're a company interested in enterprise RAG solutions, and/or high volume/on-prem usage of LlamaParse, come [talk to us](https://www.llamaindex.ai/contact).
## Getting Started
First, login and get an api-key from [**https://cloud.llamaindex.ai/api-key ↗**](https://cloud.llamaindex.ai/api-key).
Then, install the package:
`pip install llama-cloud-services`
## CLI Usage
Now you can parse your first PDF file using the command line interface. Use the command `llama-parse [file_paths]`. See the help text with `llama-parse --help`.
```bash
export LLAMA_CLOUD_API_KEY='llx-...'
# output as text
llama-parse my_file.pdf --result-type text --output-file output.txt
# output as markdown
llama-parse my_file.pdf --result-type markdown --output-file output.md
# output as raw json
llama-parse my_file.pdf --output-raw-json --output-file output.json
```
## Python Usage
You can also create simple scripts:
```python
from llama_cloud_services import LlamaParse
parser = LlamaParse(
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
num_workers=4, # if multiple files passed, split in `num_workers` API calls
verbose=True,
language="en", # Optionally you can define a language, default=en
)
# sync
result = parser.parse("./my_file.pdf")
# sync batch
results = parser.parse(["./my_file1.pdf", "./my_file2.pdf"])
# async
result = await parser.aparse("./my_file.pdf")
# async batch
results = await parser.aparse(["./my_file1.pdf", "./my_file2.pdf"])
```
The result object is a fully typed `JobResult` object, and you can interact with it to parse and transform various parts of the result:
```python
# get the llama-index markdown documents
markdown_documents = result.get_markdown_documents(split_by_page=True)
# get the llama-index text documents
text_documents = result.get_text_documents(split_by_page=False)
# get the image documents
image_documents = result.get_image_documents(
include_screenshot_images=True,
include_object_images=False,
# Optional: download the images to a directory
# (default is to return the image bytes in ImageDocument objects)
image_download_dir="./images",
)
# access the raw job result
# Items will vary based on the parser configuration
for page in result.pages:
print(page.text)
print(page.md)
print(page.images)
print(page.layout)
print(page.structuredData)
```
See more details about the result object in the [example notebook](./examples/parse/demo_json_tour.ipynb).
### Using with file object / bytes
You can parse a file object directly:
```python
from llama_cloud_services import LlamaParse
parser = LlamaParse(
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
num_workers=4, # if multiple files passed, split in `num_workers` API calls
verbose=True,
language="en", # Optionally you can define a language, default=en
)
file_name = "my_file1.pdf"
extra_info = {"file_name": file_name}
with open(f"./{file_name}", "rb") as f:
# must provide extra_info with file_name key when passing file object
result = parser.parse(f, extra_info=extra_info)
# you can also pass file bytes directly
with open(f"./{file_name}", "rb") as f:
file_bytes = f.read()
# must provide extra_info with file_name key when passing file bytes
result = parser.parse(file_bytes, extra_info=extra_info)
```
### Using with `SimpleDirectoryReader`
You can also integrate the parser as the default PDF loader in `SimpleDirectoryReader`:
```python
from llama_cloud_services import LlamaParse
from llama_index.core import SimpleDirectoryReader
parser = LlamaParse(
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
result_type="markdown", # "markdown" and "text" are available
verbose=True,
)
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
"./data", file_extractor=file_extractor
).load_data()
```
Full documentation for `SimpleDirectoryReader` can be found on the [LlamaIndex Documentation](https://developers.llamaindex.ai/python/framework/module_guides/loading/simpledirectoryreader/).
## Examples
Several end-to-end indexing examples can be found in the examples folder
- [Getting Started](examples/parse/demo_basic.ipynb)
- [Advanced RAG Example](examples/parse/demo_advanced.ipynb)
- [Raw API Usage](examples/parse/demo_api.ipynb)
- [Result Object Tour](examples/parse/demo_json_tour.ipynb)
## Documentation
[https://docs.cloud.llamaindex.ai/](https://docs.cloud.llamaindex.ai/)
+25
View File
@@ -1,5 +1,30 @@
# llama-cloud-services-py
## 0.6.94
### Patch Changes
- 232c55b: Include xlsx files in extract input
## 0.6.93
### Patch Changes
- da1916c: Add more warnings
## 0.6.92
### Patch Changes
- 2358df1: add deprecation notices
## 0.6.91
### Patch Changes
- 07ec282: Bump up patch versions for python packages
- 3040951: Use error description in ExtractedData invalid extraction error
## 0.6.90
### Patch Changes
+10
View File
@@ -4,6 +4,16 @@
# Llama Cloud Services
> **⚠️ DEPRECATION NOTICE**
>
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
>
> **Please migrate to the new packages:**
> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
>
> The new packages provide the same functionality with improved performance, better support, and active development.
This repository contains the code for hand-written SDKs and clients for interacting with LlamaCloud.
This includes:
+12
View File
@@ -1,3 +1,5 @@
import warnings
from llama_cloud_services.parse import LlamaParse
from llama_cloud_services.extract import LlamaExtract, ExtractionAgent
from llama_cloud_services.utils import SourceText, FileInput
@@ -8,6 +10,16 @@ from llama_cloud_services.index import (
LlamaCloudRetriever,
)
# Emit deprecation warning once when package is imported
warnings.warn(
"This package (llama-cloud-services) is deprecated and will be maintained until May 1, 2026. "
"Please migrate to the new package: pip install llama-cloud>=1.0 "
"(https://github.com/run-llama/llama-cloud-py). "
"The new package provides the same functionality with improved performance and support.",
DeprecationWarning,
stacklevel=2,
)
__all__ = [
"LlamaParse",
"LlamaExtract",
@@ -475,26 +475,49 @@ class ExtractedData(BaseModel, Generic[ExtractedT]):
},
)
except ValidationError as e:
# Capture the job-level error from the extraction run if available
job_error = result.error
invalid_item = ExtractedData[Dict[str, Any]].create(
data=result.data or {},
status="error",
field_metadata=field_metadata,
metadata={"extraction_error": str(e), **(metadata or {})},
metadata={
"extraction_error": str(e),
**({"job_error": job_error} if job_error else {}),
**(metadata or {}),
},
file_id=file_id,
file_name=file_name,
file_hash=file_hash,
)
raise InvalidExtractionData(invalid_item) from e
raise InvalidExtractionData(invalid_item, extraction_error=job_error) from e
class InvalidExtractionData(Exception):
"""
Exception raised when the extracted data does not conform to the schema.
Attributes:
invalid_item: The ExtractedData instance containing the invalid data and metadata
extraction_error: The error message from the extraction job, if available
"""
def __init__(self, invalid_item: ExtractedData[Dict[str, Any]]):
def __init__(
self,
invalid_item: ExtractedData[Dict[str, Any]],
extraction_error: Optional[str] = None,
):
self.invalid_item = invalid_item
super().__init__("Not able to parse the extracted data, parsed invalid format")
self.extraction_error = extraction_error
# Build an informative error message
if extraction_error:
message = f"Extraction error: {extraction_error}"
else:
message = "Not able to parse the extracted data, parsed invalid format"
super().__init__(message)
def calculate_overall_confidence(
+97 -95
View File
@@ -4,10 +4,11 @@ import os
import time
from io import BufferedIOBase, TextIOWrapper
from pathlib import Path
from typing import List, Optional, Type, Union, Coroutine, Any, TypeVar
from typing import Callable, List, Optional, Type, Union, Coroutine, Any, TypeVar
import warnings
import httpx
from pydantic import BaseModel
from functools import wraps
from tenacity import (
retry_if_exception,
stop_after_attempt,
@@ -54,7 +55,7 @@ DEFAULT_EXTRACT_CONFIG = ExtractConfig(
def _is_retryable_error(exception: BaseException) -> bool:
"""Check if an exception is retryable."""
if isinstance(exception, ApiError):
return exception.status_code in (502, 503, 504, 425, 408)
return exception.status_code in (429, 500, 502, 503, 504, 425, 408)
elif isinstance(
exception, (httpx.HTTPStatusError, httpx.RequestError, httpx.TimeoutException)
):
@@ -62,6 +63,33 @@ def _is_retryable_error(exception: BaseException) -> bool:
return False
def _async_retry(
    max_attempts: int = 5,
    initial_wait: float = 1,
    max_wait: float = 30,
    jitter: float = 3,
) -> Callable:
    """Build a decorator that retries an async callable on retryable errors.

    A failed call is retried only when the raised exception is accepted by
    ``_is_retryable_error``.

    Args:
        max_attempts: Passed to ``stop_after_attempt``.
        initial_wait: ``initial`` argument of ``wait_exponential_jitter``.
        max_wait: ``max`` argument of ``wait_exponential_jitter``.
        jitter: ``jitter`` argument of ``wait_exponential_jitter``.

    Returns:
        A decorator; the decorated coroutine function keeps the metadata of
        the function it wraps (via ``functools.wraps``).
    """

    def decorate(func: Callable) -> Callable:
        @wraps(func)
        async def retrying_call(*args: Any, **kwargs: Any) -> Any:
            # A fresh policy is built for every call of the wrapped function.
            backoff = wait_exponential_jitter(
                initial=initial_wait, max=max_wait, jitter=jitter
            )
            policy = AsyncRetrying(
                retry=retry_if_exception(_is_retryable_error),
                stop=stop_after_attempt(max_attempts),
                wait=backoff,
                reraise=True,
            )
            async for attempt in policy:
                with attempt:
                    return await func(*args, **kwargs)

        return retrying_call

    return decorate
async def _validate_schema(
client: AsyncLlamaCloud, data_schema: SchemaInput
) -> JSONObjectType:
@@ -82,50 +110,6 @@ async def _validate_schema(
return validated_schema.data_schema
async def _get_job_with_retry(
    client: AsyncLlamaCloud,
    job_id: str,
    max_attempts: int = 5,
    initial_wait: float = 1,
    max_wait: float = 60,
    jitter: float = 5,
) -> ExtractJob:
    """Fetch the extraction job ``job_id``, retrying on retryable errors.

    Only exceptions accepted by ``_is_retryable_error`` trigger another
    attempt; the wait between attempts is produced by
    ``wait_exponential_jitter(initial_wait, max_wait, jitter)`` and the
    number of attempts is bounded by ``max_attempts``.
    """
    backoff = wait_exponential_jitter(
        initial=initial_wait, max=max_wait, jitter=jitter
    )
    policy = AsyncRetrying(
        retry=retry_if_exception(_is_retryable_error),
        stop=stop_after_attempt(max_attempts),
        wait=backoff,
        reraise=True,
    )
    async for attempt in policy:
        with attempt:
            return await client.llama_extract.get_job(job_id=job_id)
async def _get_run_with_retry(
    client: AsyncLlamaCloud,
    job_id: str,
    project_id: Optional[str] = None,
    organization_id: Optional[str] = None,
    max_attempts: int = 3,
    initial_wait: float = 1,
    max_wait: float = 20,
    jitter: float = 3,
) -> ExtractRun:
    """Fetch the extraction run for ``job_id``, retrying on retryable errors.

    ``project_id`` and ``organization_id`` are forwarded unchanged to
    ``get_run_by_job_id``. Only exceptions accepted by
    ``_is_retryable_error`` trigger another attempt, up to ``max_attempts``.
    """
    backoff = wait_exponential_jitter(
        initial=initial_wait, max=max_wait, jitter=jitter
    )
    policy = AsyncRetrying(
        retry=retry_if_exception(_is_retryable_error),
        stop=stop_after_attempt(max_attempts),
        wait=backoff,
        reraise=True,
    )
    async for attempt in policy:
        with attempt:
            run = await client.llama_extract.get_run_by_job_id(
                job_id=job_id,
                project_id=project_id,
                organization_id=organization_id,
            )
            return run
async def _wait_for_job_result(
client: AsyncLlamaCloud,
job_id: str,
@@ -142,30 +126,33 @@ async def _wait_for_job_result(
run_jitter: float = 3,
) -> Optional[ExtractRun]:
"""Wait for and return the results of an extraction job."""
@_async_retry(
max_attempts=job_retry_attempts, max_wait=job_max_wait, jitter=job_jitter
)
async def _get_job() -> ExtractJob:
return await client.llama_extract.get_job(job_id=job_id)
@_async_retry(
max_attempts=run_retry_attempts, max_wait=run_max_wait, jitter=run_jitter
)
async def _get_run() -> ExtractRun:
return await client.llama_extract.get_run_by_job_id(
job_id=job_id,
project_id=project_id,
organization_id=organization_id,
)
start = time.perf_counter()
poll_count = 0
while True:
await asyncio.sleep(check_interval)
poll_count += 1
job = await _get_job_with_retry(
client,
job_id,
max_attempts=job_retry_attempts,
max_wait=job_max_wait,
jitter=job_jitter,
)
job = await _get_job()
if job.status == StatusEnum.SUCCESS:
return await _get_run_with_retry(
client,
job_id,
project_id,
organization_id,
max_attempts=run_retry_attempts,
max_wait=run_max_wait,
jitter=run_jitter,
)
return await _get_run()
elif job.status == StatusEnum.PENDING:
end = time.perf_counter()
if end - start > max_timeout:
@@ -177,15 +164,7 @@ async def _wait_for_job_result(
warnings.warn(
f"Failure in job: {job_id}, status: {job.status}, error: {job.error}"
)
return await _get_run_with_retry(
client,
job_id,
project_id,
organization_id,
max_attempts=run_retry_attempts,
max_wait=run_max_wait,
jitter=run_jitter,
)
return await _get_run()
def run_in_thread(
@@ -498,9 +477,12 @@ class ExtractionAgent:
Args:
run_id (str): The ID of the extraction run to delete
"""
self._run_in_thread(
self._client.llama_extract.delete_extraction_run(run_id=run_id)
)
@_async_retry()
async def _delete() -> None:
return await self._client.llama_extract.delete_extraction_run(run_id=run_id)
self._run_in_thread(_delete())
def list_extraction_runs(
self, page: int = 0, limit: int = 100
@@ -510,13 +492,16 @@ class ExtractionAgent:
Returns:
PaginatedExtractRunsResponse: Paginated list of extraction runs
"""
return self._run_in_thread(
self._client.llama_extract.list_extract_runs(
@_async_retry()
async def _list() -> PaginatedExtractRunsResponse:
return await self._client.llama_extract.list_extract_runs(
extraction_agent_id=self.id,
skip=page * limit,
limit=limit,
)
)
return self._run_in_thread(_list())
def __repr__(self) -> str:
return f"ExtractionAgent(id={self.id}, name={self.name})"
@@ -658,15 +643,17 @@ class LlamaExtract(BaseComponent):
"data_schema must be either a dictionary or a Pydantic model"
)
agent = self._run_in_thread(
self._async_client.llama_extract.create_extraction_agent(
@_async_retry()
async def _create() -> CloudExtractAgent:
return await self._async_client.llama_extract.create_extraction_agent(
project_id=self._project_id,
organization_id=self._organization_id,
name=name,
data_schema=data_schema,
config=config,
)
)
agent = self._run_in_thread(_create())
return ExtractionAgent(
client=self._async_client,
@@ -702,19 +689,27 @@ class LlamaExtract(BaseComponent):
)
if id:
agent = self._run_in_thread(
self._async_client.llama_extract.get_extraction_agent(
@_async_retry()
async def _get_by_id() -> CloudExtractAgent:
return await self._async_client.llama_extract.get_extraction_agent(
extraction_agent_id=id,
)
)
agent = self._run_in_thread(_get_by_id())
elif name:
agent = self._run_in_thread(
self._async_client.llama_extract.get_extraction_agent_by_name(
name=name,
project_id=self._project_id,
@_async_retry()
async def _get_by_name() -> CloudExtractAgent:
return (
await self._async_client.llama_extract.get_extraction_agent_by_name(
name=name,
project_id=self._project_id,
)
)
)
agent = self._run_in_thread(_get_by_name())
else:
raise ValueError("Either name or extraction_agent_id must be provided.")
@@ -734,11 +729,14 @@ class LlamaExtract(BaseComponent):
def list_agents(self) -> List[ExtractionAgent]:
"""List all available extraction agents."""
agents = self._run_in_thread(
self._async_client.llama_extract.list_extraction_agents(
@_async_retry()
async def _list() -> List[CloudExtractAgent]:
return await self._async_client.llama_extract.list_extraction_agents(
project_id=self._project_id,
)
)
agents = self._run_in_thread(_list())
return [
ExtractionAgent(
@@ -763,11 +761,14 @@ class LlamaExtract(BaseComponent):
Args:
agent_id (str): ID of the extraction agent to delete
"""
self._run_in_thread(
self._async_client.llama_extract.delete_extraction_agent(
extraction_agent_id=agent_id
@_async_retry()
async def _delete() -> None:
return await self._async_client.llama_extract.delete_extraction_agent(
extraction_agent_id=agent_id,
)
)
self._run_in_thread(_delete())
async def _wait_for_job_result(self, job_id: str) -> Optional[ExtractRun]:
"""Wait for and return the results of an extraction job."""
@@ -805,6 +806,7 @@ class LlamaExtract(BaseComponent):
# Document files
".pdf": "application/pdf",
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
# Image files
".png": "image/png",
".jpg": "image/jpeg",
+4 -6
View File
@@ -3,7 +3,7 @@ from typing import BinaryIO
import os
from pathlib import Path
from llama_cloud.client import AsyncLlamaCloud
from llama_cloud.types import File, FileCreate
from llama_cloud.types import File
from typing import Optional
from llama_cloud_services.utils import SourceText, FileInput
@@ -73,11 +73,9 @@ class FileClient:
presigned_url = await self.client.files.generate_presigned_url(
project_id=self.project_id,
organization_id=self.organization_id,
request=FileCreate(
name=name,
external_file_id=external_file_id,
file_size=file_size,
),
name=name,
external_file_id=external_file_id,
file_size=file_size,
)
httpx_client = self.client._client_wrapper.httpx_client
upload_response = await httpx_client.put(
+19 -10
View File
@@ -21,7 +21,6 @@ from llama_cloud import (
PipelineCreateTransformConfig,
PipelineFileCreateCustomMetadataValue,
PipelineType,
ProjectCreate,
ManagedIngestionStatus,
CloudDocumentCreate,
CloudDocument,
@@ -507,14 +506,19 @@ class LlamaCloudIndex(BaseManagedIndex):
client = get_client(api_key, base_url, app_url, timeout)
if project_id is None:
# create project if it doesn't exist
project = client.projects.upsert_project(
# get project by name
projects = client.projects.list_projects(
organization_id=organization_id,
request=ProjectCreate(name=project_name),
project_name=project_name,
)
if not projects:
raise ValueError(
f"Project '{project_name}' not found. Please create it first in the LlamaCloud UI."
)
project = projects[0]
project_id = project.id
if verbose:
print(f"Created project {project_id} with name {project_name}")
print(f"Found project {project_id} with name {project_name}")
# create pipeline
pipeline_create = PipelineCreate(
@@ -563,15 +567,20 @@ class LlamaCloudIndex(BaseManagedIndex):
app_url = app_url or os.environ.get("LLAMA_CLOUD_APP_URL", DEFAULT_APP_URL)
aclient = get_aclient(api_key, base_url, app_url, timeout)
# create project if it doesn't exist
project = await aclient.projects.upsert_project(
organization_id=organization_id, request=ProjectCreate(name=project_name)
# get project by name
projects = await aclient.projects.list_projects(
organization_id=organization_id, project_name=project_name
)
if not projects:
raise ValueError(
f"Project '{project_name}' not found. Please create it first in the LlamaCloud UI."
)
project = projects[0]
if project.id is None:
raise ValueError(f"Failed to create/get project {project_name}")
raise ValueError(f"Failed to get project {project_name}")
if verbose:
print(f"Created project {project.id} with name {project.name}")
print(f"Found project {project.id} with name {project.name}")
# create pipeline
pipeline_create = PipelineCreate(
+32
View File
@@ -1,5 +1,37 @@
# llama_parse
## 0.6.94
### Patch Changes
- 232c55b: Include xlsx files in extract input
- Updated dependencies [232c55b]
- llama-cloud-services-py@0.6.94
## 0.6.93
### Patch Changes
- da1916c: Add more warnings
- Updated dependencies [da1916c]
- llama-cloud-services-py@0.6.93
## 0.6.92
### Patch Changes
- Updated dependencies [2358df1]
- llama-cloud-services-py@0.6.92
## 0.6.91
### Patch Changes
- 07ec282: Bump up patch versions for python packages
- Updated dependencies [07ec282]
- Updated dependencies [3040951]
- llama-cloud-services-py@0.6.91
## 0.6.90
### Patch Changes
+10
View File
@@ -1,5 +1,15 @@
# LlamaParse
> **⚠️ DEPRECATION NOTICE**
>
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
>
> **Please migrate to the new packages:**
> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
>
> The new packages provide the same functionality with improved performance, better support, and active development.
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-parse)](https://pypi.org/project/llama-parse/)
[![GitHub contributors](https://img.shields.io/github/contributors/run-llama/llama_parse)](https://github.com/run-llama/llama_parse/graphs/contributors)
[![Discord](https://img.shields.io/discord/1059199217496772688)](https://discord.gg/dGcwcsnxhU)
+11 -1
View File
@@ -1,8 +1,18 @@
from llama_cloud_services.parse import (
import warnings
from llama_cloud_services.parse import ( # type: ignore[attr-defined]
LlamaParse,
ResultType,
ParsingMode,
FailedPageMode,
)
warnings.warn(
"The 'llama-parse' package is deprecated and will no longer receive updates. "
"Please migrate to the new unified SDK. "
"See https://developers.llamaindex.ai/python/cloud/llamaparse/getting_started/ "
"and https://github.com/run-llama/llama-cloud-py/blob/main/README.md for migration instructions.",
DeprecationWarning,
stacklevel=2,
)
__all__ = ["LlamaParse", "ResultType", "ParsingMode", "FailedPageMode"]
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "llama_parse",
"version": "0.6.90",
"version": "0.6.94",
"description": "",
"main": "index.js",
"private": false,
+2 -2
View File
@@ -11,13 +11,13 @@ dev = [
[project]
name = "llama-parse"
version = "0.6.90"
version = "0.6.94"
description = "Parse files into RAG-Optimized formats."
authors = [{name = "Logan Markewich", email = "logan@llamaindex.ai"}]
requires-python = ">=3.9,<4.0"
readme = "README.md"
license = "MIT"
dependencies = ["llama-cloud-services>=0.6.90"]
dependencies = ["llama-cloud-services>=0.6.94"]
[project.scripts]
llama-parse = "llama_parse.cli.main:parse"
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "llama-cloud-services-py",
"version": "0.6.90",
"version": "0.6.94",
"private": false,
"license": "MIT",
"scripts": {},
+2 -2
View File
@@ -23,7 +23,7 @@ dev = [
[project]
name = "llama-cloud-services"
version = "0.6.90"
version = "0.6.94"
description = "Tailored SDK clients for LlamaCloud services."
authors = [{name = "Logan Markewich", email = "logan@runllama.ai"}]
requires-python = ">=3.9,<4.0"
@@ -31,7 +31,7 @@ readme = "README.md"
license = "MIT"
dependencies = [
"llama-index-core>=0.12.0",
"llama-cloud==0.1.45",
"llama-cloud==0.1.46",
"pydantic>=2.8,!=2.10",
"click>=8.1.7,<9",
"python-dotenv>=1.0.1,<2",
+33 -40
View File
@@ -1,47 +1,40 @@
import os
from typing import List
from llama_cloud_services.extract import LlamaExtract
from typing import Any, Dict, Optional, Union
# Global storage for agents to cleanup
_TEST_AGENTS_TO_CLEANUP: List[str] = []
from llama_cloud.core.api_error import ApiError
from llama_cloud.types import ExtractConfig
from pydantic import BaseModel
from tenacity import (
retry,
retry_if_exception,
stop_after_attempt,
wait_exponential,
)
from llama_cloud_services.extract import ExtractionAgent, LlamaExtract
def _is_rate_limit_error(exception: BaseException) -> bool:
    """Return True only for an ``ApiError`` whose ``status_code`` is 429."""
    if not isinstance(exception, ApiError):
        return False
    return exception.status_code == 429
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=1, max=30),
    retry=retry_if_exception(_is_rate_limit_error),
    reraise=True,
)
def create_agent_with_retry(
    extractor: LlamaExtract,
    name: str,
    data_schema: Union[Dict[str, Any], type[BaseModel]],
    config: Optional[ExtractConfig] = None,
) -> ExtractionAgent:
    """Create an extraction agent, retrying when the API answers with a 429.

    Args:
        extractor: Client whose ``create_agent`` method is called.
        name: Name given to the new agent.
        data_schema: Schema for the agent, as a dict or a pydantic model class.
        config: Optional extraction configuration forwarded to ``create_agent``.

    Returns:
        The agent returned by ``extractor.create_agent``.
    """
    created = extractor.create_agent(
        name=name, data_schema=data_schema, config=config
    )
    return created
def pytest_configure(config):
    """Register the custom markers used by the extract tests."""
    marker_declarations = (
        "agent_name: custom agent name for test",
        "agent_schema: custom agent schema for test",
    )
    for declaration in marker_declarations:
        config.addinivalue_line("markers", declaration)
def pytest_sessionfinish(session, exitstatus):
    """Session-end hook: delete every agent registered for cleanup.

    A failure to delete one agent is printed as a warning and does not stop
    the remaining deletions. The registry is emptied once the loop finishes.
    """
    print(
        f"pytest_sessionfinish hook called! Agents to cleanup: {_TEST_AGENTS_TO_CLEANUP}"
    )
    if not _TEST_AGENTS_TO_CLEANUP:
        print("No agents to cleanup")
        return

    print("Creating cleanup client...")
    # A dedicated client is created here and used only for the deletions.
    extractor = LlamaExtract(
        api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
        base_url=os.getenv("LLAMA_CLOUD_BASE_URL"),
        project_id=os.getenv("LLAMA_CLOUD_PROJECT_ID"),
        verbose=True,
    )
    for agent_id in _TEST_AGENTS_TO_CLEANUP:
        try:
            print(f"Deleting agent {agent_id}...")
            extractor.delete_agent(agent_id)
            print(f"Cleaned up agent {agent_id}")
        except Exception as e:
            print(f"Warning: Failed to delete agent {agent_id}: {e}")
    _TEST_AGENTS_TO_CLEANUP.clear()
    print("Agent cleanup completed")
def register_agent_for_cleanup(agent_id: str) -> None:
    """Append ``agent_id`` to the module-level list of agents to clean up."""
    _TEST_AGENTS_TO_CLEANUP.append(agent_id)
+55 -37
View File
@@ -1,4 +1,6 @@
import os
import shutil
import uuid
import pytest
from pathlib import Path
from pydantic import BaseModel
@@ -6,7 +8,7 @@ from pydantic import BaseModel
from llama_cloud_services.extract import LlamaExtract, ExtractionAgent, SourceText
from llama_cloud.types import ExtractConfig, ExtractMode, ExtractRun
from tests.extract.util import load_test_dotenv
from .conftest import register_agent_for_cleanup
from .conftest import create_agent_with_retry
load_test_dotenv()
@@ -59,17 +61,27 @@ def test_schema_dict():
@pytest.fixture
def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
"""Creates a test agent and collects it for cleanup at the end of all tests"""
test_id = request.node.nodeid
test_hash = hex(hash(test_id))[-8:]
base_name = test_agent_name
def unique_test_pdf(tmp_path):
    """Copy the test PDF into ``tmp_path`` under a unique filename.

    The filename embeds the first 8 hex characters of a fresh UUID so that
    parallel tests do not collide on file deduplication: external_file_id is
    unique whether the backend receives the full path or only the filename.

    Returns:
        Path of the copied PDF.
    """
    token = uuid.uuid4().hex[:8]
    destination = tmp_path / f"{TEST_PDF.stem}-{token}{TEST_PDF.suffix}"
    shutil.copy2(TEST_PDF, destination)
    return destination
@pytest.fixture
def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
"""Creates a test agent with a unique name and cleans it up after the test."""
unique_id = uuid.uuid4().hex[:8]
base_name = next(
(marker.args[0] for marker in request.node.iter_markers("agent_name")),
base_name,
test_agent_name,
)
name = f"{base_name}_{test_hash}"
name = f"{base_name}_{unique_id}"
schema = next(
(
@@ -79,21 +91,20 @@ def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
test_schema_dict,
)
# Cleanup existing agent
try:
for agent in llama_extract.list_agents():
if agent.name == name:
llama_extract.delete_agent(agent.id)
except Exception as e:
print(f"Warning: Failed to cleanup existing agent: {e}")
agent = llama_extract.create_agent(name=name, data_schema=schema)
# Add agent to cleanup list via conftest helper
register_agent_for_cleanup(agent.id)
# Use config with cache invalidation to ensure fresh results in tests
config = ExtractConfig(invalidate_cache=True)
agent = create_agent_with_retry(
llama_extract, name=name, data_schema=schema, config=config
)
yield agent
# Inline cleanup -- each worker cleans up its own agents
try:
llama_extract.delete_agent(agent.id)
except Exception as e:
print(f"Warning: Failed to cleanup agent {agent.id}: {e}")
class TestLlamaExtract:
def test_init_without_api_key(self):
@@ -134,34 +145,38 @@ class TestLlamaExtract:
class TestExtractionAgent:
@pytest.mark.asyncio
async def test_extract_single_file(self, test_agent):
result = await test_agent.aextract(TEST_PDF)
async def test_extract_single_file(self, test_agent, unique_test_pdf):
result = await test_agent.aextract(unique_test_pdf)
assert result.status == "SUCCESS"
assert result.data is not None
assert isinstance(result.data, dict)
assert "title" in result.data
assert "summary" in result.data
def test_sync_extract_single_file(self, test_agent):
result = test_agent.extract(TEST_PDF)
def test_sync_extract_single_file(self, test_agent, unique_test_pdf):
result = test_agent.extract(unique_test_pdf)
assert result.status == "SUCCESS"
assert result.data is not None
assert isinstance(result.data, dict)
assert "title" in result.data
assert "summary" in result.data
def test_extract_file_from_buffered_io(self, test_agent):
result = test_agent.extract(SourceText(file=open(TEST_PDF, "rb")))
def test_extract_file_from_buffered_io(self, test_agent, unique_test_pdf):
result = test_agent.extract(
SourceText(file=open(unique_test_pdf, "rb"), filename=unique_test_pdf.name)
)
assert result.status == "SUCCESS"
assert result.data is not None
assert isinstance(result.data, dict)
assert "title" in result.data
assert "summary" in result.data
def test_extract_file_from_bytes(self, test_agent):
with open(TEST_PDF, "rb") as f:
def test_extract_file_from_bytes(self, test_agent, unique_test_pdf):
with open(unique_test_pdf, "rb") as f:
file_bytes = f.read()
result = test_agent.extract(SourceText(file=file_bytes, filename=TEST_PDF.name))
result = test_agent.extract(
SourceText(file=file_bytes, filename=unique_test_pdf.name)
)
assert result.status == "SUCCESS"
assert result.data is not None
assert isinstance(result.data, dict)
@@ -177,7 +192,10 @@ class TestExtractionAgent:
weight for 8 to 13 km (58 miles).[3] The name llama (also historically spelled
"glama") was adopted by European settlers from native Peruvians.
"""
result = test_agent.extract(SourceText(text_content=TEST_TEXT))
unique_name = f"text-{uuid.uuid4().hex[:8]}.txt"
result = test_agent.extract(
SourceText(text_content=TEST_TEXT, filename=unique_name)
)
assert result.status == "SUCCESS"
assert result.data is not None
assert isinstance(result.data, dict)
@@ -185,8 +203,8 @@ class TestExtractionAgent:
assert "summary" in result.data
@pytest.mark.asyncio
async def test_extract_multiple_files(self, test_agent):
files = [TEST_PDF, TEST_PDF] # Using same file twice for testing
async def test_extract_multiple_files(self, test_agent, unique_test_pdf):
files = [unique_test_pdf, unique_test_pdf] # Using same file twice for testing
response = await test_agent.aextract(files)
assert len(response) == 2
@@ -215,15 +233,15 @@ class TestExtractionAgent:
updated_agent = llama_extract.get_agent(name=test_agent.name)
assert "new_field" in updated_agent.data_schema["properties"]
def test_list_extraction_runs(self, test_agent: ExtractionAgent):
def test_list_extraction_runs(self, test_agent: ExtractionAgent, unique_test_pdf):
assert test_agent.list_extraction_runs().total == 0
test_agent.extract(TEST_PDF)
test_agent.extract(unique_test_pdf)
runs = test_agent.list_extraction_runs()
assert runs.total > 0
def test_delete_extraction_run(self, test_agent: ExtractionAgent):
def test_delete_extraction_run(self, test_agent: ExtractionAgent, unique_test_pdf):
assert test_agent.list_extraction_runs().total == 0
run: ExtractRun = test_agent.extract(TEST_PDF)
run: ExtractRun = test_agent.extract(unique_test_pdf)
test_agent.delete_extraction_run(run.id)
runs = test_agent.list_extraction_runs()
assert runs.total == 0
@@ -237,7 +255,7 @@ class TestStatelessExtraction:
@pytest.fixture
def test_config(self):
return ExtractConfig(extraction_mode=ExtractMode.FAST)
return ExtractConfig(extraction_mode=ExtractMode.FAST, invalidate_cache=True)
@pytest.fixture
def test_schema_dict(self):
+28 -21
View File
@@ -1,14 +1,16 @@
import os
from pathlib import Path
import pytest
from llama_cloud_services.extract import LlamaExtract, ExtractionAgent
from llama_cloud_services.utils import SourceText
from collections import namedtuple
import json
import uuid
from llama_cloud.types import ExtractConfig, ExtractMode
from deepdiff import DeepDiff
from tests.extract.util import json_subset_match_score, load_test_dotenv
from .conftest import register_agent_for_cleanup
from .conftest import create_agent_with_retry
load_test_dotenv()
@@ -56,10 +58,16 @@ def get_test_cases():
input_files.append(file_path)
settings = [
ExtractConfig(extraction_mode=ExtractMode.FAST),
ExtractConfig(extraction_mode=ExtractMode.BALANCED),
ExtractConfig(extraction_mode=ExtractMode.MULTIMODAL),
ExtractConfig(extraction_mode=ExtractMode.PREMIUM),
ExtractConfig(extraction_mode=ExtractMode.FAST, invalidate_cache=True),
ExtractConfig(extraction_mode=ExtractMode.BALANCED, invalidate_cache=True),
ExtractConfig(
extraction_mode=ExtractMode.MULTIMODAL, invalidate_cache=True
),
ExtractConfig(
extraction_mode=ExtractMode.PREMIUM,
invalidate_cache=True,
parse_model="anthropic-sonnet-4.5",
),
]
for input_file in sorted(input_files):
@@ -101,30 +109,24 @@ def extractor():
@pytest.fixture
def extraction_agent(test_case: ExtractionTestCase, extractor: LlamaExtract):
"""Fixture to create and cleanup extraction agent for each test."""
# Create unique name with random UUID (important for CI to avoid conflicts)
unique_id = uuid.uuid4().hex[:8]
agent_name = f"{test_case.name}_{unique_id}"
with open(test_case.schema_path, "r") as f:
schema = json.load(f)
# Clean up any existing agents with this name
try:
agents = extractor.list_agents()
for agent in agents:
if agent.name == agent_name:
extractor.delete_agent(agent.id)
except Exception as e:
print(f"Warning: Failed to cleanup existing agent: {str(e)}")
# Create new agent
agent = extractor.create_agent(agent_name, schema, config=test_case.config)
# Register agent for cleanup at the end of the test session
register_agent_for_cleanup(agent.id)
agent = create_agent_with_retry(
extractor, name=agent_name, data_schema=schema, config=test_case.config
)
yield agent
# Inline cleanup -- each worker cleans up its own agents
try:
extractor.delete_agent(agent.id)
except Exception as e:
print(f"Warning: Failed to cleanup agent {agent.id}: {e}")
@pytest.mark.skipif(
os.environ.get("LLAMA_CLOUD_API_KEY", "") == "",
@@ -134,7 +136,12 @@ def extraction_agent(test_case: ExtractionTestCase, extractor: LlamaExtract):
def test_extraction(
test_case: ExtractionTestCase, extraction_agent: ExtractionAgent
) -> None:
result = extraction_agent.extract(test_case.input_file).data # type: ignore
# Use a unique external_file_id per upload to avoid cross-test collisions.
input_path = Path(test_case.input_file)
unique_filename = f"{input_path.stem}-{uuid.uuid4().hex}{input_path.suffix}"
result = extraction_agent.extract(
SourceText(file=str(input_path), filename=unique_filename)
).data # type: ignore
with open(test_case.expected_output, "r") as f:
expected = json.load(f)
# TODO: fix the saas_slide test
+8 -5
View File
@@ -8,7 +8,6 @@ from llama_cloud import (
AutoTransformConfig,
PipelineCreate,
PipelineFileCreate,
ProjectCreate,
CompositeRetrievalMode,
LlamaParseParameters,
ReRankConfig,
@@ -60,11 +59,15 @@ def local_figures_file() -> str:
def _setup_index_with_file(
client: LlamaCloud, index_name: str, remote_file: Tuple[str, str]
) -> LlamaCloudIndex:
# create project if it doesn't exist
project_create = ProjectCreate(name=project_name)
project = client.projects.upsert_project(
organization_id=organization_id, request=project_create
# get project by name
projects = client.projects.list_projects(
organization_id=organization_id, project_name=project_name
)
if not projects:
raise ValueError(
f"Project '{project_name}' not found. Please create it first in the LlamaCloud UI."
)
project = projects[0]
# create pipeline
pipeline_create = PipelineCreate(
@@ -423,6 +423,7 @@ def create_extract_run(
},
data_schema: Dict[str, Any] = {},
file: File = create_file(),
error: Optional[str] = None,
) -> ExtractRun:
return ExtractRun.parse_obj(
{
@@ -439,6 +440,7 @@ def create_extract_run(
"status": "SUCCESS",
"project_id": str(uuid.uuid4()),
"from_ui": False,
"error": error,
}
)
@@ -544,6 +546,46 @@ def test_extracted_data_from_extraction_result_invalid_data():
assert invalid_data.field_metadata["name"].confidence == 0.9
assert invalid_data.overall_confidence == 0.9
# Verify default error message when no job error present
assert exc_info.value.extraction_error is None
assert "Not able to parse the extracted data" in str(exc_info.value)
def test_extracted_data_from_extraction_result_with_job_error():
    """A job-level error on the run must surface on the raised InvalidExtractionData.

    The run carries data that does not validate against ``Person`` *and* a
    job-level ``error`` string. The test checks that:

    * the exception exposes the job error via ``extraction_error``,
    * the exception message is exactly ``"Extraction error: <job error>"``,
    * the attached invalid item keeps the job error, the validation error and
      the caller-supplied metadata side by side.
    """
    job_error_message = "Failed to process document: unsupported file format"

    # Payload that cannot validate: "age" is not numeric and "name" is absent.
    bad_payload = {
        "missing_name": "Valid Name",
        "age": "not_a_number",
    }
    field_confidences = {
        "name": {"confidence": 0.9},
    }
    source_file = create_file(id="error-file", name="bad_data.pdf")

    # Run that has both invalid data and a job-level error attached.
    failed_run = create_extract_run(
        data=bad_payload,
        extraction_metadata=field_confidences,
        data_schema={},
        file=source_file,
        error=job_error_message,
    )

    caller_metadata = {"test": "metadata"}
    with pytest.raises(InvalidExtractionData) as raised:
        ExtractedData.from_extraction_result(
            failed_run, Person, metadata=caller_metadata
        )

    # The job error is the headline of the exception.
    err = raised.value
    assert err.extraction_error == job_error_message
    assert str(err) == f"Extraction error: {job_error_message}"

    # The invalid item's metadata carries every piece of context.
    item_metadata = err.invalid_item.metadata
    assert item_metadata.get("job_error") == job_error_message
    assert "extraction_error" in item_metadata  # validation error still present
    assert "test" in item_metadata  # original metadata preserved
class Dimensions(BaseModel):
length: Optional[str] = Field(
+17 -16
View File
@@ -34,9 +34,10 @@ TEST_PIPELINE = Pipeline(
def mock_client() -> MagicMock:
"""Mock client with sensible defaults."""
client = MagicMock()
client.projects.upsert_project.return_value = Project(
default_project = Project(
id="default-proj", name=DEFAULT_PROJECT_NAME, organization_id="default-org"
)
client.projects.list_projects.return_value = [default_project]
client.pipelines.upsert_pipeline.return_value = Pipeline(
id="default-pipe",
name="default",
@@ -100,8 +101,8 @@ def test_from_documents_uses_provided_project_id(mock_client: MagicMock) -> None
project_id=provided_project_id,
)
# Assert - project upsert not called; pipeline uses provided project_id
mock_client.projects.upsert_project.assert_not_called()
# Assert - project list not called (project_id provided); pipeline uses provided project_id
mock_client.projects.list_projects.assert_not_called()
assert mock_client.pipelines.upsert_pipeline.call_count == 1
assert (
mock_client.pipelines.upsert_pipeline.call_args.kwargs["project_id"]
@@ -110,29 +111,29 @@ def test_from_documents_uses_provided_project_id(mock_client: MagicMock) -> None
assert index.project.id == provided_project_id
def test_from_documents_upserts_project_when_project_id_missing(
def test_from_documents_lists_project_when_project_id_missing(
mock_client: MagicMock,
) -> None:
organization_id = "org-xyz"
index_name = "my_new_index"
# Project is created when project_id is not provided
upserted_project = Project(
# Project is found by name when project_id is not provided
found_project = Project(
id="proj-999", name=DEFAULT_PROJECT_NAME, organization_id=organization_id
)
mock_client.projects.upsert_project.return_value = upserted_project
mock_client.projects.list_projects.return_value = [found_project]
test_pipeline = Pipeline(
id="pipe-xyz",
name=index_name,
project_id=upserted_project.id,
project_id=found_project.id,
embedding_config=EMBEDDING_CONFIG,
)
with patch.object(
base,
"resolve_project_and_pipeline",
return_value=(upserted_project, test_pipeline),
return_value=(found_project, test_pipeline),
):
docs = [Document(text="world")]
index = LlamaCloudIndex.from_documents(
@@ -141,15 +142,15 @@ def test_from_documents_upserts_project_when_project_id_missing(
organization_id=organization_id,
)
# Assert - project was upserted with org id and default project name
mock_client.projects.upsert_project.assert_called_once()
kwargs = mock_client.projects.upsert_project.call_args.kwargs
# Assert - project was listed with org id and default project name
mock_client.projects.list_projects.assert_called_once()
kwargs = mock_client.projects.list_projects.call_args.kwargs
assert kwargs["organization_id"] == organization_id
assert kwargs["request"].name == DEFAULT_PROJECT_NAME
assert kwargs["project_name"] == DEFAULT_PROJECT_NAME
# Pipeline created under the upserted project id
# Pipeline created under the found project id
assert (
mock_client.pipelines.upsert_pipeline.call_args.kwargs["project_id"]
== upserted_project.id
== found_project.id
)
assert index.project.id == upserted_project.id
assert index.project.id == found_project.id
Generated
+5 -5
View File
@@ -1595,21 +1595,21 @@ wheels = [
[[package]]
name = "llama-cloud"
version = "0.1.45"
version = "0.1.46"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "httpx" },
{ name = "pydantic" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e0/b7/3a2a209f1c3fa516de172cb13e03f5a897adea5523f2ee0f544d035e3704/llama_cloud-0.1.45.tar.gz", hash = "sha256:140244008cc5710e31ae97c6043973a3a9969a51b0f38155fa33a8434078e8aa", size = 140968, upload-time = "2025-12-03T02:22:49.484Z" }
sdist = { url = "https://files.pythonhosted.org/packages/40/f3/f4d6520f8d546e6c5a02f6ebeed5c09774a074b8d2c24ad559ace97a56a6/llama_cloud-0.1.46.tar.gz", hash = "sha256:e86f8791c053590d70cc59e0fc13ce72f9b681a8e658bc61df86d0285288d8ee", size = 127752, upload-time = "2026-01-21T18:40:57.103Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/62/1d/466b0df69b81ce9410ad6ec7229a1e6601ff69640f02f246e06cfcc7428c/llama_cloud-0.1.45-py3-none-any.whl", hash = "sha256:500299a6d3f25f97bcf6755d6338523023564fa8f376955c2cf299bbc9561cc2", size = 397184, upload-time = "2025-12-03T02:22:48.335Z" },
{ url = "https://files.pythonhosted.org/packages/c4/3a/6caaea28c8c804add33c91d356ed7d5a5412d6c9598e1450af95a15e0bcd/llama_cloud-0.1.46-py3-none-any.whl", hash = "sha256:6c6546c09c04a038c86d84d42f00eae8fd3bff49991ad3aab844bd866ecdf352", size = 361989, upload-time = "2026-01-21T18:40:54.863Z" },
]
[[package]]
name = "llama-cloud-services"
version = "0.6.88"
version = "0.6.90"
source = { editable = "." }
dependencies = [
{ name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
@@ -1649,7 +1649,7 @@ dev = [
requires-dist = [
{ name = "click", specifier = ">=8.1.7,<9" },
{ name = "eval-type-backport", marker = "python_full_version < '3.10'", specifier = ">=0.2.0,<0.3" },
{ name = "llama-cloud", specifier = "==0.1.45" },
{ name = "llama-cloud", specifier = "==0.1.46" },
{ name = "llama-index-core", specifier = ">=0.12.0" },
{ name = "packaging", specifier = ">=23.0" },
{ name = "platformdirs", specifier = ">=4.3.7,<5" },
+6
View File
@@ -1,5 +1,11 @@
# llama-cloud-services
## 0.5.4
### Patch Changes
- 2358df1: add deprecation notices
## 0.5.3
### Patch Changes
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "llama-cloud-services",
"version": "0.5.3",
"version": "0.5.4",
"type": "module",
"license": "MIT",
"scripts": {
+10
View File
@@ -1,3 +1,13 @@
// Emit a deprecation warning when the package entry point is imported.
// Top-level module code is evaluated once per module instance, so the warning
// is not repeated for every importing file.
// The guard keeps the import from throwing in runtimes where `console` is
// missing or `console.warn` is not callable.
// The message uses the npm package name (llama-cloud-services) so that
// JavaScript/TypeScript users can match it against their package.json.
if (typeof console !== "undefined" && typeof console.warn === "function") {
  console.warn(
    "⚠️ DEPRECATION WARNING: This package (llama-cloud-services) is deprecated and will be maintained until May 1, 2026. " +
      "Please migrate to the new package: npm install @llamaindex/llama-cloud " +
      "(https://github.com/run-llama/llama-cloud-ts). " +
      "The new package provides the same functionality with improved performance and support.",
  );
}
export { LLamaCloudFileService } from "./LLamaCloudFileService.js";
export { LlamaCloudIndex } from "./LlamaCloudIndex.js";
export {