Compare commits
97 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f385e96ab8 | |||
| c3e4696b5f | |||
| 1e40c9cf94 | |||
| 802bc2a9f8 | |||
| 5ea758b853 | |||
| 208b6f2fa5 | |||
| e1b9143f79 | |||
| 232c55bd6a | |||
| ab6f2f8da5 | |||
| 66c2639ec8 | |||
| da1916c69f | |||
| 345e272573 | |||
| d70fbac1ce | |||
| 2358df10c6 | |||
| 829628cc86 | |||
| 42b7bbd1ae | |||
| 38da9a52d7 | |||
| 1e7ec40ee7 | |||
| dd83c1a9d0 | |||
| 7cb83f5cd3 | |||
| b05266be6d | |||
| eab4798165 | |||
| b174fa8fab | |||
| b12ffef916 | |||
| 07ec282257 | |||
| 013b689812 | |||
| 3040951cb8 | |||
| 9239498945 | |||
| 19cbb25631 | |||
| 812e2f7d72 | |||
| d7864afe3f | |||
| ade8d027a5 | |||
| 997bcc8531 | |||
| 8be554c234 | |||
| f777cab0c5 | |||
| b9b83c953d | |||
| 3ec7024626 | |||
| d5b18a03fa | |||
| 18dd04b6de | |||
| 685a5e6ccc | |||
| 576c3d9076 | |||
| c8321d2bc5 | |||
| 131bbed7aa | |||
| 41c8ac2348 | |||
| 32c53cdf96 | |||
| 71db318fc2 | |||
| dac0f79e51 | |||
| 32487763d5 | |||
| 06c3c556e6 | |||
| e5dcaa83df | |||
| 1b7198dc62 | |||
| 9cfe074206 | |||
| ae30990ada | |||
| 8f1c359abc | |||
| 0a110de9c7 | |||
| d705b16923 | |||
| ca781132c8 | |||
| 7a68b0fb68 | |||
| 87dec5433d | |||
| 99f4eba8d0 | |||
| 54561e2dd2 | |||
| bfaec79a8f | |||
| 3e0e522a6b | |||
| f70b6d87ec | |||
| 693b5b83b1 | |||
| ad38ef5cd7 | |||
| 4c4c6e6575 | |||
| 740b47d9dc | |||
| f3233deb2e | |||
| fd45127678 | |||
| 0506c88735 | |||
| 4bc9eb6c0d | |||
| 5a3dac655c | |||
| 519254efbe | |||
| 6ab56b79f3 | |||
| e020e3e2b1 | |||
| f293547910 | |||
| 662bc37462 | |||
| 9f1ef4ef1f | |||
| 1243573924 | |||
| 407292b177 | |||
| a7df7c0912 | |||
| c758144bfe | |||
| fee516dd19 | |||
| 032fbd5768 | |||
| 970e864514 | |||
| d0649ece6e | |||
| 5d4cabd843 | |||
| 9070a6ac16 | |||
| 4f24f537f6 | |||
| 8859a203e2 | |||
| b091364054 | |||
| 43b1a013ca | |||
| f81532e7f2 | |||
| 986d3987d3 | |||
| 1bf522311f | |||
| 24166dcfc8 |
@@ -27,7 +27,7 @@ jobs:
|
||||
- uses: actions/checkout@v5
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
|
||||
@@ -30,12 +30,12 @@ jobs:
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v3
|
||||
uses: github/codeql-action/init@v4
|
||||
with:
|
||||
languages: python
|
||||
dependency-caching: true
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v3
|
||||
uses: github/codeql-action/analyze@v4
|
||||
with:
|
||||
category: "/language:python"
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
name: Extract E2E Tests (every 4 hours)
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 */4 * * *"
|
||||
workflow_dispatch:
|
||||
# Allows manual triggering
|
||||
inputs:
|
||||
environment:
|
||||
description: "Environment to run the tests in"
|
||||
required: false
|
||||
default: staging
|
||||
type: choice
|
||||
options:
|
||||
- staging
|
||||
- production
|
||||
notify_slack:
|
||||
description: "Notify Slack"
|
||||
required: false
|
||||
default: false
|
||||
type: boolean
|
||||
workflow_call:
|
||||
|
||||
env:
|
||||
UV_VERSION: "0.7.20"
|
||||
PYTHON_VERSION: "3.12"
|
||||
SLACK_CHANNEL_ID: C078PHNTF44 # Extract channel ID
|
||||
API_E2E_LOG_PATH: ${{ github.workspace }}/extract-e2e.log
|
||||
|
||||
jobs:
|
||||
extract-e2e:
|
||||
name: "Extract E2E Tests (${{ matrix.environment }})"
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.environment }}
|
||||
cancel-in-progress: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
environment: ${{ github.event_name == 'schedule' && fromJson('["staging", "production"]') || fromJson(format('["{0}"]', github.event.inputs.environment || 'staging')) }}
|
||||
steps:
|
||||
- name: Set runtime inputs
|
||||
id: runtime
|
||||
run: |
|
||||
environment=${{ matrix.environment }}
|
||||
notify_slack=${{ github.event.inputs.notify_slack || github.event_name == 'schedule' }}
|
||||
echo "environment=${environment}" >> $GITHUB_OUTPUT
|
||||
echo "notify_slack=${notify_slack}" >> $GITHUB_OUTPUT
|
||||
|
||||
if [ "${environment}" = "production" ]; then
|
||||
echo "LLAMA_CLOUD_BASE_URL=https://api.cloud.llamaindex.ai" >> $GITHUB_ENV
|
||||
api_key_secret="${{ secrets.LLAMA_CLOUD_API_KEY }}"
|
||||
project_id_secret="${{ secrets.LLAMA_CLOUD_PROJECT_ID }}"
|
||||
else
|
||||
echo "LLAMA_CLOUD_BASE_URL=https://api.staging.llamaindex.ai" >> $GITHUB_ENV
|
||||
api_key_secret="${{ secrets.LLAMA_CLOUD_API_KEY_STAGING }}"
|
||||
project_id_secret="${{ secrets.LLAMA_CLOUD_PROJECT_ID_STAGING }}"
|
||||
fi
|
||||
|
||||
if [ -n "$api_key_secret" ]; then
|
||||
echo "LLAMA_CLOUD_API_KEY=$api_key_secret" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
if [ -n "$project_id_secret" ]; then
|
||||
echo "LLAMA_CLOUD_PROJECT_ID=$project_id_secret" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v5
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install ${{ env.PYTHON_VERSION }} && uv python pin ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Run Extract E2E tests
|
||||
id: extract-tests
|
||||
continue-on-error: true
|
||||
working-directory: py
|
||||
run: |
|
||||
set -o pipefail
|
||||
rm -f "$API_E2E_LOG_PATH"
|
||||
uv run pytest -v -n 8 --timeout=300 --session-timeout=1740 tests/extract/ 2>&1 | tee "$API_E2E_LOG_PATH"
|
||||
|
||||
- name: Extract pytest failure summary
|
||||
id: failed-tests
|
||||
if: steps.extract-tests.outcome == 'failure' || cancelled()
|
||||
run: |
|
||||
summary="$(python3 - <<'PY'
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
log_path = Path(os.environ["API_E2E_LOG_PATH"])
|
||||
if not log_path.exists():
|
||||
print("Test log not found.")
|
||||
raise SystemExit(0)
|
||||
|
||||
lines = log_path.read_text(errors="ignore").splitlines()
|
||||
|
||||
# Find the "short test summary info" section
|
||||
start = None
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("=") and "short test summary info" in line:
|
||||
start = i + 1
|
||||
break
|
||||
|
||||
if start is None:
|
||||
print("No test summary found.")
|
||||
raise SystemExit(0)
|
||||
|
||||
# Extract just the FAILED/ERROR lines (test name + short reason)
|
||||
failed_tests = []
|
||||
for line in lines[start:]:
|
||||
if line.startswith("="):
|
||||
break # End of section
|
||||
if line.startswith("FAILED ") or line.startswith("ERROR "):
|
||||
# Extract test name and truncate the error message
|
||||
match = re.match(r"(FAILED|ERROR) ([\w/:.\[\]_-]+)", line)
|
||||
if match:
|
||||
failed_tests.append(f"{match.group(1)}: {match.group(2)}")
|
||||
|
||||
if failed_tests:
|
||||
print("\n".join(failed_tests[:20])) # Limit to 20 tests max
|
||||
else:
|
||||
print("No failed tests found in summary.")
|
||||
PY
|
||||
)"
|
||||
if [ -z "$summary" ]; then
|
||||
summary="Failed test summary not available. Review the full run logs."
|
||||
fi
|
||||
{
|
||||
printf 'summary<<EOF\n%s\nEOF\n' "$summary"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Check test results
|
||||
if: always()
|
||||
run: |
|
||||
if [ "${{ steps.extract-tests.outcome }}" == "failure" ]; then
|
||||
echo "Extract E2E tests failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Post to Extract Slack channel
|
||||
id: slack
|
||||
if: (failure() || cancelled()) && steps.runtime.outputs.notify_slack == 'true'
|
||||
uses: slackapi/slack-github-action@v2.1.1
|
||||
with:
|
||||
channel-id: ${{ env.SLACK_CHANNEL_ID }}
|
||||
slack-message: |
|
||||
:red_circle: *Extract E2E Failed* (${{ steps.runtime.outputs.environment }})
|
||||
```
|
||||
${{ steps.failed-tests.outputs.summary }}
|
||||
```
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Run>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
@@ -22,7 +22,7 @@ jobs:
|
||||
with:
|
||||
fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ env:
|
||||
jobs:
|
||||
test_e2e:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
@@ -22,7 +23,7 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install dependencies
|
||||
run: pnpm install
|
||||
|
||||
@@ -22,7 +22,7 @@ repos:
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix, --exit-non-zero-on-fix]
|
||||
exclude: ".*uv.lock"
|
||||
exclude: ".*uv.lock|examples/"
|
||||
- repo: https://github.com/psf/black-pre-commit-mirror
|
||||
rev: 23.10.1
|
||||
hooks:
|
||||
@@ -34,7 +34,7 @@ repos:
|
||||
rev: v1.0.1
|
||||
hooks:
|
||||
- id: mypy
|
||||
exclude: ^py/tests|^py/unit_tests
|
||||
exclude: ^py/tests|^py/unit_tests|^examples
|
||||
additional_dependencies:
|
||||
[
|
||||
"types-requests",
|
||||
|
||||
@@ -4,77 +4,12 @@
|
||||
|
||||
# Llama Cloud Services
|
||||
|
||||
This repository contains the code for hand-written SDKs and clients for interacting with LlamaCloud.
|
||||
|
||||
This includes:
|
||||
|
||||
- [LlamaParse](./parse.md) - A GenAI-native document parser that can parse complex document data for any downstream LLM use case (Agents, RAG, data processing, etc.).
|
||||
- [LlamaExtract](./extract.md) - A prebuilt agentic data extractor that can be used to transform data into a structured JSON representation.
|
||||
- [LlamaCloud Index](./index.md) - A widely customizable and fully automated document ingestion pipeline that also serves retrieval purposes.
|
||||
|
||||
## Getting Started
|
||||
|
||||
Install the package:
|
||||
|
||||
```bash
|
||||
pip install llama-cloud-services
|
||||
```
|
||||
|
||||
Then, get your API key from [LlamaCloud](https://cloud.llamaindex.ai/).
|
||||
|
||||
Then, you can use the services in your code:
|
||||
|
||||
```python
|
||||
from llama_cloud_services import (
|
||||
LlamaParse,
|
||||
LlamaExtract,
|
||||
LlamaCloudIndex,
|
||||
)
|
||||
|
||||
parser = LlamaParse(api_key="YOUR_API_KEY")
|
||||
extract = LlamaExtract(api_key="YOUR_API_KEY")
|
||||
index = LlamaCloudIndex(
|
||||
"my_first_index", project_name="default", api_key="YOUR_API_KEY"
|
||||
)
|
||||
```
|
||||
|
||||
See the quickstart guides for each service for more information:
|
||||
|
||||
- [LlamaParse](./parse.md)
|
||||
- [LlamaExtract](./extract.md)
|
||||
- [LlamaCloud Index](./index.md)
|
||||
|
||||
## Switch to EU SaaS 🇪🇺
|
||||
|
||||
If you are interested in using LlamaCloud services in the EU, you can adjust your base URL to `https://api.cloud.eu.llamaindex.ai`.
|
||||
|
||||
You can also create your API key in the EU region [here](https://cloud.eu.llamaindex.ai).
|
||||
|
||||
```python
|
||||
from llama_cloud_services import (
|
||||
LlamaParse,
|
||||
LlamaExtract,
|
||||
EU_BASE_URL,
|
||||
)
|
||||
|
||||
parser = LlamaParse(api_key="YOUR_API_KEY", base_url=EU_BASE_URL)
|
||||
extract = LlamaExtract(api_key="YOUR_API_KEY", base_url=EU_BASE_URL)
|
||||
index = LlamaCloudIndex(
|
||||
"my_first_index",
|
||||
project_name="default",
|
||||
api_key="YOUR_API_KEY",
|
||||
base_url=EU_BASE_URL,
|
||||
)
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
You can see complete SDK and API documentation for each service on [our official docs](https://docs.cloud.llamaindex.ai/).
|
||||
|
||||
## Terms of Service
|
||||
|
||||
See the [Terms of Service Here](./TOS.pdf).
|
||||
|
||||
## Get in Touch (LlamaCloud)
|
||||
|
||||
You can get in touch with us by following our [contact link](https://www.llamaindex.ai/contact).
|
||||
> **⚠️ DEPRECATION NOTICE**
|
||||
>
|
||||
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
|
||||
>
|
||||
> **Please migrate to the new packages:**
|
||||
> - **Python**: `pip install "llama-cloud>=1.0"` ([GitHub](https://github.com/run-llama/llama-cloud-py))
|
||||
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
|
||||
>
|
||||
> The new packages provide the same functionality with improved performance, better support, and active development.
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
node_modules
|
||||
package-lock.json
|
||||
yarn.lock
|
||||
|
||||
.DS_Store
|
||||
.cache
|
||||
.env
|
||||
.vercel
|
||||
.output
|
||||
.nitro
|
||||
/build/
|
||||
/api/
|
||||
/server/build
|
||||
/public/build# Sentry Config File
|
||||
.env.sentry-build-plugin
|
||||
/test-results/
|
||||
/playwright-report/
|
||||
/blob-report/
|
||||
/playwright/.cache/
|
||||
.tanstack
|
||||
.vscode
|
||||
@@ -0,0 +1,4 @@
|
||||
**/build
|
||||
**/public
|
||||
pnpm-lock.yaml
|
||||
routeTree.gen.ts
|
||||
@@ -0,0 +1,88 @@
|
||||
# LlamaClassify Demo
|
||||
|
||||
A TypeScript demo application showcasing the power of **LlamaClassify** - an agentic document classification service from [LlamaCloud](https://cloud.llamaindex.ai). This demo allows you to classify financial documents into three types (Cash Flow Statement, Income Statement and Balance Sheet).
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Installation](#installation)
|
||||
- [Usage](#usage)
|
||||
- [Start the Demo](#start-the-demo)
|
||||
- [How It Works](#how-it-works)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [Common Issues](#common-issues)
|
||||
- [License](#license)
|
||||
- [Contributing](#contributing)
|
||||
|
||||
## Features
|
||||
|
||||
- 📄 **Document Classification**: Classify files based on well-defined rules that you can customize and play around with.
|
||||
- 🤖 **Reasoning-based Actionable Insights**: Get in-depth, reasoning-based insights on the document classification, accompanied by confidence scores.
|
||||
- 🎨 **Beautiful UI**: [DaisyUI](https://daisyui.com)-based interface powered by [TanStack](https://tanstack.com)
|
||||
- ⚡ **Fast Development**: Hot reload support with development mode
|
||||
- 🛠️ **TypeScript**: Full TypeScript support with strict type checking
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (version 22 or higher)
|
||||
- pnpm package manager
|
||||
- LlamaCloud API key
|
||||
|
||||
## Installation
|
||||
|
||||
1. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/run-llama/llama_cloud_services
|
||||
cd llama_cloud_services/examples-ts/classify/
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
3. Set up your environment variables:
|
||||
|
||||
```bash
|
||||
# Add your API key to your environment
|
||||
export LLAMA_CLOUD_API_KEY="your-llamacloud-api-key"
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Start the Demo
|
||||
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
|
||||
The application will be up and running on http://localhost:3000
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Document Input**: Enter the path to your document when prompted
|
||||
2. **Parsing**: LlamaClassify, based on the rules you can find [here](./src/utils/classifier.ts), processes the document and classifies it
|
||||
3. **Results**: The classification outcome, as well as the reasoning behind it and the confidence score, are displayed in the UI.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Module Resolution Errors**: Ensure you're using Node.js 22+ and have all dependencies installed
|
||||
2. **API Key Issues**: Verify your LlamaCloud API key is correctly set
|
||||
3. **File Path Errors**: Use absolute paths or ensure relative paths are correct from the project root
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see the [LICENSE](../../LICENSE) file for details.
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a feature branch
|
||||
3. Make your changes
|
||||
4. Run `npm run format` and `npm run lint`
|
||||
5. Submit a pull request
|
||||
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"name": "tanstack-start-example-basic",
|
||||
"private": true,
|
||||
"sideEffects": false,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite dev",
|
||||
"build": "vite build && tsc --noEmit",
|
||||
"start": "node .output/server/index.mjs"
|
||||
},
|
||||
"dependencies": {
|
||||
"@tanstack/react-router": "^1.133.22",
|
||||
"@tanstack/react-router-devtools": "^1.133.22",
|
||||
"@tanstack/react-start": "^1.133.22",
|
||||
"llama-cloud-services": "file:../../ts/llama_cloud_services",
|
||||
"react": "^19.0.0",
|
||||
"react-dom": "^19.0.0",
|
||||
"tailwind-merge": "^2.6.0",
|
||||
"zod": "^3.24.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@tailwindcss/postcss": "^4.1.15",
|
||||
"@types/node": "^22.5.4",
|
||||
"@types/react": "^19.0.8",
|
||||
"@types/react-dom": "^19.0.3",
|
||||
"@vitejs/plugin-react": "^4.6.0",
|
||||
"daisyui": "^5.3.7",
|
||||
"postcss": "^8.5.1",
|
||||
"tailwindcss": "^4.1.15",
|
||||
"typescript": "^5.7.2",
|
||||
"vite": "^7.1.7",
|
||||
"vite-tsconfig-paths": "^5.1.4"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
export default {
|
||||
plugins: {
|
||||
'@tailwindcss/postcss': {},
|
||||
},
|
||||
}
|
||||
|
After Width: | Height: | Size: 3.3 KiB |
|
After Width: | Height: | Size: 21 KiB |
|
After Width: | Height: | Size: 3.8 KiB |
|
After Width: | Height: | Size: 862 B |
|
After Width: | Height: | Size: 1.1 KiB |
|
After Width: | Height: | Size: 1.1 KiB |
|
After Width: | Height: | Size: 2.0 KiB |
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"name": "",
|
||||
"short_name": "",
|
||||
"icons": [
|
||||
{
|
||||
"src": "/android-chrome-192x192.png",
|
||||
"sizes": "192x192",
|
||||
"type": "image/png"
|
||||
},
|
||||
{
|
||||
"src": "/android-chrome-512x512.png",
|
||||
"sizes": "512x512",
|
||||
"type": "image/png"
|
||||
}
|
||||
],
|
||||
"theme_color": "#ffffff",
|
||||
"background_color": "#ffffff",
|
||||
"display": "standalone"
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
import {
|
||||
ErrorComponent,
|
||||
Link,
|
||||
rootRouteId,
|
||||
useMatch,
|
||||
useRouter,
|
||||
} from '@tanstack/react-router'
|
||||
import type { ErrorComponentProps } from '@tanstack/react-router'
|
||||
|
||||
/**
 * Error-boundary component: logs the error, renders the router's
 * `ErrorComponent` for it, and offers recovery actions.
 *
 * Actions rendered:
 * - "Try Again": calls `router.invalidate()`.
 * - "Home" (when the matched route is the root route) or "Go Back"
 *   (otherwise), the latter calling `window.history.back()`.
 *
 * @param error - The error passed in via `ErrorComponentProps`.
 */
export function DefaultCatchBoundary({ error }: ErrorComponentProps) {
|
||||
const router = useRouter()
|
||||
// True when the currently matched route's id equals `rootRouteId`;
// `strict: false` is passed so the match is not tied to a specific route.
const isRoot = useMatch({
|
||||
strict: false,
|
||||
select: (state) => state.id === rootRouteId,
|
||||
})
|
||||
|
||||
// Surface the error in the console in addition to rendering it below.
console.error('DefaultCatchBoundary Error:', error)
|
||||
|
||||
return (
|
||||
<div className="min-w-0 flex-1 p-4 flex flex-col items-center justify-center gap-6">
|
||||
<ErrorComponent error={error} />
|
||||
<div className="flex gap-2 items-center flex-wrap">
|
||||
<button
|
||||
onClick={() => {
|
||||
router.invalidate()
|
||||
}}
|
||||
className={`px-2 py-1 bg-gray-600 dark:bg-gray-700 rounded-sm text-white uppercase font-extrabold`}
|
||||
>
|
||||
Try Again
|
||||
</button>
|
||||
{isRoot ? (
|
||||
<Link
|
||||
to="/"
|
||||
className={`px-2 py-1 bg-gray-600 dark:bg-gray-700 rounded-sm text-white uppercase font-extrabold`}
|
||||
>
|
||||
Home
|
||||
</Link>
|
||||
) : (
|
||||
<Link
|
||||
to="/"
|
||||
className={`px-2 py-1 bg-gray-600 dark:bg-gray-700 rounded-sm text-white uppercase font-extrabold`}
|
||||
onClick={(e) => {
|
||||
e.preventDefault()
|
||||
window.history.back()
|
||||
}}
|
||||
>
|
||||
Go Back
|
||||
</Link>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
import { Link } from '@tanstack/react-router'
|
||||
|
||||
/**
 * Not-found view: shows `children` when provided, otherwise a default
 * "page does not exist" message, followed by two navigation actions:
 * a "Go back" button (calls `window.history.back()`) and a "Start Over"
 * link to "/".
 *
 * @param children - Optional custom content replacing the default message.
 */
export function NotFound({ children }: { children?: any }) {
|
||||
return (
|
||||
<div className="space-y-2 p-2">
|
||||
<div className="text-gray-600 dark:text-gray-400">
|
||||
{children || <p>The page you are looking for does not exist.</p>}
|
||||
</div>
|
||||
<p className="flex items-center gap-2 flex-wrap">
|
||||
<button
|
||||
onClick={() => window.history.back()}
|
||||
className="bg-emerald-500 text-white px-2 py-1 rounded-sm uppercase font-black text-sm"
|
||||
>
|
||||
Go back
|
||||
</button>
|
||||
<Link
|
||||
to="/"
|
||||
className="bg-cyan-600 text-white px-2 py-1 rounded-sm uppercase font-black text-sm"
|
||||
>
|
||||
Start Over
|
||||
</Link>
|
||||
</p>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,225 @@
|
||||
/* eslint-disable */
|
||||
|
||||
// @ts-nocheck
|
||||
|
||||
// noinspection JSUnusedGlobalSymbols
|
||||
|
||||
// This file was automatically generated by TanStack Router.
|
||||
// You should NOT make any changes in this file as it will be overwritten.
|
||||
// Additionally, you should also exclude this file from your linter and/or formatter to prevent it from being checked or modified.
|
||||
|
||||
import { Route as rootRouteImport } from './routes/__root'
|
||||
import { Route as UsersRouteImport } from './routes/users'
|
||||
import { Route as IndexRouteImport } from './routes/index'
|
||||
import { Route as UsersIndexRouteImport } from './routes/users.index'
|
||||
import { Route as PostsIndexRouteImport } from './routes/posts.index'
|
||||
import { Route as UsersUserIdRouteImport } from './routes/users.$userId'
|
||||
import { Route as PostsPostIdRouteImport } from './routes/posts.$postId'
|
||||
import { Route as ApiClassifyRouteImport } from './routes/api/classify'
|
||||
import { Route as PostsPostIdDeepRouteImport } from './routes/posts_.$postId.deep'
|
||||
|
||||
const UsersRoute = UsersRouteImport.update({
|
||||
id: '/users',
|
||||
path: '/users',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const IndexRoute = IndexRouteImport.update({
|
||||
id: '/',
|
||||
path: '/',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const UsersIndexRoute = UsersIndexRouteImport.update({
|
||||
id: '/',
|
||||
path: '/',
|
||||
getParentRoute: () => UsersRoute,
|
||||
} as any)
|
||||
const PostsIndexRoute = PostsIndexRouteImport.update({
|
||||
id: '/posts/',
|
||||
path: '/posts/',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const UsersUserIdRoute = UsersUserIdRouteImport.update({
|
||||
id: '/$userId',
|
||||
path: '/$userId',
|
||||
getParentRoute: () => UsersRoute,
|
||||
} as any)
|
||||
const PostsPostIdRoute = PostsPostIdRouteImport.update({
|
||||
id: '/posts/$postId',
|
||||
path: '/posts/$postId',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const ApiClassifyRoute = ApiClassifyRouteImport.update({
|
||||
id: '/api/classify',
|
||||
path: '/api/classify',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const PostsPostIdDeepRoute = PostsPostIdDeepRouteImport.update({
|
||||
id: '/posts_/$postId/deep',
|
||||
path: '/posts/$postId/deep',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
|
||||
export interface FileRoutesByFullPath {
|
||||
'/': typeof IndexRoute
|
||||
'/users': typeof UsersRouteWithChildren
|
||||
'/api/classify': typeof ApiClassifyRoute
|
||||
'/posts/$postId': typeof PostsPostIdRoute
|
||||
'/users/$userId': typeof UsersUserIdRoute
|
||||
'/posts': typeof PostsIndexRoute
|
||||
'/users/': typeof UsersIndexRoute
|
||||
'/posts/$postId/deep': typeof PostsPostIdDeepRoute
|
||||
}
|
||||
export interface FileRoutesByTo {
|
||||
'/': typeof IndexRoute
|
||||
'/api/classify': typeof ApiClassifyRoute
|
||||
'/posts/$postId': typeof PostsPostIdRoute
|
||||
'/users/$userId': typeof UsersUserIdRoute
|
||||
'/posts': typeof PostsIndexRoute
|
||||
'/users': typeof UsersIndexRoute
|
||||
'/posts/$postId/deep': typeof PostsPostIdDeepRoute
|
||||
}
|
||||
export interface FileRoutesById {
|
||||
__root__: typeof rootRouteImport
|
||||
'/': typeof IndexRoute
|
||||
'/users': typeof UsersRouteWithChildren
|
||||
'/api/classify': typeof ApiClassifyRoute
|
||||
'/posts/$postId': typeof PostsPostIdRoute
|
||||
'/users/$userId': typeof UsersUserIdRoute
|
||||
'/posts/': typeof PostsIndexRoute
|
||||
'/users/': typeof UsersIndexRoute
|
||||
'/posts_/$postId/deep': typeof PostsPostIdDeepRoute
|
||||
}
|
||||
export interface FileRouteTypes {
|
||||
fileRoutesByFullPath: FileRoutesByFullPath
|
||||
fullPaths:
|
||||
| '/'
|
||||
| '/users'
|
||||
| '/api/classify'
|
||||
| '/posts/$postId'
|
||||
| '/users/$userId'
|
||||
| '/posts'
|
||||
| '/users/'
|
||||
| '/posts/$postId/deep'
|
||||
fileRoutesByTo: FileRoutesByTo
|
||||
to:
|
||||
| '/'
|
||||
| '/api/classify'
|
||||
| '/posts/$postId'
|
||||
| '/users/$userId'
|
||||
| '/posts'
|
||||
| '/users'
|
||||
| '/posts/$postId/deep'
|
||||
id:
|
||||
| '__root__'
|
||||
| '/'
|
||||
| '/users'
|
||||
| '/api/classify'
|
||||
| '/posts/$postId'
|
||||
| '/users/$userId'
|
||||
| '/posts/'
|
||||
| '/users/'
|
||||
| '/posts_/$postId/deep'
|
||||
fileRoutesById: FileRoutesById
|
||||
}
|
||||
export interface RootRouteChildren {
|
||||
IndexRoute: typeof IndexRoute
|
||||
UsersRoute: typeof UsersRouteWithChildren
|
||||
ApiClassifyRoute: typeof ApiClassifyRoute
|
||||
PostsPostIdRoute: typeof PostsPostIdRoute
|
||||
PostsIndexRoute: typeof PostsIndexRoute
|
||||
PostsPostIdDeepRoute: typeof PostsPostIdDeepRoute
|
||||
}
|
||||
|
||||
declare module '@tanstack/react-router' {
|
||||
interface FileRoutesByPath {
|
||||
'/users': {
|
||||
id: '/users'
|
||||
path: '/users'
|
||||
fullPath: '/users'
|
||||
preLoaderRoute: typeof UsersRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/': {
|
||||
id: '/'
|
||||
path: '/'
|
||||
fullPath: '/'
|
||||
preLoaderRoute: typeof IndexRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/users/': {
|
||||
id: '/users/'
|
||||
path: '/'
|
||||
fullPath: '/users/'
|
||||
preLoaderRoute: typeof UsersIndexRouteImport
|
||||
parentRoute: typeof UsersRoute
|
||||
}
|
||||
'/posts/': {
|
||||
id: '/posts/'
|
||||
path: '/posts'
|
||||
fullPath: '/posts'
|
||||
preLoaderRoute: typeof PostsIndexRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/users/$userId': {
|
||||
id: '/users/$userId'
|
||||
path: '/$userId'
|
||||
fullPath: '/users/$userId'
|
||||
preLoaderRoute: typeof UsersUserIdRouteImport
|
||||
parentRoute: typeof UsersRoute
|
||||
}
|
||||
'/posts/$postId': {
|
||||
id: '/posts/$postId'
|
||||
path: '/posts/$postId'
|
||||
fullPath: '/posts/$postId'
|
||||
preLoaderRoute: typeof PostsPostIdRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/api/classify': {
|
||||
id: '/api/classify'
|
||||
path: '/api/classify'
|
||||
fullPath: '/api/classify'
|
||||
preLoaderRoute: typeof ApiClassifyRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/posts_/$postId/deep': {
|
||||
id: '/posts_/$postId/deep'
|
||||
path: '/posts/$postId/deep'
|
||||
fullPath: '/posts/$postId/deep'
|
||||
preLoaderRoute: typeof PostsPostIdDeepRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
interface UsersRouteChildren {
|
||||
UsersUserIdRoute: typeof UsersUserIdRoute
|
||||
UsersIndexRoute: typeof UsersIndexRoute
|
||||
}
|
||||
|
||||
const UsersRouteChildren: UsersRouteChildren = {
|
||||
UsersUserIdRoute: UsersUserIdRoute,
|
||||
UsersIndexRoute: UsersIndexRoute,
|
||||
}
|
||||
|
||||
const UsersRouteWithChildren = UsersRoute._addFileChildren(UsersRouteChildren)
|
||||
|
||||
const rootRouteChildren: RootRouteChildren = {
|
||||
IndexRoute: IndexRoute,
|
||||
UsersRoute: UsersRouteWithChildren,
|
||||
ApiClassifyRoute: ApiClassifyRoute,
|
||||
PostsPostIdRoute: PostsPostIdRoute,
|
||||
PostsIndexRoute: PostsIndexRoute,
|
||||
PostsPostIdDeepRoute: PostsPostIdDeepRoute,
|
||||
}
|
||||
export const routeTree = rootRouteImport
|
||||
._addFileChildren(rootRouteChildren)
|
||||
._addFileTypes<FileRouteTypes>()
|
||||
|
||||
import type { getRouter } from './router.tsx'
|
||||
import type { createStart } from '@tanstack/react-start'
|
||||
declare module '@tanstack/react-start' {
|
||||
interface Register {
|
||||
ssr: true
|
||||
router: Awaited<ReturnType<typeof getRouter>>
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
import { createRouter } from '@tanstack/react-router'
|
||||
import { routeTree } from './routeTree.gen'
|
||||
import { DefaultCatchBoundary } from './components/DefaultCatchBoundary'
|
||||
import { NotFound } from './components/NotFound'
|
||||
|
||||
/**
 * Creates and returns the application's router.
 *
 * The router is built from the generated `routeTree` and configured with
 * `'intent'` preloading, `DefaultCatchBoundary` as the default error
 * component, `NotFound` as the default not-found component, and scroll
 * restoration enabled.
 *
 * @returns The router instance produced by `createRouter`.
 */
export function getRouter() {
|
||||
const router = createRouter({
|
||||
routeTree,
|
||||
defaultPreload: 'intent',
|
||||
defaultErrorComponent: DefaultCatchBoundary,
|
||||
defaultNotFoundComponent: () => <NotFound />,
|
||||
scrollRestoration: true,
|
||||
})
|
||||
return router
|
||||
}
|
||||
@@ -0,0 +1,128 @@
|
||||
/// <reference types="vite/client" />
|
||||
import {
|
||||
HeadContent,
|
||||
Scripts,
|
||||
createRootRoute,
|
||||
} from '@tanstack/react-router'
|
||||
import * as React from 'react'
|
||||
import { DefaultCatchBoundary } from '~/components/DefaultCatchBoundary'
|
||||
import { NotFound } from '~/components/NotFound'
|
||||
import { seo } from '~/utils/seo'
|
||||
|
||||
export const Route = createRootRoute({
|
||||
head: () => ({
|
||||
meta: [
|
||||
{
|
||||
charSet: 'utf-8',
|
||||
},
|
||||
{
|
||||
name: 'viewport',
|
||||
content: 'width=device-width, initial-scale=1',
|
||||
},
|
||||
...seo({
|
||||
title:
|
||||
'Financial Documents Classification Agent',
|
||||
description: `Classify financial documents as balance sheets, income statements and cash flow statemets. `,
|
||||
}),
|
||||
],
|
||||
links: [
|
||||
{ rel: 'stylesheet', href: "https://cdn.jsdelivr.net/npm/daisyui@5" },
|
||||
{
|
||||
rel: 'apple-touch-icon',
|
||||
sizes: '180x180',
|
||||
href: '/apple-touch-icon.png',
|
||||
},
|
||||
{
|
||||
rel: 'icon',
|
||||
type: 'image/png',
|
||||
sizes: '32x32',
|
||||
href: '/favicon-32x32.png',
|
||||
},
|
||||
{
|
||||
rel: 'icon',
|
||||
type: 'image/png',
|
||||
sizes: '16x16',
|
||||
href: '/favicon-16x16.png',
|
||||
},
|
||||
{ rel: 'manifest', href: '/site.webmanifest', color: '#fffff' },
|
||||
{ rel: 'icon', href: '/favicon.ico' },
|
||||
],
|
||||
scripts: [
|
||||
{
|
||||
src: '/customScript.js',
|
||||
type: 'text/javascript',
|
||||
},
|
||||
{
|
||||
src: "https://cdn.jsdelivr.net/npm/@tailwindcss/browser@4",
|
||||
type: "text/javascript",
|
||||
}
|
||||
],
|
||||
}),
|
||||
errorComponent: DefaultCatchBoundary,
|
||||
notFoundComponent: () => <NotFound />,
|
||||
shellComponent: RootDocument,
|
||||
})
|
||||
|
||||
/**
 * HTML shell shared by every route.
 *
 * Renders the document head (via `HeadContent`), a navbar with a dropdown
 * menu, the app title and a link to the example's source code, then the
 * matched route (`children`) followed by the framework `Scripts`.
 *
 * @param children - The routed page content to render below the navbar.
 */
function RootDocument({ children }: { children: React.ReactNode }) {
  return (
    <html lang="en">
      <head>
        <HeadContent />
      </head>
      <body>
        <div className="navbar bg-base-100 shadow-sm">
          <div className="navbar-start">
            <div className="dropdown">
              {/* Focusable trigger: the dropdown opens while this element has focus. */}
              <div
                tabIndex={0}
                role="button"
                aria-label="Open menu"
                className="btn btn-ghost btn-circle"
              >
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="h-5 w-5"
                  fill="none"
                  viewBox="0 0 24 24"
                  stroke="currentColor"
                >
                  <path
                    strokeLinecap="round"
                    strokeLinejoin="round"
                    strokeWidth="2"
                    d="M4 6h16M4 12h16M4 18h7"
                  />
                </svg>
              </div>
              <ul
                tabIndex={0}
                className="menu menu-lg dropdown-content bg-base-100 rounded-box z-1 mt-3 w-80 p-2 shadow"
              >
                <li><a href="/">Home</a></li>
                <li><a href="https://cloud.llamaindex.ai">Get Started with LlamaCloud</a></li>
                <li><a href="https://developers.llamaindex.ai/python/cloud/llamaclassify/getting_started/">LlamaClassify Docs</a></li>
              </ul>
            </div>
          </div>
          <div className="navbar-center">
            <a className="btn btn-ghost text-xl" href="/">Financial Documents Classification Agent</a>
          </div>
          <div className="navbar-end">
            {/*
              The link itself carries the button styling: a <button> nested in
              an <a> is invalid interactive nesting. GitHub URLs take the form
              /<owner>/<repo>/tree/<branch>/<path>.
            */}
            <a
              href="https://github.com/run-llama/llama_cloud_services/tree/main/examples-ts/classify"
              aria-label="View the source code on GitHub"
              className="btn btn-ghost btn-circle"
            >
              <div className="indicator">
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="h-10 w-10"
                  fill="currentColor"
                  viewBox="0 0 640 512"
                >
                  <path d="M237.9 461.4C237.9 463.4 235.6 465 232.7 465C229.4 465.3 227.1 463.7 227.1 461.4C227.1 459.4 229.4 457.8 232.3 457.8C235.3 457.5 237.9 459.1 237.9 461.4zM206.8 456.9C206.1 458.9 208.1 461.2 211.1 461.8C213.7 462.8 216.7 461.8 217.3 459.8C217.9 457.8 216 455.5 213 454.6C210.4 453.9 207.5 454.9 206.8 456.9zM251 455.2C248.1 455.9 246.1 457.8 246.4 460.1C246.7 462.1 249.3 463.4 252.3 462.7C255.2 462 257.2 460.1 256.9 458.1C256.6 456.2 253.9 454.9 251 455.2zM316.8 72C178.1 72 72 177.3 72 316C72 426.9 141.8 521.8 241.5 555.2C254.3 557.5 258.8 549.6 258.8 543.1C258.8 536.9 258.5 502.7 258.5 481.7C258.5 481.7 188.5 496.7 173.8 451.9C173.8 451.9 162.4 422.8 146 415.3C146 415.3 123.1 399.6 147.6 399.9C147.6 399.9 172.5 401.9 186.2 425.7C208.1 464.3 244.8 453.2 259.1 446.6C261.4 430.6 267.9 419.5 275.1 412.9C219.2 406.7 162.8 398.6 162.8 302.4C162.8 274.9 170.4 261.1 186.4 243.5C183.8 237 175.3 210.2 189 175.6C209.9 169.1 258 202.6 258 202.6C278 197 299.5 194.1 320.8 194.1C342.1 194.1 363.6 197 383.6 202.6C383.6 202.6 431.7 169 452.6 175.6C466.3 210.3 457.8 237 455.2 243.5C471.2 261.2 481 275 481 302.4C481 398.9 422.1 406.6 366.2 412.9C375.4 420.8 383.2 435.8 383.2 459.3C383.2 493 382.9 534.7 382.9 542.9C382.9 549.4 387.5 557.3 400.2 555C500.2 521.8 568 426.9 568 316C568 177.3 455.5 72 316.8 72zM169.2 416.9C167.9 417.9 168.2 420.2 169.9 422.1C171.5 423.7 173.8 424.4 175.1 423.1C176.4 422.1 176.1 419.8 174.4 417.9C172.8 416.3 170.5 415.6 169.2 416.9zM158.4 408.8C157.7 410.1 158.7 411.7 160.7 412.7C162.3 413.7 164.3 413.4 165 412C165.7 410.7 164.7 409.1 162.7 408.1C160.7 407.5 159.1 407.8 158.4 408.8zM190.8 444.4C189.2 445.7 189.8 448.7 192.1 450.6C194.4 452.9 197.3 453.2 198.6 451.6C199.9 450.3 199.3 447.3 197.3 445.4C195.1 443.1 192.1 442.8 190.8 444.4zM179.4 429.7C177.8 430.7 177.8 433.3 179.4 435.6C181 437.9 183.7 438.9 185 437.9C186.6 436.6 186.6 434 185 431.7C183.6 429.4 181 428.4 179.4 429.7z" />
                </svg>
              </div>
            </a>
          </div>
        </div>
        <hr />
        {children}
        <Scripts />
      </body>
    </html>
  )
}
|
||||
@@ -0,0 +1,45 @@
|
||||
import { createFileRoute } from '@tanstack/react-router'
|
||||
import { classifier, classificationRules, parsingConfig } from '~/utils/classifier'
|
||||
|
||||
/**
 * Escape a value for safe interpolation into HTML text content.
 *
 * The markup built by this route is injected into the page as raw HTML, so
 * every user- or model-derived string must pass through here first.
 */
function escapeHtml(value: unknown): string {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}

/** Build a JSON response with an explicit status code and Content-Type. */
function jsonResponse(payload: { result: string }, status = 200): Response {
  return new Response(JSON.stringify(payload), {
    status,
    headers: { 'Content-Type': 'application/json' },
  })
}

/**
 * POST /api/classify
 *
 * Expects multipart form data with a `file` field, classifies the upload
 * with the shared classifier, and responds with `{ result: string }` where
 * `result` is an HTML fragment (one card per classified item) or a plain
 * message when the request cannot be served.
 */
export const Route = createFileRoute('/api/classify')({
  component: RouteComponent,
  server: {
    handlers: {
      POST: async ({ request }) => {
        let fl: FormDataEntryValue | null
        try {
          fl = (await request.formData()).get("file")
        } catch {
          // Body was not parseable form data.
          return jsonResponse({ result: "you need to provide a file" }, 400)
        }
        // A form field can also be a plain string; only real uploads are accepted.
        if (!fl || typeof fl === "string") {
          return jsonResponse({ result: "you need to provide a file" }, 400)
        }

        try {
          const buff = await fl.arrayBuffer()
          const rawRes = await classifier.classify(
            classificationRules,
            parsingConfig,
            { fileContents: [new Uint8Array(buff)] },
          )

          const cards: string[] = []
          for (const result of rawRes.items) {
            if ("result" in result && result.result) {
              // Round to one decimal so float noise (e.g. 56.99999999999999) is not shown.
              const confidencePct = Math.round(result.result.confidence * 1000) / 10
              cards.push(`
            <div class="card bg-base-100 shadow-xl p-6 mb-4">
              <div class="space-y-3">
                <p><span class="font-semibold">📄 Document:</span> ${escapeHtml(fl.name)}</p>
                <p><span class="font-semibold">🏷️ Type:</span> <span class="badge badge-primary">${escapeHtml(result.result.type)}</span></p>
                <p><span class="font-semibold">📊 Confidence:</span> ${escapeHtml(confidencePct)}%</p>
                <p><span class="font-semibold">💭 Reasoning:</span> ${escapeHtml(result.result.reasoning)}</p>
              </div>
            </div>
          `)
            }
          }
          return jsonResponse({ result: cards.join("") })
        } catch (error) {
          // Keep details in the server log; do not leak them to the client.
          console.error('Classification failed:', error)
          return jsonResponse(
            { result: "The document could not be classified. Please try again." },
            502,
          )
        }
      },
    },
  },
})
|
||||
|
||||
/**
 * Placeholder component for the API-only `/api/classify` route.
 * It renders nothing: the route is served by its POST handler.
 */
function RouteComponent() {
  return undefined
}
|
||||
@@ -0,0 +1,99 @@
|
||||
import { createFileRoute } from '@tanstack/react-router'
|
||||
import { useRef, useState } from 'react'
|
||||
|
||||
// Index route ("/"): renders the upload-and-classify page.
export const Route = createFileRoute('/')({ component: Home })
|
||||
|
||||
/**
 * Landing page: lets the user pick a PDF, sends it to `/api/classify`, and
 * shows the classification returned by the server.
 *
 * State:
 * - `file`    the currently selected upload (or null)
 * - `reply`   HTML fragment returned by the API (or null)
 * - `error`   user-facing error message (or null)
 * - `loading` true while a classification request is in flight
 */
function Home() {
  const [file, setFile] = useState<null | File>(null)
  const fileInputRef = useRef<HTMLInputElement>(null)
  const [reply, setReply] = useState<null | string>(null)
  const [error, setError] = useState<null | string>(null)
  const [loading, setLoading] = useState<boolean>(false)

  // Mirror the input's selection. A cancelled dialog empties the input, so
  // the state is cleared too, and results for a previous file are dropped.
  const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
    setFile(event.target.files?.[0] ?? null)
    setReply(null)
    setError(null)
  }

  // Reset the page to its initial state, including the native file input.
  const handleClearFile = () => {
    setFile(null)
    if (fileInputRef.current) {
      fileInputRef.current.value = ''
    }
    setReply(null)
    setError(null)
  }

  // Upload the selected file and display either the result or an error.
  const handleClassify = async () => {
    // Ignore clicks without a file or while a request is already running.
    if (!file || loading) return

    setReply(null)
    setError(null)
    setLoading(true)
    try {
      const formData = new FormData()
      formData.append('file', file)

      const res = await fetch('/api/classify', {
        method: 'POST',
        body: formData,
      })

      const data: { result?: unknown } = await res.json()
      if (typeof data.result === 'string' && data.result !== '') {
        setReply(data.result)
      } else if (res.ok) {
        setError('No classification was returned for this document.')
      } else {
        setError(`Classification request failed (HTTP ${res.status}).`)
      }
    } catch (err) {
      console.error('Error:', err)
      setError('Something went wrong while classifying the document. Please try again.')
    } finally {
      setLoading(false)
    }
  }

  return (
    <div className="flex flex-col justify-center items-center gap-y-8">
      <br />
      <h1 className="text-xl font-bold text-gray-700">AI-Powered financial document classification</h1>
      <h2 className="text-lg font-semibold text-gray-500">Need help sorting out the financial documents jungle? Let our classification agent handle it!</h2>
      <fieldset className="fieldset bg-base-100 border-base-300 rounded-box w-200 border p-4">
        <legend className="fieldset-legend text-lg">Upload your financial document here</legend>
        <label className="label flex justify-center">
          <input type="file" className="file-input" onChange={handleFileChange} accept='application/pdf' ref={fileInputRef} />
        </label>
      </fieldset>
      {file && (
        <div className="flex flex-col justify-center items-center gap-y-8">
          <p className="text-sm text-gray-600">Selected file: {file.name}</p>
          <div className='grid grid-cols-2 gap-x-6'>
            <button
              type="button"
              className='btn bg-gray-500 text-white shadow-lg hover:bg-gray-600 hover:shadow-xl rounded'
              onClick={handleClassify}
              disabled={loading}
            >
              Classify
            </button>
            <button
              onClick={handleClearFile}
              type="button"
              className="px-4 py-2 bg-red-300 text-black rounded hover:bg-red-400 hover:shadow-xl shadow-lg"
            >
              Clear
            </button>
          </div>
        </div>
      )}
      {loading && (
        <span className="loading loading-spinner text-primary"></span>
      )}
      {error && (
        <div role="alert" className="alert alert-error max-w-2xl w-full">
          {error}
        </div>
      )}
      {reply && (
        // `reply` is an HTML fragment produced by /api/classify and is injected
        // verbatim: the endpoint must escape any user- or model-derived text.
        <div
          className="max-w-2xl w-full"
          dangerouslySetInnerHTML={{ __html: reply }}
        />
      )}
    </div>
  )
}
|
||||
@@ -0,0 +1,23 @@
|
||||
import { LlamaClassify, ClassifierRule, ClassifyParsingConfiguration } from "llama-cloud-services"
|
||||
|
||||
// API key for LlamaCloud, read from the server environment.
const apiKey = process.env.LLAMA_CLOUD_API_KEY

// Shared LlamaClassify client used by the classification endpoint.
export const classifier = new LlamaClassify(apiKey);

// One rule per supported financial statement type; `description` tells the
// classifier what each `type` looks like.
export const classificationRules: ClassifierRule[] = [
  {
    description: "Shows a company's assets, liabilities, and shareholders' equity at a specific point in time, providing a snapshot of financial position.",
    type: "balance_sheet",
  },
  {
    description: "Reports cash inflows and outflows from operating, investing, and financing activities, highlighting liquidity and cash management.",
    type: "cash_flow_statement",
  },
  {
    description: "Summarizes revenues, expenses, and profits over a period, indicating financial performance and profitability.",
    type: "income_statement",
  },
];

// Parsing options applied before classification: English documents, at most
// 20 pages.
export const parsingConfig: ClassifyParsingConfiguration = {
  lang: "en",
  max_pages: 20,
}
|
||||
@@ -0,0 +1,33 @@
|
||||
/**
 * Build the list of SEO / social-card meta tag descriptors for a page.
 *
 * Always returns the `title` entry plus the title-derived Twitter and Open
 * Graph tags. Tags that depend on an optional argument (`description`,
 * `keywords`, `image`) are emitted only when that argument is a non-empty
 * string, so no tag is ever produced with an undefined `content`.
 *
 * @param title       Page title (required).
 * @param description Optional page description.
 * @param keywords    Optional comma-separated keywords.
 * @param image       Optional social-card image URL.
 * @returns Tag descriptors, in a stable order, ready to spread into `meta`.
 */
export const seo = ({
  title,
  description,
  keywords,
  image,
}: {
  title: string
  description?: string
  image?: string
  keywords?: string
}) => {
  type MetaTag = { title: string } | { name: string; content: string }

  // Yields a single named tag, or nothing when the value is missing/empty.
  const optional = (name: string, content: string | undefined): MetaTag[] =>
    content ? [{ name, content }] : []

  const tags: MetaTag[] = [
    { title },
    ...optional('description', description),
    ...optional('keywords', keywords),
    { name: 'twitter:title', content: title },
    ...optional('twitter:description', description),
    // NOTE(review): these handles look like starter-template leftovers —
    // confirm which account should be credited.
    { name: 'twitter:creator', content: '@tannerlinsley' },
    { name: 'twitter:site', content: '@tannerlinsley' },
    { name: 'og:type', content: 'website' },
    { name: 'og:title', content: title },
    ...optional('og:description', description),
    ...(image
      ? [
          { name: 'twitter:image', content: image },
          { name: 'twitter:card', content: 'summary_large_image' },
          { name: 'og:image', content: image },
        ]
      : []),
  ]

  return tags
}
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"include": ["**/*.ts", "**/*.tsx"],
|
||||
"compilerOptions": {
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"jsx": "react-jsx",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Bundler",
|
||||
"lib": ["DOM", "DOM.Iterable", "ES2022"],
|
||||
"isolatedModules": true,
|
||||
"resolveJsonModule": true,
|
||||
"skipLibCheck": true,
|
||||
"target": "ES2022",
|
||||
"allowJs": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"baseUrl": ".",
|
||||
"paths": {
|
||||
"~/*": ["./src/*"]
|
||||
},
|
||||
"noEmit": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
import { tanstackStart } from '@tanstack/react-start/plugin/vite'
|
||||
import { defineConfig } from 'vite'
|
||||
import tsConfigPaths from 'vite-tsconfig-paths'
|
||||
import viteReact from '@vitejs/plugin-react'
|
||||
|
||||
// Vite configuration: dev server on port 3000, with tsconfig path aliases,
// the TanStack Start plugin (sources under `src`) and the React plugin.
export default defineConfig({
  server: { port: 3000 },
  plugins: [
    tsConfigPaths({ projects: ['./tsconfig.json'] }),
    tanstackStart({ srcDirectory: 'src' }),
    viteReact(),
  ],
})
|
||||
@@ -1,4 +1,14 @@
|
||||
# LlamaCloud Services Examples - Python
|
||||
> **⚠️ DEPRECATION NOTICE**
|
||||
>
|
||||
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
|
||||
>
|
||||
> **Please migrate to the new packages:**
|
||||
> - **Python**: `pip install "llama-cloud>=1.0"` ([GitHub](https://github.com/run-llama/llama-cloud-py))
|
||||
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
|
||||
>
|
||||
> The new packages provide the same functionality with improved performance, better support, and active development.
|
||||
|
||||
|
||||
In this folder you will find several python notebooks that contain examples regarding:
|
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
sample_files/
|
||||
@@ -0,0 +1,815 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Batch Parse with LlamaCloud Directories\n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to use LlamaCloud's batch processing API to parse multiple files in a directory. The workflow includes:\n",
|
||||
"\n",
|
||||
"1. **Creating a Directory** - Set up a directory to organize your files\n",
|
||||
"2. **Uploading Files** - Upload multiple files to the directory\n",
|
||||
"3. **Starting a Batch Parse Job** - Kick off batch processing on all files\n",
|
||||
"4. **Monitoring Progress** - Check the status and view results\n",
|
||||
"\n",
|
||||
"This is useful when you need to parse many documents at once, as the batch API handles the orchestration and provides progress tracking."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0c2b5e1a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
 **⚠️ DEPRECATION">
"> **⚠️ DEPRECATION NOTICE**\n",
">\n",
"> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n",
">\n",
"> **Please migrate to:**\n",
"> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n",
"> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n",
">\n",
"> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup and Installation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-cloud python-dotenv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import httpx\n",
|
||||
"\n",
|
||||
"# Load environment variables\n",
|
||||
"load_dotenv()\n",
|
||||
"\n",
|
||||
"# Set your API key\n",
|
||||
"LLAMA_CLOUD_API_KEY = os.environ.get(\"LLAMA_CLOUD_API_KEY\", \"llx-...\")\n",
|
||||
"\n",
|
||||
"# Optional: Set base URL (defaults to https://api.cloud.llamaindex.ai if not set)\n",
|
||||
"LLAMA_CLOUD_BASE_URL = os.environ.get(\n",
|
||||
" \"LLAMA_CLOUD_BASE_URL\", \"https://api.cloud.llamaindex.ai\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Optional: Set project_id if you have one, otherwise it will use your default project\n",
|
||||
"PROJECT_ID = os.environ.get(\"LLAMA_CLOUD_PROJECT_ID\", None)\n",
|
||||
"\n",
|
||||
"print(\"✅ API key configured\")\n",
|
||||
"print(f\" Base URL: {LLAMA_CLOUD_BASE_URL}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup HTTP Client\n",
|
||||
"\n",
|
||||
"Since the current version of the llama-cloud SDK has some issues with the beta endpoints, we'll use direct HTTP requests with httpx for reliability."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create HTTP client with authentication\n",
|
||||
"headers = {\n",
|
||||
" \"Authorization\": f\"Bearer {LLAMA_CLOUD_API_KEY}\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"✅ HTTP client configured\")\n",
|
||||
"print(f\" Using base URL: {LLAMA_CLOUD_BASE_URL}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 1: Create a Directory\n",
|
||||
"\n",
|
||||
"First, we'll create a directory to organize our files. Directories help you group related files together for batch processing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"# Create a directory with a timestamp in the name\n",
|
||||
"timestamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
|
||||
"directory_name = f\"batch-parse-demo-{timestamp}\"\n",
|
||||
"\n",
|
||||
"# Create directory using HTTP request\n",
|
||||
"response = httpx.post(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/directories\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": PROJECT_ID},\n",
|
||||
" json={\n",
|
||||
" \"name\": directory_name,\n",
|
||||
" \"description\": \"Demo directory for batch parse example\",\n",
|
||||
" },\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code in [200, 201]:\n",
|
||||
" directory = response.json()\n",
|
||||
" directory_id = directory[\"id\"]\n",
|
||||
" project_id = directory[\"project_id\"]\n",
|
||||
"\n",
|
||||
" print(f\"✅ Created directory: {directory['name']}\")\n",
|
||||
" print(f\" Directory ID: {directory_id}\")\n",
|
||||
" print(f\" Project ID: {project_id}\")\n",
|
||||
"else:\n",
|
||||
" raise Exception(\n",
|
||||
" f\"Failed to create directory: {response.status_code} - {response.text}\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 2: Upload Files to the Directory\n",
|
||||
"\n",
|
||||
"Now we'll upload some files to our directory. For this demo, we'll download some sample PDFs and upload them.\n",
|
||||
"\n",
|
||||
"You can replace these with your own files."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a directory for sample files\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"os.makedirs(\"sample_files\", exist_ok=True)\n",
|
||||
"\n",
|
||||
"# Sample documents to download\n",
|
||||
"sample_docs = {\n",
|
||||
" \"attention.pdf\": \"https://arxiv.org/pdf/1706.03762.pdf\",\n",
|
||||
" \"bert.pdf\": \"https://arxiv.org/pdf/1810.04805.pdf\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Download sample documents\n",
|
||||
"for filename, url in sample_docs.items():\n",
|
||||
" filepath = f\"sample_files/{filename}\"\n",
|
||||
" if not os.path.exists(filepath):\n",
|
||||
" print(f\"📥 Downloading {filename}...\")\n",
|
||||
" response = requests.get(url)\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" with open(filepath, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
" print(f\" ✅ Downloaded {filename}\")\n",
|
||||
" else:\n",
|
||||
" print(f\" ❌ Failed to download {filename}\")\n",
|
||||
" else:\n",
|
||||
" print(f\"📁 {filename} already exists\")\n",
|
||||
"\n",
|
||||
"print(\"\\n✅ Sample files ready!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-10",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload Files to Directory\n",
|
||||
"\n",
|
||||
"Now let's upload the files to our directory using the `upload_file_to_directory` endpoint."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-11",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"uploaded_files = []\n",
|
||||
"\n",
|
||||
"# Workaround: Use direct HTTP requests instead of SDK due to SDK bug\n",
|
||||
"import httpx\n",
|
||||
"\n",
|
||||
"for filename in os.listdir(\"sample_files\"):\n",
|
||||
" if filename.endswith(\".pdf\"):\n",
|
||||
" filepath = f\"sample_files/{filename}\"\n",
|
||||
"\n",
|
||||
" print(f\"📤 Uploading {filename}...\")\n",
|
||||
"\n",
|
||||
" # Upload file using direct HTTP request (SDK has a bug with file uploads)\n",
|
||||
" with open(filepath, \"rb\") as f:\n",
|
||||
" # Prepare the multipart form data correctly\n",
|
||||
" files = {\"upload_file\": (filename, f, \"application/pdf\")}\n",
|
||||
"\n",
|
||||
" # Make the request directly\n",
|
||||
" response = httpx.post(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/directories/{directory_id}/files/upload\",\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" files=files,\n",
|
||||
" headers={\"Authorization\": f\"Bearer {LLAMA_CLOUD_API_KEY}\"},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code in [200, 201]:\n",
|
||||
" directory_file = response.json()\n",
|
||||
" uploaded_files.append(directory_file)\n",
|
||||
" print(f\" ✅ Uploaded: {directory_file.get('display_name')}\")\n",
|
||||
" print(f\" File ID: {directory_file.get('id')}\")\n",
|
||||
" else:\n",
|
||||
" print(f\" ❌ Upload failed: {response.status_code}\")\n",
|
||||
" print(f\" Error: {response.text[:200]}\")\n",
|
||||
"\n",
|
||||
"print(f\"\\n✅ Uploaded {len(uploaded_files)} files to directory\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-12",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 3: Create a Batch Parse Job\n",
|
||||
"\n",
|
||||
"Now that we have files in our directory, let's create a batch parse job to process them all at once.\n",
|
||||
"\n",
|
||||
"The batch processing API uses the same configuration as LlamaParse."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-13",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Configure the parse job\n",
|
||||
"# This configuration will apply to all files in the directory\n",
|
||||
"job_config = {\n",
|
||||
" \"job_name\": \"parse_raw_file_job\", # Must match the JobNames enum value\n",
|
||||
" \"partitions\": {},\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"parse\",\n",
|
||||
" \"lang\": \"en\",\n",
|
||||
" \"fast_mode\": True,\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"✅ Job configuration created\")\n",
|
||||
"print(f\" Language: {job_config['parameters']['lang']}\")\n",
|
||||
"print(f\" Fast mode: {job_config['parameters']['fast_mode']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-14",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit the Batch Job\n",
|
||||
"\n",
|
||||
"Now let's submit the batch job to process all files in the directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-15",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(f\"🚀 Submitting batch parse job for directory: {directory_id}\")\n",
|
||||
"print(f\" Processing {len(uploaded_files)} files...\\n\")\n",
|
||||
"\n",
|
||||
"# Submit batch job using HTTP request\n",
|
||||
"response = httpx.post(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" json={\n",
|
||||
" \"directory_id\": directory_id,\n",
|
||||
" \"job_config\": job_config,\n",
|
||||
" \"page_size\": 100, # Number of files to fetch per batch\n",
|
||||
" \"continue_as_new_threshold\": 10, # Workflow continuation threshold\n",
|
||||
" },\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code in [200, 201]:\n",
|
||||
" batch_job = response.json()\n",
|
||||
" batch_job_id = batch_job[\"id\"]\n",
|
||||
"\n",
|
||||
" print(\"✅ Batch job submitted successfully!\")\n",
|
||||
" print(f\" Batch Job ID: {batch_job_id}\")\n",
|
||||
" print(f\" Workflow ID: {batch_job.get('workflow_id')}\")\n",
|
||||
" print(f\" Status: {batch_job.get('status')}\")\n",
|
||||
" print(f\" Total Items: {batch_job.get('total_items')}\")\n",
|
||||
"else:\n",
|
||||
" raise Exception(\n",
|
||||
" f\"Failed to create batch job: {response.status_code} - {response.text}\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-16",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 4: Monitor Job Progress\n",
|
||||
"\n",
|
||||
"Now let's monitor the batch job progress. We'll poll the status endpoint to see how the job is progressing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-17",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def print_job_status(status_data):\n",
|
||||
" \"\"\"Helper function to print job status in a readable format.\"\"\"\n",
|
||||
" job = status_data[\"job\"]\n",
|
||||
" progress_pct = status_data[\"progress_percentage\"]\n",
|
||||
"\n",
|
||||
" print(f\"\\n{'='*60}\")\n",
|
||||
" print(f\"Job Status: {job['status']}\")\n",
|
||||
" print(f\"{'='*60}\")\n",
|
||||
" print(f\"Total Items: {job['total_items']}\")\n",
|
||||
" print(f\"Completed: {job['processed_items']}\")\n",
|
||||
" print(f\"Failed: {job['failed_items']}\")\n",
|
||||
" print(f\"Skipped: {job['skipped_items']}\")\n",
|
||||
" print(f\"Progress: {progress_pct:.1f}%\")\n",
|
||||
"\n",
|
||||
" if job.get(\"completed_at\"):\n",
|
||||
" print(f\"Completed At: {job['completed_at']}\")\n",
|
||||
" elif job.get(\"started_at\"):\n",
|
||||
" print(f\"Started At: {job['started_at']}\")\n",
|
||||
"\n",
|
||||
" print(f\"{'='*60}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Poll for status updates\n",
|
||||
"print(\"🔄 Monitoring batch job progress...\")\n",
|
||||
"print(\n",
|
||||
" \"Note: It may take a few seconds for the workflow to initialize and count files.\\n\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"max_polls = 60 # Maximum number of status checks (increased for longer jobs)\n",
|
||||
"poll_interval = 10 # Seconds between checks\n",
|
||||
"\n",
|
||||
"for i in range(max_polls):\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing/{batch_job_id}\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" status_data = response.json()\n",
|
||||
" print_job_status(status_data)\n",
|
||||
"\n",
|
||||
" # Check if job is complete\n",
|
||||
" job_status = status_data[\"job\"][\"status\"]\n",
|
||||
" if job_status in [\"completed\", \"failed\", \"cancelled\"]:\n",
|
||||
" print(f\"\\n✅ Job finished with status: {job_status}\")\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" if i < max_polls - 1:\n",
|
||||
" print(f\"\\n⏳ Waiting {poll_interval} seconds before next check...\")\n",
|
||||
" time.sleep(poll_interval)\n",
|
||||
" else:\n",
|
||||
" print(f\"Error getting status: {response.status_code} - {response.text}\")\n",
|
||||
" break\n",
|
||||
"else:\n",
|
||||
" print(f\"\\n⚠️ Reached maximum polling attempts. Job may still be running.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-18",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 5: View Job Items\n",
|
||||
"\n",
|
||||
"Let's look at the individual items in the batch job to see which files were processed successfully."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get all items in the batch job\n",
|
||||
"response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing/{batch_job_id}/items\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id, \"limit\": 100},\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" items_response = response.json()\n",
|
||||
"\n",
|
||||
" print(f\"\\n📋 Batch Job Items ({items_response['total_size']} total)\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" for item in items_response[\"items\"]:\n",
|
||||
" status_emoji = (\n",
|
||||
" \"✅\"\n",
|
||||
" if item[\"status\"] == \"completed\"\n",
|
||||
" else \"❌\"\n",
|
||||
" if item[\"status\"] == \"failed\"\n",
|
||||
" else \"⏳\"\n",
|
||||
" )\n",
|
||||
" print(f\"{status_emoji} {item['item_name']}\")\n",
|
||||
" print(f\" Status: {item['status']}\")\n",
|
||||
" print(f\" Item ID: {item['item_id']}\")\n",
|
||||
"\n",
|
||||
" if item.get(\"error_message\"):\n",
|
||||
" print(f\" Error: {item['error_message']}\")\n",
|
||||
"\n",
|
||||
" print()\n",
|
||||
"else:\n",
|
||||
" print(f\"Error listing items: {response.status_code} - {response.text}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-20",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 6: Retrieve Processing Results\n",
|
||||
"\n",
|
||||
"For each completed file, we can retrieve the processing results to see where the parsed output is stored."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-21",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get processing results for a specific item\n",
|
||||
"if items_response[\"items\"]:\n",
|
||||
" first_item = items_response[\"items\"][0]\n",
|
||||
"\n",
|
||||
" print(f\"\\n🔍 Processing results for: {first_item['item_name']}\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing/items/{first_item['item_id']}/processing-results\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" results = response.json()\n",
|
||||
"\n",
|
||||
" print(f\"Item: {results['item_name']}\")\n",
|
||||
" print(f\"Total processing runs: {len(results['processing_results'])}\\n\")\n",
|
||||
"\n",
|
||||
" for i, result in enumerate(results[\"processing_results\"], 1):\n",
|
||||
" print(f\"Run {i}:\")\n",
|
||||
" print(f\" Job Type: {result['job_type']}\")\n",
|
||||
" print(f\" Processed At: {result['processed_at']}\")\n",
|
||||
" print(f\" Parameters Hash: {result['parameters_hash']}\")\n",
|
||||
"\n",
|
||||
" if result.get(\"output_s3_path\"):\n",
|
||||
" print(f\" Output S3 Path: {result['output_s3_path']}\")\n",
|
||||
"\n",
|
||||
" if result.get(\"output_metadata\"):\n",
|
||||
" print(f\" Output Metadata: {result['output_metadata']}\")\n",
|
||||
"\n",
|
||||
" print()\n",
|
||||
" else:\n",
|
||||
" print(f\"Error getting results: {response.status_code} - {response.text}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-22",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Optional: List All Batch Jobs\n",
|
||||
"\n",
|
||||
"You can also list all batch jobs in your project to see the history of batch processing operations."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-23",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# List all parse jobs in the project\n",
|
||||
"response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id, \"job_type\": \"parse\", \"limit\": 10},\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" jobs_response = response.json()\n",
|
||||
"\n",
|
||||
" print(f\"\\n📊 Recent Batch Parse Jobs ({jobs_response['total_size']} total)\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" for job in jobs_response[\"items\"]:\n",
|
||||
" status_emoji = (\n",
|
||||
" \"✅\"\n",
|
||||
" if job[\"status\"] == \"completed\"\n",
|
||||
" else \"❌\"\n",
|
||||
" if job[\"status\"] == \"failed\"\n",
|
||||
" else \"⏳\"\n",
|
||||
" )\n",
|
||||
" print(f\"{status_emoji} Job ID: {job['id']}\")\n",
|
||||
" print(f\" Status: {job['status']}\")\n",
|
||||
" print(f\" Directory: {job['directory_id']}\")\n",
|
||||
" print(f\" Total Items: {job['total_items']}\")\n",
|
||||
" print(f\" Completed: {job['processed_items']}\")\n",
|
||||
" print(f\" Created: {job['created_at']}\")\n",
|
||||
" print()\n",
|
||||
"else:\n",
|
||||
" print(f\"Error listing jobs: {response.status_code} - {response.text}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "uug7591rkq",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 7: Retrieve Parsed Text Results\n",
|
||||
"\n",
|
||||
"Once the batch job is complete, each BatchJobItem will have a `job_id` field that maps to a parse job ID. We can use this ID with the standard parse job result endpoints to fetch the actual parsed text results."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "vpp0vxtc0y",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get all completed items and their job IDs\n",
|
||||
"completed_items = [\n",
|
||||
" item for item in items_response[\"items\"] if item[\"status\"] == \"completed\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"print(f\"📄 Found {len(completed_items)} completed items\\n\")\n",
|
||||
"print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
"# Display the job_id for each completed item\n",
|
||||
"for item in completed_items:\n",
|
||||
" print(f\"📝 {item['item_name']}\")\n",
|
||||
" print(f\" Item ID: {item['item_id']}\")\n",
|
||||
" print(f\" Parse Job ID: {item['job_id']}\")\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4gck6hwpnl6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Fetch Parsed Text for a Specific Document\n",
|
||||
"\n",
|
||||
"Now let's use the `job_id` to retrieve the actual parsed text content from the parse job's text result endpoint."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "g191kvgxxvk",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get the parsed text for the first completed item\n",
|
||||
"if completed_items:\n",
|
||||
" first_completed = completed_items[0]\n",
|
||||
"\n",
|
||||
" print(f\"📖 Retrieving parsed text for: {first_completed['item_name']}\")\n",
|
||||
" print(f\" Using Parse Job ID: {first_completed['job_id']}\\n\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" # Use the job_id to fetch the parse result\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{first_completed['job_id']}/result/text\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" parse_result = response.text\n",
|
||||
"\n",
|
||||
" print(f\"✅ Retrieved parsed text ({len(parse_result)} characters)\\n\")\n",
|
||||
"\n",
|
||||
" # Display first 1000 characters as a preview\n",
|
||||
" print(\"Preview (first 1000 characters):\")\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
" print(parse_result[:1000])\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
"\n",
|
||||
" if len(parse_result) > 1000:\n",
|
||||
" print(f\"\\n... and {len(parse_result) - 1000} more characters\")\n",
|
||||
" else:\n",
|
||||
" print(\n",
|
||||
" f\"Error retrieving parse result: {response.status_code} - {response.text}\"\n",
|
||||
" )\n",
|
||||
"else:\n",
|
||||
" print(\"⚠️ No completed items found to retrieve results from\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2olccb4l8fj",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retrieve Parsed Results in Other Formats\n",
|
||||
"\n",
|
||||
"You can also retrieve the parsed results in JSON or Markdown format by calling the corresponding result endpoints."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "lcqsfxiw0sr",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if completed_items:\n",
|
||||
" first_completed = completed_items[0]\n",
|
||||
"\n",
|
||||
" print(\n",
|
||||
" f\"📋 Retrieving parse results in different formats for: {first_completed['item_name']}\\n\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Get as JSON (includes structured data with pages, images, etc.)\n",
|
||||
" print(\"1️⃣ Retrieving as JSON...\")\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{first_completed['job_id']}/result/json\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" json_result = response.json()\n",
|
||||
" print(f\" ✅ JSON result with {len(json_result['pages'])} pages\")\n",
|
||||
" print(f\" Keys: {list(json_result.keys())}\\n\")\n",
|
||||
" else:\n",
|
||||
" print(f\" Error: {response.status_code}\\n\")\n",
|
||||
"\n",
|
||||
" # Get as Markdown\n",
|
||||
" print(\"2️⃣ Retrieving as Markdown...\")\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{first_completed['job_id']}/result/markdown\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" markdown_result = response.text\n",
|
||||
" print(f\" ✅ Markdown result ({len(markdown_result)} characters)\\n\")\n",
|
||||
"\n",
|
||||
" # Display markdown preview\n",
|
||||
" print(\"Markdown Preview (first 500 characters):\")\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
" print(markdown_result[:500])\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
"\n",
|
||||
" if len(markdown_result) > 500:\n",
|
||||
" print(f\"\\n... and {len(markdown_result) - 500} more characters\")\n",
|
||||
" else:\n",
|
||||
" print(f\" Error: {response.status_code}\")\n",
|
||||
"else:\n",
|
||||
" print(\"⚠️ No completed items found to retrieve results from\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "lr61wqkfq3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Batch Process All Parsed Results\n",
|
||||
"\n",
|
||||
"You can also loop through all completed items to retrieve and process all the parsed results."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "kltydf9xzkl",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Process all completed items\n",
|
||||
"print(f\"🔄 Processing all {len(completed_items)} completed items...\\n\")\n",
|
||||
"print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
"all_results = {}\n",
|
||||
"\n",
|
||||
"for item in completed_items:\n",
|
||||
" print(f\"📄 Processing: {item['item_name']}\")\n",
|
||||
" print(f\" Parse Job ID: {item['job_id']}\")\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" # Retrieve the parsed text for this item\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{item['job_id']}/result/text\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" parsed_text = response.text\n",
|
||||
"\n",
|
||||
" all_results[item[\"item_name\"]] = {\n",
|
||||
" \"job_id\": item[\"job_id\"],\n",
|
||||
" \"text\": parsed_text,\n",
|
||||
" \"length\": len(parsed_text),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" print(f\" ✅ Retrieved {len(parsed_text)} characters\")\n",
|
||||
" else:\n",
|
||||
" all_results[item[\"item_name\"]] = {\n",
|
||||
" \"job_id\": item[\"job_id\"],\n",
|
||||
" \"error\": f\"HTTP {response.status_code}\",\n",
|
||||
" }\n",
|
||||
" print(f\" ❌ Error: HTTP {response.status_code}\")\n",
|
||||
"\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\" ❌ Error: {str(e)}\")\n",
|
||||
" all_results[item[\"item_name\"]] = {\"job_id\": item[\"job_id\"], \"error\": str(e)}\n",
|
||||
"\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
"print(f\"{'='*80}\")\n",
|
||||
"print(f\"\\n✅ Processed {len(all_results)} items\")\n",
|
||||
"print(f\"\\nSummary:\")\n",
|
||||
"for name, result in all_results.items():\n",
|
||||
" if \"error\" in result:\n",
|
||||
" print(f\" ❌ {name}: Error - {result['error']}\")\n",
|
||||
" else:\n",
|
||||
" print(f\" ✅ {name}: {result['length']:,} characters\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -16,6 +16,14 @@
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cbafd7ee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cda2e5e9-fe9d-42d9-9387-f529d970ff7b",
|
||||
|
||||
@@ -20,6 +20,14 @@
|
||||
"This workflow is designed for equity research analysts and investment professionals."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e7979faf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
|
After Width: | Height: | Size: 287 KiB |
|
After Width: | Height: | Size: 769 KiB |
|
After Width: | Height: | Size: 942 KiB |
|
After Width: | Height: | Size: 1.5 MiB |
@@ -19,6 +19,13 @@
|
||||
"The example we go through below can also be replicated within Llama Cloud, where you can pick from a number of pre-defined schemas instead of building your own."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -15,6 +15,13 @@
|
||||
"Dow Jones Industrial Average (DJIA) is a stock market index that consists of 30 large companies listed on the New York Stock Exchange and the NASDAQ and is considered a good proxy for the overall US stock market. For this exercise, we will extract the insider transactions for all the companies in the DJIA. Let's first get the list of tickers in the Dow Jones Industrial Average using Wikipedia."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -16,6 +16,14 @@
|
||||
"This approach reduces manual data entry, improves extraction accuracy and standardization, and provides traceability for each technical detail."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8d1efe6e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a3b8c8d5-ff3e-48ce-b0b8-29b6b1f517f8",
|
||||
|
||||
@@ -11,6 +11,13 @@
|
||||
"Take a look at one of the resumes in the `data/resumes` directory. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -20,6 +20,14 @@
|
||||
"> **Note:** This principle of what fields generalize across your target documents and what might be optional is an important one to keep in mind when designing your schema. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "355adfd4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -21,6 +21,14 @@
|
||||
"The following notebook uses the event‑driven syntax (with custom events, steps, and a workflow class) adapted from the technical datasheet and contract review examples."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ab7be988",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36d8e34e-ed98-46ac-b744-1642f6e253d5",
|
||||
|
||||
@@ -0,0 +1,516 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a7oq3cfnync",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Extracting Repeating Entities from Documents\n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to use the `PER_TABLE_ROW` extraction target to extract structured data from documents containing repeating entities like tables, lists, or catalogs.\n",
|
||||
"\n",
|
||||
"## Why Use the Tabular Extraction Target?\n",
|
||||
"\n",
|
||||
"`PER_DOC` (refer to the table below for a quick overview of the different extraction targets) is the default extraction target in LlamaExtract, which looks at the entire document's context when doing an extraction. When extracting lists of entities, LLM-based extraction has a critical failure mode — it often **only extracts the first few tens of entries** from a long list. This happens because LLMs have limited attention spans for repetitive data. Document-level extraction doesn't guarantee exhaustive coverage, and long lists lead to incomplete extractions.\n",
|
||||
"\n",
|
||||
"**The Solution**: `PER_TABLE_ROW` solves this by processing each entity individually or in smaller batches, ensuring **exhaustive extraction** of all entries regardless of list length.\n",
|
||||
"\n",
|
||||
"### Entity-Level Extraction\n",
|
||||
"\n",
|
||||
"When using `extraction_target=ExtractTarget.PER_TABLE_ROW`, you define a schema for a **single entity** (e.g., one hospital, one product, one invoice line item), not the full document. LlamaExtract automatically:\n",
|
||||
"- Detects the formatting patterns that distinguish individual entities (table rows, list items, section headers, etc.)\n",
|
||||
"- Applies your schema to each identified entity\n",
|
||||
"- Returns a `list[YourSchema]` with one object per entity\n",
|
||||
"\n",
|
||||
"This approach is ideal when each entity locally contains all the information needed for your schema.\n",
|
||||
"\n",
|
||||
"### Choosing the Right Extraction Target\n",
|
||||
"\n",
|
||||
"| Extraction Target | Best For | Returns |\n",
|
||||
"|-------------------|----------|---------|\n",
|
||||
"| `PER_DOC` | Single-entity documents, summaries, or short lists | One JSON object for entire document |\n",
|
||||
"| `PER_PAGE` | Multi-page documents where each page is independent | One JSON object per page |\n",
|
||||
"| `PER_TABLE_ROW` | **Long lists, tables, catalogs with repeating entities** | List of JSON objects (one per entity) |\n",
|
||||
"\n",
|
||||
"📖 For more details, see the [Extraction Target documentation](https://developers.llamaindex.ai/python/cloud/llamaextract/features/concepts/#extraction-target)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cb760594",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9427d1de",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from llama_cloud_services import LlamaExtract\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Load environment variables (put LLAMA_CLOUD_API_KEY in your .env file)\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"\n",
|
||||
"# Optionally, add your project id/organization id\n",
|
||||
"llama_extract = LlamaExtract()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4426b360",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Table of Hospitals by County and Insurance Plans\n",
|
||||
"\n",
|
||||
"We have a PDF document with a list of hospitals by county and different insurance plans offered by Blue Shield of California. \n",
|
||||
"\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c86sjymhn1r",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We want to extract each hospital from this table along with a list of applicable insurance plans. \n",
|
||||
"\n",
|
||||
"### Example 1: Structured Table\n",
|
||||
"\n",
|
||||
"This is an ideal use case for `PER_TABLE_ROW` extraction:\n",
|
||||
"- **Clear structure**: The document has explicit table formatting with rows and columns\n",
|
||||
"- **Repeating entities**: Each row represents one hospital with consistent attributes\n",
|
||||
"- **Local information**: All data for each hospital (county, name, plans) is contained within its row\n",
|
||||
"\n",
|
||||
"Notice that our `Hospital` schema describes a **single hospital**, not the full document. LlamaExtract will return a `list[Hospital]` with one entry per table row."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7c61a802",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Hospital(BaseModel):\n",
|
||||
" \"\"\"List of hospitals by county available for different BSC plans\"\"\"\n",
|
||||
"\n",
|
||||
" county: str = Field(description=\"County name\")\n",
|
||||
" hospital_name: str = Field(description=\"Name of the hospital\")\n",
|
||||
" plan_names: list[str] = Field(\n",
|
||||
" description=\"List of plans available at the hospital. One of: Trio HMO, SaveNet, Access+ HMO, BlueHPN PPO, Tandem PPO, PPO\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b8a69b7a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_cloud_services.extract import ExtractConfig, ExtractMode, ExtractTarget\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"result = await llama_extract.aextract(\n",
|
||||
" data_schema=Hospital,\n",
|
||||
" files=\"./data/tables/BSC-Hospital-List-by-County.pdf\",\n",
|
||||
" config=ExtractConfig(\n",
|
||||
" extraction_mode=ExtractMode.PREMIUM,\n",
|
||||
" extraction_target=ExtractTarget.PER_TABLE_ROW,\n",
|
||||
" parse_model=\"anthropic-sonnet-4.5\",\n",
|
||||
" ),\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43722cda",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "95b5aca6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"380"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(result.data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1e355770",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Alameda Hospital',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'SaveNet',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Alta Bates Med Ctr Herrick Campus',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Alta Bates Summit Med Ctr Alta Bates Campus',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Alta Bates Summit Med Ctr Summit Campus',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Alta Bates Summit Medical Center',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'BHC Fremont Hospital',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'SaveNet',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Centre For Neuro Skills San Francisco',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'SaveNet',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Eden Medical Center',\n",
|
||||
" 'plan_names': ['Trio HMO', 'Access+ HMO', 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Fairmont Hospital',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'SaveNet',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']},\n",
|
||||
" {'county': 'Alameda',\n",
|
||||
" 'hospital_name': 'Highland Hospital',\n",
|
||||
" 'plan_names': ['Trio HMO',\n",
|
||||
" 'SaveNet',\n",
|
||||
" 'Access+ HMO',\n",
|
||||
" 'BlueHPN PPO',\n",
|
||||
" 'Tandem PPO',\n",
|
||||
" 'PPO']}]"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result.data[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e28f0de8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "di156pb7s6j",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Success!** We extracted all **380 hospitals** from the multi-page PDF. Each entity was correctly parsed with its county, hospital name, and applicable insurance plans. With `PER_DOC`, we would likely have only gotten the first 20-30 entries."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "gelvl6db268",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Extracting from a Toy Catalog\n",
|
||||
"\n",
|
||||
"### Example 2: Semi-Structured List\n",
|
||||
"\n",
|
||||
"The `PER_TABLE_ROW` extraction target also works well for documents that aren't explicit tables but have similar properties:\n",
|
||||
"- **Ordered listing**: The toys are listed sequentially with visual separation (section headers, spacing)\n",
|
||||
"- **Repeating pattern**: Each toy entry has a consistent structure (code, name, specs, description)\n",
|
||||
"- **Local information**: All attributes for each toy are grouped together in its entry\n",
|
||||
"\n",
|
||||
"Even though this isn't a traditional table format, each toy entity locally contains all the information needed for our schema. LlamaExtract detects the formatting patterns that distinguish each toy and extracts them as separate entities.\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8cf0b2db",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class ToyCatalog(BaseModel):\n",
|
||||
" \"\"\"Product information from a toy catalog.\"\"\"\n",
|
||||
"\n",
|
||||
" section_name: str = Field(\n",
|
||||
" description=\"The name of the toy section (e.g. Table Toys, Active Toys).\"\n",
|
||||
" )\n",
|
||||
" product_code: str = Field(\n",
|
||||
" description=\"The unique product code for the toy (e.g., GA457).\"\n",
|
||||
" )\n",
|
||||
" toy_name: str = Field(description=\"The name of the toy.\")\n",
|
||||
" age_range: str = Field(\n",
|
||||
" description=\"The recommended age range for the toy (e.g., 6 +, 4 +).\",\n",
|
||||
" )\n",
|
||||
" player_range: str = Field(\n",
|
||||
" description=\"The number of players the toy is designed for (e.g., 2, 2-4, 1-6).\",\n",
|
||||
" )\n",
|
||||
" material: str = Field(\n",
|
||||
" description=\"The primary material(s) the toy is made of (e.g., wood, cardboard).\",\n",
|
||||
" )\n",
|
||||
" description: str = Field(\n",
|
||||
" description=\"A brief description of the toy and its components and dimensions.\",\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "mysu1i2qo9e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Results\n",
|
||||
"\n",
|
||||
"Again, our schema represents a **single toy product**, not the entire catalog. The system will return a `list[ToyCatalog]` with one entry per toy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5b38b806",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"result = await llama_extract.aextract(\n",
|
||||
" data_schema=ToyCatalog,\n",
|
||||
" files=\"./data/tables/Click-BS-Toys-Catalogue-2024.pdf\",\n",
|
||||
" config=ExtractConfig(\n",
|
||||
" extraction_mode=ExtractMode.PREMIUM,\n",
|
||||
" extraction_target=ExtractTarget.PER_TABLE_ROW,\n",
|
||||
" parse_model=\"anthropic-sonnet-4.5\",\n",
|
||||
" ),\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "91aface0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"153"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(result.data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "51278736",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA457',\n",
|
||||
" 'toy_name': 'Dots and Boxes',\n",
|
||||
" 'age_range': '6+',\n",
|
||||
" 'player_range': '2',\n",
|
||||
" 'material': 'wood',\n",
|
||||
" 'description': 'base 17x17 cm\\n50 border pieces 4x1,2x0,3 cm\\n34 trees 2,6x1,4 cm'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA456',\n",
|
||||
" 'toy_name': '3 In a Row',\n",
|
||||
" 'age_range': '8+',\n",
|
||||
" 'player_range': '2',\n",
|
||||
" 'material': 'wood, pine, cardboard',\n",
|
||||
" 'description': 'base 24x22,5x2,5 cm\\n30 cards 5,5x5 cm\\n6 chips'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA467',\n",
|
||||
" 'toy_name': 'Which Cow am i?',\n",
|
||||
" 'age_range': '6+',\n",
|
||||
" 'player_range': '2',\n",
|
||||
" 'material': 'wood, beech',\n",
|
||||
" 'description': '2 cow bases 56x4x4,5 cm\\n16 cards 4x5 cm'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA460',\n",
|
||||
" 'toy_name': 'Balance Bunnies',\n",
|
||||
" 'age_range': '4+',\n",
|
||||
" 'player_range': '2',\n",
|
||||
" 'material': 'wood',\n",
|
||||
" 'description': '1 base 35x12x25 cm\\n7 bunnies 7 foxes\\n1 dice 3 cm'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA462',\n",
|
||||
" 'toy_name': 'Color Combination Race',\n",
|
||||
" 'age_range': '4+',\n",
|
||||
" 'player_range': '2-4',\n",
|
||||
" 'material': 'wood, cardboard',\n",
|
||||
" 'description': 'base 6,5x6,5x15 cm, rings 5,5x5,5x0,5 mm\\ncardholder 6x6x2 cm, cards 5,5x5,5 cm\\ncolor cards Ø 15,5 cm - Ø 7 cm'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA465',\n",
|
||||
" 'toy_name': 'Plop It',\n",
|
||||
" 'age_range': '6+',\n",
|
||||
" 'player_range': '2-4',\n",
|
||||
" 'material': 'wood, elastic, cardboard',\n",
|
||||
" 'description': 'Catch the right balls and plop them in the net!\\n* 2 ploppers 8x5 cm\\n* 2 net holders Ø 5cm, length 55 cm\\n* 6 cards 1,5x2,5 cm, 30 balls Ø 2,5 cm\\n* 1 rope 120 cm'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA466',\n",
|
||||
" 'toy_name': 'Whack a Shape',\n",
|
||||
" 'age_range': '4+',\n",
|
||||
" 'player_range': '2-4',\n",
|
||||
" 'material': 'wood',\n",
|
||||
" 'description': '* base 38,5x15,5 cm\\n* 2 stands 36 half balls, 4 hammers\\n* 1 dice 2,5 cm\\n* 4 cards'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA458',\n",
|
||||
" 'toy_name': 'Sling Puck | Table Hockey',\n",
|
||||
" 'age_range': '6+',\n",
|
||||
" 'player_range': '2',\n",
|
||||
" 'material': 'wood',\n",
|
||||
" 'description': '* double sides base 39x21x3 cm\\n* 10 chips Ø 2,5 cm\\n* 2 pushers 4x4x3 cm'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA039',\n",
|
||||
" 'toy_name': 'DIY Birdhouse',\n",
|
||||
" 'age_range': '3+',\n",
|
||||
" 'player_range': '1',\n",
|
||||
" 'material': 'wood',\n",
|
||||
" 'description': '* house 9x9x13 cm'},\n",
|
||||
" {'section_name': 'Table Toys',\n",
|
||||
" 'product_code': 'GA319',\n",
|
||||
" 'toy_name': 'Triangle Domino',\n",
|
||||
" 'age_range': '6+',\n",
|
||||
" 'player_range': '2-4',\n",
|
||||
" 'material': 'wood',\n",
|
||||
" 'description': '* 35 triangles 10x10 x10 cm'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result.data[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d1810c0a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ezur9gnhmsb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Success!** Despite the semi-structured format, we extracted all **152 toy products** from the catalog (there's an extra repeated extracted toy from the Appendix section). LlamaExtract automatically detected the visual patterns separating each toy entry and applied our schema to each one."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "aeyr3io29u",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"The `PER_TABLE_ROW` extraction target is powerful for extracting repeating structured entities from documents. Key takeaways:\n",
|
||||
"\n",
|
||||
"1. **Schema design**: Define your schema for a single entity, not the full document. The system returns `list[YourSchema]`.\n",
|
||||
"\n",
|
||||
"2. **Works with various formats**: Not just traditional tables—any document with distinguishable repeating entities (bullets, numbering, headers, visual separation, etc.). The common requirement is that each entity should contain all the necessary data for your schema within its local context.\n",
|
||||
"\n",
|
||||
"3. **Automatic pattern detection**: LlamaExtract identifies the formatting patterns that distinguish entities and applies your schema to each one."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -31,6 +31,13 @@
|
||||
"| Sep-02-2025 | 0.6.62 | Active |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
 **⚠️ DEPRECATION">
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -4,31 +4,26 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Complete Parse → Classify → Extract Workflow with LlamaCloud Services\n",
|
||||
"# Document Classification + Extraction Workflow with LlamaCloud + LlamaIndex Workflows\n",
|
||||
"\n",
|
||||
"This notebook demonstrates the complete workflow for processing documents using LlamaCloud services:\n",
|
||||
"1. **Parse** - Extract and convert documents to markdown\n",
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/misc/parse_classify_extract_workflow.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
|
||||
"\n",
|
||||
"This notebook shows a multi-step agentic document workflow that uses the **parsing**, **classification** and **extraction** modules in LlamaCloud, orchestrated through **LlamaIndex Workflows**. The workflow can take in a complex input document, parse it into clean markdown, classify it according to its subtype, and extract data according to a specified schema for that subtype. This allows you to automate document extraction of various types within the same workflow instead of having to manually separate the data beforehand. \n",
|
||||
"\n",
|
||||
"This notebook uses the following modules:\n",
|
||||
"1. **Parse (LlamaParse)** - Extract and convert documents to markdown\n",
|
||||
"2. **Classify** - Categorize documents based on their content\n",
|
||||
"3. **Extract** - Extract structured data using the markdown as input via SourceText\n",
|
||||
"3. **Extract (LlamaExtract)** - Extract structured data using the markdown as input via SourceText\n",
|
||||
"4. **LlamaIndex Workflows** - Event-driven orchestration of the parse, classify and extract steps\n",
|
||||
"\n",
|
||||
"## Overview of the Workflow\n",
|
||||
"\n",
|
||||
"### 1. Parse Phase\n",
|
||||
"- Use `LlamaParse` to convert documents (PDFs, Word docs, etc.) into structured formats\n",
|
||||
"- Extract markdown content that preserves document structure\n",
|
||||
"- Get both raw text and markdown representations\n",
|
||||
"\n",
|
||||
"### 2. Classify Phase\n",
|
||||
"- Use `ClassifyClient` to categorize documents based on content\n",
|
||||
"- Apply classification rules to route documents appropriately\n",
|
||||
"- Handle different document types with specific processing logic\n",
|
||||
"\n",
|
||||
"### 3. Extract Phase\n",
|
||||
"- Use `LlamaExtract` with `SourceText` to extract structured data\n",
|
||||
"- Pass the markdown content as input for more accurate extraction\n",
|
||||
"- Define custom schemas for structured data extraction\n",
|
||||
"\n",
|
||||
"Let's walk through each step with practical examples."
|
||||
"The workflow is implemented as a proper LlamaIndex Workflow with separate steps for parsing, classification, and extraction, connected by typed events. This provides modularity, observability, and type safety."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
 **⚠️ DEPRECATION">
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -45,8 +40,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install required packages\n",
|
||||
"!pip install llama-cloud-services\n",
|
||||
"!pip install python-dotenv"
|
||||
"%pip install llama-cloud-services\n",
|
||||
"%pip install python-dotenv"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -73,7 +68,7 @@
|
||||
"nest_asyncio.apply()\n",
|
||||
"\n",
|
||||
"# Set up API key\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"\" # edit it\n",
|
||||
"# os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"\" # edit it\n",
|
||||
"\n",
|
||||
"# Setup Base URL\n",
|
||||
"# os.envrion[\"LLAMA_CLOUD_BASE_URL\"] = \"https://api.cloud.eu.llamaindex.ai/\" # update if necessay\n",
|
||||
@@ -99,7 +94,8 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"📁 financial_report.pdf already exists\n",
|
||||
"Downloading financial_report.pdf...\n",
|
||||
"✅ Downloaded financial_report.pdf\n",
|
||||
"📁 technical_spec.pdf already exists\n",
|
||||
"\n",
|
||||
"📂 Sample documents ready!\n"
|
||||
@@ -115,7 +111,7 @@
|
||||
"\n",
|
||||
"# Download sample documents\n",
|
||||
"docs_to_download = {\n",
|
||||
" \"financial_report.pdf\": \"https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf\",\n",
|
||||
" \"financial_report.pdf\": \"https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf\",\n",
|
||||
" \"technical_spec.pdf\": \"https://www.ti.com/lit/ds/symlink/lm317.pdf\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
@@ -155,10 +151,10 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"🔄 Parsing documents...\n",
|
||||
"Started parsing the file under job_id 8a8c76f9-354d-4275-91d8-312ff1adc762\n",
|
||||
"...✅ Parsed financial report (Job ID: 8a8c76f9-354d-4275-91d8-312ff1adc762)\n",
|
||||
"Started parsing the file under job_id 7e603448-ed80-4d18-948b-6801ed51c41b\n",
|
||||
"✅ Parsed technical spec (Job ID: 7e603448-ed80-4d18-948b-6801ed51c41b)\n",
|
||||
"Started parsing the file under job_id 530c187a-bd2d-4eea-b38d-9e5738eab465\n",
|
||||
".✅ Parsed financial report (Job ID: 530c187a-bd2d-4eea-b38d-9e5738eab465)\n",
|
||||
"Started parsing the file under job_id a6e27710-776b-4445-8b94-8d75959ff5db\n",
|
||||
"✅ Parsed technical spec (Job ID: a6e27710-776b-4445-8b94-8d75959ff5db)\n",
|
||||
"\n",
|
||||
"📄 Parsing complete!\n"
|
||||
]
|
||||
@@ -246,23 +242,23 @@
|
||||
"\n",
|
||||
"## 1 Features\n",
|
||||
"\n",
|
||||
"• Output voltage range:\n",
|
||||
"- Output voltage range:\n",
|
||||
" – Adjustable: 1.25V to 37V\n",
|
||||
"• Output current: 1.5A\n",
|
||||
"• Line regulation: 0.01%/V (typ)\n",
|
||||
"• Load regulation: 0.1% (typ)\n",
|
||||
"• Internal short-circuit current limiting\n",
|
||||
"• Thermal overload protection\n",
|
||||
"• Output safe-area compensation (new chip)\n",
|
||||
"• PSRR: 80dB at 120Hz for CADJ = 10μF (new chip)\n",
|
||||
"• Packages:\n",
|
||||
"- Output current: 1.5A\n",
|
||||
"- Line regulation: 0.01%/V (typ)\n",
|
||||
"- Load regulation: 0.1% (typ)\n",
|
||||
"- Internal short-circuit current limiting\n",
|
||||
"- Thermal overload protection\n",
|
||||
"- Output safe-area compensation (new chip)\n",
|
||||
"- PSRR: 80dB at 120Hz for CADJ = 10μF (new chip)\n",
|
||||
"- Packages:\n",
|
||||
" – 4-pin, SOT-223 (DCY)\n",
|
||||
" – 3-pin, TO-263 (KTT)\n",
|
||||
" – 3-pin, TO-220 (KCS, KCT),\n",
|
||||
"...\n",
|
||||
"\n",
|
||||
"📏 Financial report markdown length: 1348671 characters\n",
|
||||
"📏 Technical spec markdown length: 90971 characters\n"
|
||||
"📏 Financial report markdown length: 1338499 characters\n",
|
||||
"📏 Technical spec markdown length: 92483 characters\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -291,7 +287,7 @@
|
||||
"source": [
|
||||
"## Phase 2: Document Classification\n",
|
||||
"\n",
|
||||
"Next, let's classify our documents based on their content using the ClassifyClient."
|
||||
"Next, let's classify our documents based on their content using `LlamaClassify`."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -309,14 +305,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_cloud_services.beta.classifier.client import ClassifyClient\n",
|
||||
"from llama_cloud_services.beta.classifier.client import LlamaClassify\n",
|
||||
"from llama_cloud.types import ClassifierRule\n",
|
||||
"from llama_cloud_services.files.client import FileClient\n",
|
||||
"from llama_cloud.client import AsyncLlamaCloud\n",
|
||||
"\n",
|
||||
"# Initialize the classify client\n",
|
||||
"api_key = os.environ[\"LLAMA_CLOUD_API_KEY\"]\n",
|
||||
"classify_client = ClassifyClient.from_api_key(api_key)\n",
|
||||
"classify_client = LlamaClassify.from_api_key(api_key)\n",
|
||||
"\n",
|
||||
"print(\"🏷️ Setting up document classification...\")\n",
|
||||
"\n",
|
||||
@@ -339,6 +335,72 @@
|
||||
"print(f\"📝 Created {len(classification_rules)} classification rules\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Try Classification Independently\n",
|
||||
"\n",
|
||||
"Let's test the classification on one of our parsed documents to see how it works:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"🔍 Classifying financial document...\n",
|
||||
" Document length: 1,338,499 characters\n",
|
||||
"\n",
|
||||
"✅ Classification Result:\n",
|
||||
" Type: financial_document\n",
|
||||
" Confidence: 100.00%\n",
|
||||
" Reasoning: This document is a Form 10-K, which is an annual report required by the U.S. Securities and Exchange Commission (SEC) for publicly traded companies. It contains financial data, information about the c...\n",
|
||||
"\n",
|
||||
"======================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Let's classify the financial document\n",
|
||||
"print(\"🔍 Classifying financial document...\")\n",
|
||||
"print(f\" Document length: {len(financial_markdown):,} characters\\n\")\n",
|
||||
"\n",
|
||||
"# Write to temp file for classification\n",
|
||||
"import tempfile\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"with tempfile.NamedTemporaryFile(\n",
|
||||
" mode=\"w\", suffix=\".md\", delete=False, encoding=\"utf-8\"\n",
|
||||
") as tmp:\n",
|
||||
" tmp.write(financial_markdown)\n",
|
||||
" temp_financial_path = Path(tmp.name)\n",
|
||||
"\n",
|
||||
"# Classify the document\n",
|
||||
"financial_classification = await classify_client.aclassify_file_path(\n",
|
||||
" rules=classification_rules, file_input_path=str(temp_financial_path)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"doc_type = financial_classification.items[0].result.type\n",
|
||||
"confidence = financial_classification.items[0].result.confidence\n",
|
||||
"reasoning = financial_classification.items[0].result.reasoning\n",
|
||||
"\n",
|
||||
"print(f\"✅ Classification Result:\")\n",
|
||||
"print(f\" Type: {doc_type}\")\n",
|
||||
"print(f\" Confidence: {confidence:.2%}\")\n",
|
||||
"print(\n",
|
||||
" f\" Reasoning: {reasoning[:200]}...\"\n",
|
||||
" if reasoning and len(reasoning) > 200\n",
|
||||
" else f\" Reasoning: {reasoning}\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"\\n\" + \"=\" * 70)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -444,9 +506,31 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Complete Workflow Summary\n",
|
||||
"## Building the Complete Workflow\n",
|
||||
"\n",
|
||||
"Let's create a function that demonstrates the complete workflow:"
|
||||
"Now that we've seen how parsing works, let's build a complete 3-step workflow (Parse → Classify → Extract) using LlamaIndex Workflows. We'll define the workflow structure here, and you can see it in action below where we also demonstrate the classification and extraction modules independently.\n",
|
||||
"\n",
|
||||
"### Install Workflows Package\n",
|
||||
"\n",
|
||||
"First, let's install the LlamaIndex workflows package:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-index-workflows llama-index-utils-workflow"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Define the Workflow\n",
|
||||
"\n",
|
||||
"Let's restructure the document processing into a proper LlamaIndex Workflow with separate classification and extraction steps:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -458,7 +542,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"🔧 Workflow function defined!\n"
|
||||
"🔧 Workflow defined!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -466,81 +550,286 @@
|
||||
"import tempfile\n",
|
||||
"from pathlib import Path\n",
|
||||
"from llama_cloud import ExtractConfig\n",
|
||||
"from workflows import Workflow, step, Context\n",
|
||||
"from workflows.events import Event, StartEvent, StopEvent\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"async def complete_document_workflow(markdown_content: str):\n",
|
||||
"# Define workflow events\n",
|
||||
"class ParseEvent(Event):\n",
|
||||
" \"\"\"Event emitted after parsing\"\"\"\n",
|
||||
"\n",
|
||||
" file_path: str\n",
|
||||
" markdown_content: str\n",
|
||||
" job_id: str\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class ClassifyEvent(Event):\n",
|
||||
" \"\"\"Event emitted after classification\"\"\"\n",
|
||||
"\n",
|
||||
" markdown_content: str\n",
|
||||
" temp_path: str\n",
|
||||
" doc_type: str\n",
|
||||
" confidence: float\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class ExtractEvent(Event):\n",
|
||||
" \"\"\"Event emitted after extraction\"\"\"\n",
|
||||
"\n",
|
||||
" doc_type: str\n",
|
||||
" confidence: float\n",
|
||||
" extracted_data: dict\n",
|
||||
" markdown_length: int\n",
|
||||
" temp_path: str\n",
|
||||
" markdown_sample: str\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class DocumentWorkflow(Workflow):\n",
|
||||
" \"\"\"\n",
|
||||
" Complete workflow: Parse → Classify → Extract\n",
|
||||
" Complete document processing workflow: Parse → Classify → Extract\n",
|
||||
" \"\"\"\n",
|
||||
" print(f\"🚀 Starting complete workflow\")\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
"\n",
|
||||
" # Step 1: Classify\n",
|
||||
" print(\"🏷️ Step 2: Classifying document...\")\n",
|
||||
" def __init__(\n",
|
||||
" self,\n",
|
||||
" parser,\n",
|
||||
" classify_client,\n",
|
||||
" classification_rules,\n",
|
||||
" llama_extract,\n",
|
||||
" financial_schema,\n",
|
||||
" technical_schema,\n",
|
||||
" **kwargs,\n",
|
||||
" ):\n",
|
||||
" super().__init__(**kwargs)\n",
|
||||
" self.parser = parser\n",
|
||||
" self.classify_client = classify_client\n",
|
||||
" self.classification_rules = classification_rules\n",
|
||||
" self.llama_extract = llama_extract\n",
|
||||
" self.financial_schema = financial_schema\n",
|
||||
" self.technical_schema = technical_schema\n",
|
||||
"\n",
|
||||
" with tempfile.NamedTemporaryFile(\n",
|
||||
" mode=\"w\", suffix=\".md\", delete=False, encoding=\"utf-8\"\n",
|
||||
" ) as tmp:\n",
|
||||
" tmp.write(markdown_content)\n",
|
||||
" temp_path = Path(tmp.name)\n",
|
||||
" @step\n",
|
||||
" async def parse_document(self, ctx: Context, ev: StartEvent) -> ParseEvent:\n",
|
||||
" \"\"\"\n",
|
||||
" Step 1: Parse the document to extract markdown\n",
|
||||
" \"\"\"\n",
|
||||
" file_path = ev.file_path\n",
|
||||
" print(f\"📄 Step 1: Parsing document: {file_path}...\")\n",
|
||||
"\n",
|
||||
" print(temp_path)\n",
|
||||
" # Parse the document\n",
|
||||
" parse_result = await self.parser.aparse(file_path)\n",
|
||||
" markdown_content = await parse_result.aget_markdown()\n",
|
||||
" job_id = parse_result.job_id\n",
|
||||
"\n",
|
||||
" classification = await classify_client.aclassify_file_path(\n",
|
||||
" rules=classification_rules, file_input_path=str(temp_path)\n",
|
||||
" )\n",
|
||||
" doc_type = classification.items[0].result.type\n",
|
||||
" confidence = classification.items[0].result.confidence\n",
|
||||
" print(f\" ✅ Classified as: {doc_type} (confidence: {confidence:.2f})\")\n",
|
||||
" print(f\" ✅ Parsed successfully (Job ID: {job_id})\")\n",
|
||||
" print(f\" 📝 Extracted {len(markdown_content):,} characters\")\n",
|
||||
"\n",
|
||||
" # Step 2: Extract based on classification\n",
|
||||
" print(\"🔍 Step 3: Extracting structured data using SourceText...\")\n",
|
||||
" source_text = SourceText(\n",
|
||||
" text_content=markdown_content,\n",
|
||||
" filename=f\"{os.path.basename(temp_path)}_markdown.md\",\n",
|
||||
" )\n",
|
||||
" # Write event to stream for monitoring\n",
|
||||
" parse_event = ParseEvent(\n",
|
||||
" file_path=file_path,\n",
|
||||
" markdown_content=markdown_content,\n",
|
||||
" job_id=job_id,\n",
|
||||
" )\n",
|
||||
" ctx.write_event_to_stream(parse_event)\n",
|
||||
"\n",
|
||||
" # Choose schema based on classification\n",
|
||||
" if \"financial\" in doc_type.lower():\n",
|
||||
" schema = FinancialMetrics\n",
|
||||
" print(\" 📊 Using FinancialMetrics schema\")\n",
|
||||
" elif \"technical\" in doc_type.lower():\n",
|
||||
" schema = TechnicalSpec\n",
|
||||
" print(\" 🔧 Using TechnicalSpec schema\")\n",
|
||||
" else:\n",
|
||||
" schema = FinancialMetrics # Default fallback\n",
|
||||
" print(\" 📊 Using default FinancialMetrics schema\")\n",
|
||||
" return parse_event\n",
|
||||
"\n",
|
||||
" extract_config = ExtractConfig(\n",
|
||||
" extraction_mode=\"BALANCED\",\n",
|
||||
" )\n",
|
||||
" @step\n",
|
||||
" async def classify_document(self, ctx: Context, ev: ParseEvent) -> ClassifyEvent:\n",
|
||||
" \"\"\"\n",
|
||||
" Step 2: Classify the document based on its content\n",
|
||||
" \"\"\"\n",
|
||||
" markdown_content = ev.markdown_content\n",
|
||||
" print(\"🏷️ Step 2: Classifying document...\")\n",
|
||||
"\n",
|
||||
" extraction_result = llama_extract.extract(\n",
|
||||
" data_schema=schema, config=extract_config, files=source_text\n",
|
||||
" )\n",
|
||||
" # Write markdown to temp file for classification\n",
|
||||
" with tempfile.NamedTemporaryFile(\n",
|
||||
" mode=\"w\", suffix=\".md\", delete=False, encoding=\"utf-8\"\n",
|
||||
" ) as tmp:\n",
|
||||
" tmp.write(markdown_content)\n",
|
||||
" temp_path = Path(tmp.name)\n",
|
||||
"\n",
|
||||
" print(\" ✅ Extraction complete!\")\n",
|
||||
" # Classify the document\n",
|
||||
" classification = await self.classify_client.aclassify_file_path(\n",
|
||||
" rules=self.classification_rules, file_input_path=str(temp_path)\n",
|
||||
" )\n",
|
||||
" doc_type = classification.items[0].result.type\n",
|
||||
" confidence = classification.items[0].result.confidence\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" \"file_path\": temp_path,\n",
|
||||
" \"markdown_length\": len(markdown_content),\n",
|
||||
" \"classification\": doc_type,\n",
|
||||
" \"confidence\": confidence,\n",
|
||||
" \"extracted_data\": extraction_result.data,\n",
|
||||
" \"markdown_sample\": markdown_content[:200] + \"...\"\n",
|
||||
" if len(markdown_content) > 200\n",
|
||||
" else markdown_content,\n",
|
||||
" }\n",
|
||||
" print(f\" ✅ Classified as: {doc_type} (confidence: {confidence:.2f})\")\n",
|
||||
"\n",
|
||||
" # Write event to stream for monitoring\n",
|
||||
" classify_event = ClassifyEvent(\n",
|
||||
" markdown_content=markdown_content,\n",
|
||||
" temp_path=str(temp_path),\n",
|
||||
" doc_type=doc_type,\n",
|
||||
" confidence=confidence,\n",
|
||||
" )\n",
|
||||
" ctx.write_event_to_stream(classify_event)\n",
|
||||
"\n",
|
||||
" return classify_event\n",
|
||||
"\n",
|
||||
" @step\n",
|
||||
" async def extract_data(self, ctx: Context, ev: ClassifyEvent) -> ExtractEvent:\n",
|
||||
" \"\"\"\n",
|
||||
" Step 3: Extract structured data based on classification\n",
|
||||
" \"\"\"\n",
|
||||
" print(\"🔍 Step 3: Extracting structured data using SourceText...\")\n",
|
||||
"\n",
|
||||
" # Choose schema based on classification\n",
|
||||
" if \"financial\" in ev.doc_type.lower():\n",
|
||||
" schema = self.financial_schema\n",
|
||||
" print(\" 📊 Using FinancialMetrics schema\")\n",
|
||||
" elif \"technical\" in ev.doc_type.lower():\n",
|
||||
" schema = self.technical_schema\n",
|
||||
" print(\" 🔧 Using TechnicalSpec schema\")\n",
|
||||
" else:\n",
|
||||
" schema = self.financial_schema # Default fallback\n",
|
||||
" print(\" 📊 Using default FinancialMetrics schema\")\n",
|
||||
"\n",
|
||||
" # Create SourceText from markdown content\n",
|
||||
" source_text = SourceText(\n",
|
||||
" text_content=ev.markdown_content,\n",
|
||||
" filename=f\"{os.path.basename(ev.temp_path)}_markdown.md\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Configure extraction\n",
|
||||
" extract_config = ExtractConfig(\n",
|
||||
" extraction_mode=\"BALANCED\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Perform extraction\n",
|
||||
" extraction_result = self.llama_extract.extract(\n",
|
||||
" data_schema=schema, config=extract_config, files=source_text\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(\" ✅ Extraction complete!\")\n",
|
||||
"\n",
|
||||
" # Create markdown sample\n",
|
||||
" markdown_sample = (\n",
|
||||
" ev.markdown_content[:200] + \"...\"\n",
|
||||
" if len(ev.markdown_content) > 200\n",
|
||||
" else ev.markdown_content\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" extract_event = ExtractEvent(\n",
|
||||
" doc_type=ev.doc_type,\n",
|
||||
" confidence=ev.confidence,\n",
|
||||
" extracted_data=extraction_result.data,\n",
|
||||
" markdown_length=len(ev.markdown_content),\n",
|
||||
" temp_path=ev.temp_path,\n",
|
||||
" markdown_sample=markdown_sample,\n",
|
||||
" )\n",
|
||||
" ctx.write_event_to_stream(extract_event)\n",
|
||||
"\n",
|
||||
" return extract_event\n",
|
||||
"\n",
|
||||
" @step\n",
|
||||
" async def finalize_results(self, ctx: Context, ev: ExtractEvent) -> StopEvent:\n",
|
||||
" \"\"\"\n",
|
||||
" Step 4: Finalize and return results\n",
|
||||
" \"\"\"\n",
|
||||
" result = {\n",
|
||||
" \"file_path\": ev.temp_path,\n",
|
||||
" \"markdown_length\": ev.markdown_length,\n",
|
||||
" \"classification\": ev.doc_type,\n",
|
||||
" \"confidence\": ev.confidence,\n",
|
||||
" \"extracted_data\": ev.extracted_data,\n",
|
||||
" \"markdown_sample\": ev.markdown_sample,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" return StopEvent(result=result)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\"🔧 Workflow function defined!\")"
|
||||
"print(\"🔧 Workflow defined!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Run Complete Workflow on Both Documents"
|
||||
"### Workflow Structure\n",
|
||||
"\n",
|
||||
"The workflow consists of four steps connected by typed events:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"┌─────────────┐\n",
|
||||
"│ StartEvent │ (file_path)\n",
|
||||
"└──────┬──────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌──────────────────┐\n",
|
||||
"│ parse_document │ Step 1: Parse PDF to markdown\n",
|
||||
"└──────┬───────────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌─────────────┐\n",
|
||||
"│ ParseEvent │ (markdown_content, job_id)\n",
|
||||
"└──────┬──────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌─────────────────────┐\n",
|
||||
"│ classify_document │ Step 2: Classification\n",
|
||||
"└──────┬──────────────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌──────────────┐\n",
|
||||
"│ ClassifyEvent│ (doc_type, confidence, markdown_content)\n",
|
||||
"└──────┬───────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌──────────────┐\n",
|
||||
"│ extract_data │ Step 3: Extraction with schema selection\n",
|
||||
"└──────┬───────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌──────────────┐\n",
|
||||
"│ ExtractEvent │ (extracted_data, doc_type, confidence)\n",
|
||||
"└──────┬───────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌──────────────────┐\n",
|
||||
"│ finalize_results │ Step 4: Format and return results\n",
|
||||
"└──────┬───────────┘\n",
|
||||
" │\n",
|
||||
" ▼\n",
|
||||
"┌─────────────┐\n",
|
||||
"│ StopEvent │ (final result dictionary)\n",
|
||||
"└─────────────┘\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Key Features:**\n",
|
||||
"- **Step 1 (parse_document)**: Takes a file path and parses the document into clean markdown\n",
|
||||
"- **Step 2 (classify_document)**: Takes markdown content and classifies it into document types\n",
|
||||
"- **Step 3 (extract_data)**: Selects appropriate schema based on classification and extracts structured data\n",
|
||||
"- **Step 4 (finalize_results)**: Packages all results into final output format\n",
|
||||
"- Events are written to the stream for real-time monitoring\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualize the Workflow\n",
|
||||
"\n",
|
||||
"Let's visualize the workflow structure to see the flow of events:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialize the workflow\n",
|
||||
"workflow = DocumentWorkflow(\n",
|
||||
" parser=parser,\n",
|
||||
" classify_client=classify_client,\n",
|
||||
" classification_rules=classification_rules,\n",
|
||||
" llama_extract=llama_extract,\n",
|
||||
" financial_schema=FinancialMetrics,\n",
|
||||
" technical_schema=TechnicalSpec,\n",
|
||||
" timeout=300,\n",
|
||||
" verbose=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -552,53 +841,173 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"🚀 Starting complete workflow\n",
|
||||
"============================================================\n",
|
||||
"🏷️ Step 2: Classifying document...\n",
|
||||
"/var/folders/g6/4b5lpp5974gcpr890ybhbw4r0000gn/T/tmpos3b62tm.md\n",
|
||||
" ✅ Classified as: financial_document (confidence: 1.00)\n",
|
||||
"🔍 Step 3: Extracting structured data using SourceText...\n",
|
||||
" 📊 Using FinancialMetrics schema\n",
|
||||
".. ✅ Extraction complete!\n",
|
||||
"\n",
|
||||
"============================================================\n",
|
||||
"\n",
|
||||
"🚀 Starting complete workflow\n",
|
||||
"============================================================\n",
|
||||
"🏷️ Step 2: Classifying document...\n",
|
||||
"/var/folders/g6/4b5lpp5974gcpr890ybhbw4r0000gn/T/tmpppz9ub_m.md\n",
|
||||
" ✅ Classified as: technical_specification (confidence: 1.00)\n",
|
||||
"🔍 Step 3: Extracting structured data using SourceText...\n",
|
||||
" 🔧 Using TechnicalSpec schema\n",
|
||||
" ✅ Extraction complete!\n",
|
||||
"\n",
|
||||
"============================================================\n",
|
||||
"\n",
|
||||
"📋 Processed 2 documents successfully!\n"
|
||||
"document_workflow.html\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Process both documents through the complete workflow\n",
|
||||
"results = []\n",
|
||||
"# Draw the workflow visualization\n",
|
||||
"from llama_index.utils.workflow import draw_all_possible_flows\n",
|
||||
"\n",
|
||||
"for doc_text in document_texts:\n",
|
||||
" try:\n",
|
||||
" result = await complete_document_workflow(doc_text)\n",
|
||||
" results.append(result)\n",
|
||||
" print(\"\\n\" + \"=\" * 60 + \"\\n\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"❌ Error processing {doc_path}: {str(e)}\")\n",
|
||||
" print(\"\\n\" + \"=\" * 60 + \"\\n\")\n",
|
||||
"\n",
|
||||
"print(f\"📋 Processed {len(results)} documents successfully!\")"
|
||||
"draw_all_possible_flows(\n",
|
||||
" workflow,\n",
|
||||
" filename=\"document_workflow.html\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Final Results Summary"
|
||||
"The workflow has been visualized and saved to `document_workflow.html`. You can open this file in a browser to see the interactive workflow diagram.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The workflow visualization shows:\n",
|
||||
"1. **StartEvent** → **parse_document** step\n",
|
||||
"2. **ParseEvent** → **classify_document** step\n",
|
||||
"3. **ClassifyEvent** → **extract_data** step \n",
|
||||
"4. **ExtractEvent** → **finalize_results** step\n",
|
||||
"5. **StopEvent** (final output)\n",
|
||||
"\n",
|
||||
"Each step is connected by typed events, allowing for clean separation of concerns and easy monitoring of the workflow execution.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Run the Workflow on Both Documents\n",
|
||||
"\n",
|
||||
"Now let's run the workflow on both documents and monitor the events:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"======================================================================\n",
|
||||
"🚀 Processing Document 1: sample_docs/financial_report.pdf\n",
|
||||
"======================================================================\n",
|
||||
"\n",
|
||||
"Running step parse_document\n",
|
||||
"📄 Step 1: Parsing document: sample_docs/financial_report.pdf...\n",
|
||||
"Started parsing the file under job_id bb53c6bf-79cc-4f63-9c97-16983d59f29d\n",
|
||||
". ✅ Parsed successfully (Job ID: bb53c6bf-79cc-4f63-9c97-16983d59f29d)\n",
|
||||
" 📝 Extracted 1,338,499 characters\n",
|
||||
"Step parse_document produced event ParseEvent\n",
|
||||
"📄 Parse Event: Extracted 1,338,499 characters\n",
|
||||
"Running step classify_document\n",
|
||||
"🏷️ Step 2: Classifying document...\n",
|
||||
" ✅ Classified as: financial_document (confidence: 1.00)\n",
|
||||
"Step classify_document produced event ClassifyEvent\n",
|
||||
"📊 Classification Event: financial_document (1.00)\n",
|
||||
"Running step extract_data\n",
|
||||
"🔍 Step 3: Extracting structured data using SourceText...\n",
|
||||
" 📊 Using FinancialMetrics schema\n",
|
||||
".. ✅ Extraction complete!\n",
|
||||
"Step extract_data produced event ExtractEvent\n",
|
||||
"Running step finalize_results\n",
|
||||
"Step finalize_results produced event StopEvent\n",
|
||||
"✅ Extraction Event: 7 fields extracted\n",
|
||||
"\n",
|
||||
"✅ Document 1 processed successfully!\n",
|
||||
"\n",
|
||||
"======================================================================\n",
|
||||
"🚀 Processing Document 2: sample_docs/technical_spec.pdf\n",
|
||||
"======================================================================\n",
|
||||
"\n",
|
||||
"Running step parse_document\n",
|
||||
"📄 Step 1: Parsing document: sample_docs/technical_spec.pdf...\n",
|
||||
"Started parsing the file under job_id 944905c1-3c49-431a-ad86-4436d16f3d1c\n",
|
||||
" ✅ Parsed successfully (Job ID: 944905c1-3c49-431a-ad86-4436d16f3d1c)\n",
|
||||
" 📝 Extracted 92,483 characters\n",
|
||||
"Step parse_document produced event ParseEvent\n",
|
||||
"📄 Parse Event: Extracted 92,483 characters\n",
|
||||
"Running step classify_document\n",
|
||||
"🏷️ Step 2: Classifying document...\n",
|
||||
" ✅ Classified as: technical_specification (confidence: 1.00)\n",
|
||||
"Step classify_document produced event ClassifyEvent\n",
|
||||
"📊 Classification Event: technical_specification (1.00)\n",
|
||||
"Running step extract_data\n",
|
||||
"🔍 Step 3: Extracting structured data using SourceText...\n",
|
||||
" 🔧 Using TechnicalSpec schema\n",
|
||||
" ✅ Extraction complete!\n",
|
||||
"Step extract_data produced event ExtractEvent\n",
|
||||
"Running step finalize_results\n",
|
||||
"Step finalize_results produced event StopEvent\n",
|
||||
"✅ Extraction Event: 8 fields extracted\n",
|
||||
"\n",
|
||||
"✅ Document 2 processed successfully!\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"📋 Processed 2 documents successfully!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Process both documents through the workflow\n",
|
||||
"results = []\n",
|
||||
"\n",
|
||||
"# Define the document files to process\n",
|
||||
"document_files = [\n",
|
||||
" \"sample_docs/financial_report.pdf\",\n",
|
||||
" \"sample_docs/technical_spec.pdf\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for i, file_path in enumerate(document_files, 1):\n",
|
||||
" print(f\"\\n{'='*70}\")\n",
|
||||
" print(f\"🚀 Processing Document {i}: {file_path}\")\n",
|
||||
" print(f\"{'='*70}\\n\")\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" # Run the workflow\n",
|
||||
" handler = workflow.run(file_path=file_path)\n",
|
||||
"\n",
|
||||
" # Monitor events as they are emitted\n",
|
||||
" async for event in handler.stream_events():\n",
|
||||
" if isinstance(event, ParseEvent):\n",
|
||||
" print(\n",
|
||||
" f\"📄 Parse Event: Extracted {len(event.markdown_content):,} characters\"\n",
|
||||
" )\n",
|
||||
" elif isinstance(event, ClassifyEvent):\n",
|
||||
" print(\n",
|
||||
" f\"📊 Classification Event: {event.doc_type} ({event.confidence:.2f})\"\n",
|
||||
" )\n",
|
||||
" elif isinstance(event, ExtractEvent):\n",
|
||||
" print(\n",
|
||||
" f\"✅ Extraction Event: {len(event.extracted_data)} fields extracted\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Get final result\n",
|
||||
" result = await handler\n",
|
||||
" results.append(result)\n",
|
||||
"\n",
|
||||
" print(f\"\\n✅ Document {i} processed successfully!\")\n",
|
||||
"\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"❌ Error processing document {i}: {str(e)}\")\n",
|
||||
" import traceback\n",
|
||||
"\n",
|
||||
" traceback.print_exc()\n",
|
||||
"\n",
|
||||
"print(f\"\\n\\n📋 Processed {len(results)} documents successfully!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Final Results Summary\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -613,9 +1022,9 @@
|
||||
"📈 COMPLETE WORKFLOW RESULTS SUMMARY\n",
|
||||
"======================================================================\n",
|
||||
"\n",
|
||||
"📄 Document 1: tmpos3b62tm.md\n",
|
||||
"📄 Document 1: tmpuyxzpd3x.md\n",
|
||||
" 📊 Classification: financial_document (confidence: 1.00)\n",
|
||||
" 📝 Markdown length: 1,348,671 characters\n",
|
||||
" 📝 Markdown length: 1,338,499 characters\n",
|
||||
" 📋 Markdown sample: \n",
|
||||
"\n",
|
||||
"# UNITED STATES\n",
|
||||
@@ -629,14 +1038,14 @@
|
||||
" • company_name: Uber Technologies, Inc.\n",
|
||||
" • document_type: Annual Report on Form 10-K\n",
|
||||
" • fiscal_year: 2021\n",
|
||||
" • revenue_2021: $21,764\n",
|
||||
" • net_income_2021: $(496)\n",
|
||||
" • key_business_segments: ['Mobility', 'Delivery', 'Freight', 'All Other (including former New Mobility, e-bikes, e-scooters, Advanced Technologies Group and other technology programs)']\n",
|
||||
" • risk_factors: [\"The company faces numerous risk factors across its business operations and environment. The COVID-19 pandemic and related mitigation measures have adversely affected parts of the business, including reduced demand for Mobility offerings and creating ongoing uncertainties. The company's operational and financial performance is influenced by competitive pressure in the mobility, delivery, and logistics industries, characterized by well-established alternatives, low barriers to entry, and low switching costs. Driver classification risks exist if Drivers are deemed employees, workers, or quasi-employees rather than independent contractors, exposing the company to legal actions and financial liabilities globally. Competition challenges require the company to sometimes lower fares, offer incentives, and promotions, which impacts profitability. There are significant operating losses historically with substantial future operating expense increases anticipated, and the ability to achieve or maintain profitability is uncertain. Network value depends on maintaining critical mass among Drivers, consumers, merchants, shippers, and carriers, and failures to do so diminish platform attractiveness. Brand and reputation maintenance is critical, with exposure to negative publicity, media coverage, and risks from associated companies' brands or licensed brands in joint ventures.\\n\\nOperational risks include historical workplace culture and compliance challenges, management complexity due to rapid growth, technological infrastructure issues potentially causing disruptions or poor user experience, and security or data privacy breaches that could impact revenue and reputation. Platform users may engage in or be subjected to criminal, violent, or dangerous activity leading to safety incidents and legal actions. New offerings and technologies investments are inherently risky without guaranteed benefits. 
Economic conditions, inflation, and increased costs (fuel, food, labor, energy) may negatively impact results. Regulatory risks are extensive and global, involving payment and financial services compliance, licensing, anti-money laundering laws, data privacy (GDPR, CCPA, LGPD), and labor laws. Legal and regulatory investigations and inquiries, including antitrust, FCPA, labor classification, data protection, and intellectual property matters, pose risks of fines, penalties, operational changes, and increased costs.\\n\\nGeopolitical and jurisdictional risks include operating limitations or bans in some locations, currency exchange risk, and complex evolving regulations with the potential for fines and loss of licenses or permits. Insurance risks include potential inadequacy of reserves, liability exposure from accidents or impersonation, and insurer insolvency. Driver qualification requirements and background checks may increase costs or fail to expose all relevant information, with associated insurance cost risks and potential for courtroom or regulatory challenges to pricing models.\\n\\nFinancial risks comprise significant accumulated deficits, requirement for additional capital with uncertain availability, debt obligations, tax exposure including uncertain positions and observed changes in tax laws, and volatility in common stock price with no expected cash dividends. Accounting judgments and estimates involve critical assumptions affecting reported financial metrics related to goodwill, revenue recognition, incentive accruals, and stock-based compensation. Cybersecurity risks include exposures to malware, ransomware, phishing, and other cyberattacks. 
Climate change presents physical and transitional risks that may impact operations and costs, and failure to meet climate commitments may have operational and reputational consequences.\\n\\nOther risks include potential liability under anti-corruption and anti-terrorism laws, adverse effects from defaults under debt agreements, limitations in takeover actions due to corporate governance provisions, and the impact of non-GAAP financial measure limitations. Overall, these diverse and interconnected risk factors contribute to significant uncertainty regarding the company's future business prospects, operating results, and financial condition.\"]\n",
|
||||
" • revenue_2021: $17,455 and $21,764\n",
|
||||
" • net_income_2021: $(496) to (700)\n",
|
||||
" • key_business_segments: ['Borrower and the Restricted Subsidiaries', 'Holdings', 'Guarantors', 'Material Domestic Subsidiaries', 'Material Foreign Subsidiaries']\n",
|
||||
" • risk_factors: ['Indemnification obligations of the borrower for losses, claims, damages, liabilities, and out-of-pocket expenses incurred by agents, lenders, arrangers, and related parties in connection with the agreement or loans, except in certain cases such as gross negligence, bad faith, willful misconduct, or material breach by the indemnitee.', \"Borrower not required to indemnify any indemnitee for settlements entered into without the borrower's consent.\", 'Limitation of liability for special, indirect, consequential, or punitive damages, and for damages from unauthorized use of information, except for direct damages resulting from gross negligence, bad faith, or willful misconduct.', 'Obligation of the borrower to indemnify the administrative agent for liabilities arising from performance of duties, except in cases of gross negligence, bad faith, or willful misconduct.', 'Limitations and conditions on assignments and participations of lender rights, including restrictions on assignments to disqualified institutions, loan parties, affiliates of loan parties, defaulting lenders, and natural persons.', 'Setoff rights for lenders and issuing banks after an event of default, allowing them to apply borrower deposits toward obligations under the agreement.', 'Potential for increased obligations under the agreement as a result of changes in law affecting payment terms.', 'Requirement for the borrower and guarantors to provide information to comply with anti-money laundering rules and the USA PATRIOT Act.']\n",
|
||||
"\n",
|
||||
"📄 Document 2: tmpppz9ub_m.md\n",
|
||||
"📄 Document 2: tmp7ower2xm.md\n",
|
||||
" 📊 Classification: technical_specification (confidence: 1.00)\n",
|
||||
" 📝 Markdown length: 90,971 characters\n",
|
||||
" 📝 Markdown length: 92,483 characters\n",
|
||||
" 📋 Markdown sample: \n",
|
||||
"\n",
|
||||
"LM317\n",
|
||||
@@ -648,20 +1057,14 @@
|
||||
" 🎯 Extracted fields: 8 fields\n",
|
||||
" • component_name: LM317\n",
|
||||
" • manufacturer: Texas Instruments\n",
|
||||
" • part_number: LM317\n",
|
||||
" • description: The LM317 is an adjustable three-pin, positive-voltage regulator capable of supplying up to 1.5A over an output voltage range of 1.25V to 37V. It features line and load regulation, internal current limiting, thermal overload protection, and safe operating area compensation.\n",
|
||||
" • part_number: LM317, SLVS044Z\n",
|
||||
" • description: The LM317 is an adjustable three-pin, positive-voltage regulator capable of supplying more than 1.5A (typically up to 1.5A) over an output voltage range of 1.25V to 37V. The device requires only two external resistors to set the output voltage. It features a typical line regulation of 0.01% and typical load regulation of 0.1%. The LM317 includes current limiting, thermal overload protection, and safe operating area protection. Overload protection remains functional even if the ADJUST pin is disconnected. The regulator is used in applications such as constant-current battery-charger circuits, slow turn-on 15V regulator circuits, AC voltage-regulator circuits, current-limited charger circuits, and high-current and adjustable regulator circuits. It is available in packages including SOT-223 (DCY), TO-220 (KCS), and TO-263 (KTT).\n",
|
||||
" • operating_voltage: {'min_voltage': 1.25, 'max_voltage': 37.0, 'unit': 'V'}\n",
|
||||
" • maximum_current: 1.5\n",
|
||||
" • key_features: ['Adjustable output voltage: 1.25V to 37V', 'Output current up to 1.5A', 'Line regulation: 0.01%/V (typical)', 'Load regulation: 0.1% (typical)', 'Internal short-circuit current limiting', 'Thermal overload protection', 'Output safe-area compensation', 'High power supply rejection ratio (PSRR): 80dB at 120Hz (new chip)', 'Available in SOT-223, TO-263, and TO-220 packages']\n",
|
||||
" • applications: ['Multifunction printers', 'AC drive power stage modules', 'Electricity meters', 'Servo drive control modules', 'Merchant network and server power supply units']\n",
|
||||
" • maximum_current: 4.0\n",
|
||||
" • key_features: ['Adjustable output voltage range: 1.25V to 37V', 'Output current up to 1.5A (up to 4A with external pass elements)', 'Line regulation: typically 0.01%/V', 'Load regulation: typically 0.1%', 'Internal short-circuit current limiting / Current limiting', 'Thermal overload protection / Thermal shutdown', 'Output safe-area compensation / Safe operating area protection', 'PSRR: 80dB at 120Hz for CADJ = 10μF (new chip)', 'NPN Darlington output drive', 'Programmable feedback', 'Multiple package options (SOT-223, TO-220, TO-263)', 'Can be used in constant-current, battery-charging, and regulator applications']\n",
|
||||
" • applications: ['Multifunction printers, AC drive power stage modules, Electricity meters, Servo drive control modules, Merchant network and server PSU, Adjustable voltage regulator, 0V to 30V regulator circuit, Regulator circuit with improved ripple rejection, Precision current-limiter, Tracking preregulator, 1.25V to 20V regulator, Battery charger circuit, Constant-current battery charger circuits, Slow turn-on regulator, AC voltage-regulator, Current-limited charger circuits, High-current adjustable regulator circuits, General-purpose adjustable power supply']\n",
|
||||
"\n",
|
||||
"✨ Workflow completed successfully!\n",
|
||||
"\n",
|
||||
"📚 Key Learnings:\n",
|
||||
" • Parse: Converted documents to clean markdown format\n",
|
||||
" • Classify: Automatically categorized document types\n",
|
||||
" • Extract: Used SourceText with markdown for structured data extraction\n",
|
||||
" • The markdown content provides much better context for extraction than raw PDFs\n"
|
||||
"✨ Workflow completed successfully!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -683,14 +1086,7 @@
|
||||
" for key, value in extracted.items():\n",
|
||||
" print(f\" • {key}: {value}\")\n",
|
||||
"\n",
|
||||
"print(\"\\n✨ Workflow completed successfully!\")\n",
|
||||
"print(\"\\n📚 Key Learnings:\")\n",
|
||||
"print(\" • Parse: Converted documents to clean markdown format\")\n",
|
||||
"print(\" • Classify: Automatically categorized document types\")\n",
|
||||
"print(\" • Extract: Used SourceText with markdown for structured data extraction\")\n",
|
||||
"print(\n",
|
||||
" \" • The markdown content provides much better context for extraction than raw PDFs\"\n",
|
||||
")"
|
||||
"print(\"\\n✨ Workflow completed successfully!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -699,54 +1095,33 @@
|
||||
"source": [
|
||||
"## Conclusion\n",
|
||||
"\n",
|
||||
"This notebook demonstrated the complete **Parse → Classify → Extract** workflow using LlamaCloud services:\n",
|
||||
"This notebook shows how to build an end-to-end document **Classify → Extract** workflow using LlamaCloud. It interleaves two of our core building blocks: **classification** and **document extraction**.\n",
|
||||
"\n",
|
||||
"### Key Components:\n",
|
||||
"### Main Components:\n",
|
||||
"\n",
|
||||
"1. **LlamaParse** (`llama_cloud_services.parse.base.LlamaParse`):\n",
|
||||
" - Converts documents to clean, structured markdown\n",
|
||||
" - Preserves document structure and formatting\n",
|
||||
" - Handles various file types (PDF, DOCX, etc.)\n",
|
||||
"\n",
|
||||
"2. **ClassifyClient** (`llama_cloud_services.beta.classifier.client.ClassifyClient`):\n",
|
||||
"2. **LlamaClassify** (`llama_cloud_services.beta.classifier.client.LlamaClassify`):\n",
|
||||
" - Automatically categorizes documents based on content\n",
|
||||
" - Uses customizable rules for classification\n",
|
||||
" - Provides confidence scores for classifications\n",
|
||||
"\n",
|
||||
"3. **LlamaExtract with SourceText** (`llama_cloud_services.extract.extract.LlamaExtract`, `SourceText`):\n",
|
||||
" - Extracts structured data using custom Pydantic schemas\n",
|
||||
" - **SourceText** allows using markdown content as input instead of raw files\n",
|
||||
" - Provides much better extraction accuracy when using processed markdown\n",
|
||||
" - You can either feed in the file directly (in which case parsing will happen under the hood), or the parsed text through the **SourceText** object (which is the case in this example) \n",
|
||||
"\n",
|
||||
"### Workflow Benefits:\n",
|
||||
"\n",
|
||||
"- **Better Accuracy**: Using markdown from parsing provides cleaner, more structured input for extraction\n",
|
||||
"- **Automatic Routing**: Classification allows different processing logic for different document types\n",
|
||||
"- **Structured Output**: Custom schemas ensure consistent, structured data extraction\n",
|
||||
"- **Flexible Input**: SourceText supports text content, file paths, and bytes\n",
|
||||
"\n",
|
||||
"### Key Insights:\n",
|
||||
"\n",
|
||||
"1. **SourceText is the bridge**: It allows you to pass the clean markdown content from parsing directly to extraction\n",
|
||||
"2. **Markdown improves extraction**: Pre-processed markdown provides much better context than raw PDFs\n",
|
||||
"3. **Classification enables smart routing**: Different document types can use different extraction schemas\n",
|
||||
"4. **End-to-end automation**: The entire workflow can be automated for production use\n",
|
||||
"\n",
|
||||
"This approach is ideal for production document processing pipelines where you need to:\n",
|
||||
"- Process various document types automatically\n",
|
||||
"- Extract structured data consistently\n",
|
||||
"- Maintain high accuracy and reliability\n",
|
||||
"- Handle documents at scale\n",
|
||||
"\n",
|
||||
"The combination of these three services provides a powerful, flexible document processing pipeline that can handle complex, real-world document processing requirements."
|
||||
"**Benefits of an end-to-end workflow**: The main benefit of running Classify → Extract, instead of Extract alone, is that you can handle documents of different types (and therefore different expected schemas) within the same workflow, without first having to split the data by type and run a separate extraction on each subset."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"display_name": "llama_parse",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "llama_parse"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -27,6 +27,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e2b422f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2e4f707a-c7b5-473f-b4a6-881e2245e82d",
|
||||
|
||||
@@ -14,6 +14,13 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
⚠️ DEPRECATION NOTICE:
|
||||
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
|
||||
Please migrate to: pip install llama-cloud>=1.0 (https://github.com/run-llama/llama-cloud-py)
|
||||
"""
|
||||
"""
|
||||
Example: Batch Processing a Folder of PDFs with LlamaParse
|
||||
|
||||
This script demonstrates how to process multiple PDFs from a folder
|
||||
using LlamaParse with controlled concurrency using asyncio and semaphores.
|
||||
|
||||
Usage:
|
||||
python batch_parse_folder.py --input-dir ./pdfs --max-concurrent 5
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
from llama_cloud_services import LlamaParse
|
||||
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
|
||||
|
||||
async def parse_single_file(
    parser: "LlamaParse",
    file_path: Path,
    semaphore: asyncio.Semaphore,
) -> Dict[str, Any]:
    """
    Parse a single PDF file with concurrency control.

    Args:
        parser: LlamaParse instance
        file_path: Path to the PDF file
        semaphore: Semaphore to control concurrent requests

    Returns:
        Dictionary with file info and parse result. On success it holds
        "file", "status" ("success"), "result" and "pages"; on failure it
        holds "file", "status" ("error") and "error" (the exception message).
    """
    async with semaphore:
        try:
            print(f"Starting parse: {file_path.name}")

            result = await parser.aparse(str(file_path))

            # Count pages once, tolerating a missing/empty page list, so a
            # successful parse is not reported as an error merely because
            # the progress message could not call len() on the pages.
            page_count = len(result.pages) if result.pages else 0

            print(f"✓ Completed: {file_path.name} ({page_count} pages)")

            return {
                "file": file_path.name,
                "status": "success",
                "result": result,
                "pages": page_count,
            }
        except Exception as e:
            # Deliberately broad: a failure is reported in the returned dict
            # instead of propagating to the caller.
            print(f"✗ Error parsing {file_path.name}: {str(e)}")
            return {
                "file": file_path.name,
                "status": "error",
                "error": str(e),
            }
|
||||
|
||||
|
||||
async def parse_folder(
    input_dir: Path,
    max_concurrent: int = 5,
    api_key: str = None,
) -> List[Dict[str, Any]]:
    """
    Parse all PDFs in a folder with controlled concurrency.

    Args:
        input_dir: Directory containing PDF files
        max_concurrent: Maximum number of concurrent parse operations
            (must be at least 1)
        api_key: LlamaCloud API key, passed through to LlamaParse

    Returns:
        List of parse results for each file (empty if no PDFs were found)

    Raises:
        ValueError: If max_concurrent is smaller than 1
    """
    # A semaphore initialised to 0 never lets any task in, so the whole run
    # would hang forever; reject that up front with a clear message.
    if max_concurrent < 1:
        raise ValueError(f"max_concurrent must be at least 1, got {max_concurrent}")

    # Find all PDF files (sorted so the processing order is reproducible)
    pdf_files = sorted(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return []

    print(f"Found {len(pdf_files)} PDF files to parse")

    # Initialize parser
    parser = LlamaParse(
        api_key=api_key,
        num_workers=1,  # We control concurrency with semaphore
        show_progress=False,  # We'll show our own progress
    )

    # Create semaphore to limit concurrent requests
    semaphore = asyncio.Semaphore(max_concurrent)

    # Create tasks for all files
    tasks = [parse_single_file(parser, pdf_file, semaphore) for pdf_file in pdf_files]

    # Run all tasks concurrently (but limited by semaphore)
    print(
        f"Processing {len(tasks)} files with max {max_concurrent} concurrent operations..."
    )
    start_time = datetime.now()

    results = await asyncio.gather(*tasks)

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    # Split results by the "status" field set by parse_single_file
    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") == "error"]

    # Print summary
    print("PARSE SUMMARY \n")
    print(f"Total files: {len(pdf_files)}")
    print(f"Successful: {len(successful)}")
    print(f"Failed: {len(failed)}")
    print(f"Total time: {duration:.2f} seconds")
    # pdf_files is non-empty here (early return above), so no division by zero
    print(f"Average time per file: {duration / len(pdf_files):.2f} seconds")

    if failed:
        print("\nFailed files:")
        for result in failed:
            print(f"  - {result['file']}: {result.get('error', 'Unknown error')}")

    return results
|
||||
|
||||
|
||||
def main():
    """Main entry point for the script.

    Reads --input-dir and --max-concurrent from the command line, validates
    them, reads LLAMA_CLOUD_API_KEY from the environment and runs
    parse_folder. Every validation failure prints an "Error: ..." message
    and returns without parsing anything.
    """
    arg_parser = argparse.ArgumentParser(
        description="Batch process PDFs in a folder with LlamaParse"
    )
    arg_parser.add_argument(
        "--input-dir",
        type=str,
        required=True,
        help="Directory containing PDF files to parse",
    )
    arg_parser.add_argument(
        "--max-concurrent",
        type=int,
        default=5,
        help="Maximum number of concurrent parse operations (default: 5)",
    )

    args = arg_parser.parse_args()

    input_dir = Path(args.input_dir)

    # Validate input directory
    if not input_dir.exists():
        print(f"Error: Input directory does not exist: {input_dir}")
        return

    if not input_dir.is_dir():
        print(f"Error: Input path is not a directory: {input_dir}")
        return

    # Validate concurrency: a limit of 0 would create a semaphore that never
    # admits a task (the script would hang), and a negative limit is invalid.
    if args.max_concurrent < 1:
        print(
            f"Error: --max-concurrent must be at least 1 (got {args.max_concurrent})"
        )
        return

    # Get API key from environment (loaded from .env file)
    api_key = os.getenv("LLAMA_CLOUD_API_KEY")
    if not api_key:
        print("Error: LLAMA_CLOUD_API_KEY not found. Please set it in your .env file")
        return

    # Run async function
    asyncio.run(
        parse_folder(
            input_dir=input_dir,
            max_concurrent=args.max_concurrent,
            api_key=api_key,
        )
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -17,6 +17,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0cb82ca8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ef115dbe-b834-4639-828e-e2c11aef710b",
|
||||
|
||||
@@ -18,6 +18,13 @@
|
||||
"| Aug-18-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -14,6 +14,13 @@
|
||||
"| Aug-18-2025 | N/A | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -14,6 +14,13 @@
|
||||
"| Aug-18-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -18,6 +18,13 @@
|
||||
"| Aug-18-2025 | 0.6.61 | Maintained |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-18-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bb595498",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a004db48-8d3f-421c-915a-477692f71b90",
|
||||
|
||||
@@ -16,6 +16,13 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Deprecated |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8b937443",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a004db48-8d3f-421c-915a-477692f71b90",
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "037cc6d9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a004db48-8d3f-421c-915a-477692f71b90",
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7aa3be47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -21,6 +21,13 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -8,6 +8,14 @@
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/demo_starter_multimodal.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "da52cfa3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4e081457",
|
||||
|
||||
@@ -7,6 +7,13 @@
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/demo_starter_parse_selected_pages.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -14,6 +14,13 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -17,6 +17,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a3636937",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5f7d99ad-6ebd-47d0-92a7-566630b0c22a",
|
||||
|
||||
@@ -7,6 +7,13 @@
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/excel/o1_excel_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -17,6 +17,14 @@
|
||||
"| Before Feb 2025 | N/A | Deprecated |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0facb0b9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e8db8ac2-5221-44de-a53e-cb5ab37ac8f5",
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bb943339",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "17e62444",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-19-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fe7e837a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "15e60ecf-519c-41fc-911b-765adaf8bad4",
|
||||
|
||||
@@ -9,6 +9,13 @@
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/multimodal/insurance_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -23,6 +23,13 @@
|
||||
"- [US Immigration Case](https://github.com/user-attachments/files/16536446/us_immigration_case.pdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -27,6 +27,14 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "93d4f9ab",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
|
||||
|
||||
@@ -27,6 +27,14 @@
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fc1b5803",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Aug-20-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7dafd458",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
|
||||
|
||||
@@ -21,6 +21,14 @@
|
||||
"We use our workflow abstraction to define an agentic system that contains two main phases: a research phase that pulls in relevant files through chunk-level or file-level retrieval, and then a blog generation phase that synthesizes the final report."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8c881021",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "54e8d9a7-5036-4d32-818f-00b2e888521f",
|
||||
|
||||
@@ -9,6 +9,13 @@
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_cloud_services/blob/main/examples/parse/multimodal/product_manual_rag.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -19,6 +19,14 @@
|
||||
"| Prior to Feb-2025 | N/A | Deprecated |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b27f0e78",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -14,6 +14,13 @@
|
||||
"| Prior to Feb-2025 | N/A | Deprecated |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -29,6 +29,13 @@
|
||||
"In this demonstration, we showcase how parsing instructions can be used to extract specific information from unstructured documents. Using a McDonald's Receipt, we show how to ignore parts of the document and only parse the price of each order and the final amount to be paid."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -18,6 +18,13 @@
|
||||
"Many documents can have varying complexity across pages - some pages have text, and other pages have images. The text-only pages only require cheap parsing modes, whereas the image-based pages require more advanced modes. In this notebook we show you how to take advantage of \"auto mode\" in LlamaParse which adaptively parses different pages according to different modes, which lets you get optimal performance at the cheapest cost.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -37,6 +37,13 @@
|
||||
"With visual references, you can build applications that preserve document structure and provide users with trustworthy, traceable visual citations. We will now leverage this feature to build our query engine."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -24,6 +24,13 @@
|
||||
"| Aug-18-2025 | 0.6.61 | Maintained |"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -26,6 +26,14 @@
|
||||
"We use LlamaParse to parse the context documents as well as the RFP document itself."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ad140aef",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -22,6 +22,14 @@
|
||||
"**NOTE**: The pricing for LlamaParse + gpt4o is an order more expensive than using LlamaParse by default. Currently, every page parsed with gpt4o counts for 10 pages in the LlamaParse usage tracker.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "211c52fe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
This project uses LlamaSheets to extract data from spreadsheets for analysis.
|
||||
|
||||
## Current Project Structure
|
||||
|
||||
- `data/` - Contains extracted parquet files from LlamaSheets
|
||||
- `{name}_region_{N}.parquet` - Table data files
|
||||
- `{name}_metadata_{N}.parquet` - Cell metadata files
|
||||
- `{name}_job_metadata.json` - Extraction job information
|
||||
- `scripts/` - Analysis and helper scripts
|
||||
- `reports/` - Your generated reports and outputs
|
||||
|
||||
## Working with LlamaSheets Data
|
||||
|
||||
### Understanding the Files
|
||||
|
||||
When a spreadsheet is extracted, you'll find:
|
||||
|
||||
1. **Table parquet files** (`region_*.parquet`): The actual table data
|
||||
- Columns correspond to spreadsheet columns
|
||||
- Data types are preserved (dates, numbers, strings, booleans)
|
||||
|
||||
2. **Metadata parquet files** (`metadata_*.parquet`): Rich cell-level metadata
|
||||
- Formatting: `font_bold`, `font_italic`, `font_size`, `background_color_rgb`
|
||||
- Position: `row_number`, `column_number`, `coordinate` (e.g., "A1")
|
||||
- Type detection: `data_type`, `is_date_like`, `is_percentage`, `is_currency`
|
||||
- Layout: `is_in_first_row`, `is_merged_cell`, `horizontal_alignment`
|
||||
- Content: `cell_value`, `raw_cell_value`
|
||||
|
||||
3. **Job metadata JSON** (`job_metadata.json`): Overall extraction results
|
||||
- `regions[]`: List of extracted regions with IDs, locations, and titles/descriptions
|
||||
- `worksheet_metadata[]`: Generated titles and descriptions
|
||||
- `status`: Success/failure status
|
||||
|
||||
### Key Principles
|
||||
|
||||
1. **Use metadata to understand structure**: Bold cells often indicate headers, and colors often indicate groupings
|
||||
2. **Validate before analysis**: Check data types, look for missing values
|
||||
3. **Preserve formatting context**: The metadata tells you what the spreadsheet author emphasized
|
||||
4. **Save intermediate results**: Store cleaned data as new parquet files
|
||||
|
||||
### Common Patterns
|
||||
|
||||
**Loading data:**
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
df = pd.read_parquet("data/region_1_Sheet1.parquet")
|
||||
meta_df = pd.read_parquet("data/metadata_1_Sheet1.parquet")
|
||||
```
|
||||
|
||||
**Finding headers:**
|
||||
```python
|
||||
headers = meta_df[meta_df["font_bold"] == True]["cell_value"].tolist()
|
||||
```
|
||||
|
||||
**Finding date columns:**
|
||||
```python
|
||||
date_cols = meta_df[meta_df["is_date_like"] == True]["column_number"].unique()
|
||||
```
|
||||
|
||||
## Tools Available
|
||||
|
||||
- **Python 3.11+**: For data analysis
|
||||
- **pandas**: DataFrame manipulation
|
||||
- **pyarrow**: Parquet file reading
|
||||
- **matplotlib**: Visualization (optional)
|
||||
|
||||
## Guidelines
|
||||
|
||||
- Always read the job_metadata.json first to understand what was extracted
|
||||
- Check both table data and metadata before making assumptions
|
||||
- Write reusable functions for common operations
|
||||
- Document any data quality issues discovered
|
||||
@@ -0,0 +1,283 @@
|
||||
"""
|
||||
⚠️ DEPRECATION NOTICE:
|
||||
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
|
||||
Please migrate to: pip install "llama-cloud>=1.0" (https://github.com/run-llama/llama-cloud-py)
|
||||
|
||||
Generate sample spreadsheets for LlamaSheets + Claude workflows.
|
||||
|
||||
This script creates example Excel files that demonstrate different use cases:
|
||||
1. Simple data table (for Workflow 1)
|
||||
2. Regional sales data (for Workflow 2)
|
||||
3. Complex budget with formatting (for Workflow 3)
|
||||
4. Weekly sales report (for Workflow 4)
|
||||
|
||||
Usage:
|
||||
python generate_sample_data.py
|
||||
"""
|
||||
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment
|
||||
|
||||
|
||||
def generate_workflow_1_data(output_dir: Path) -> None:
    """Generate the simple quarterly financial report used by Workflow 1.

    Builds one row per financial category with a random integer amount for
    each month of the quarter, then writes the table to
    ``financial_report_q1.xlsx`` (sheet ``Q1 Summary``) with a styled header
    row.

    Args:
        output_dir: Directory the workbook is written into.
    """
    print("📊 Generating Workflow 1: financial_report_q1.xlsx")

    months = ["January", "February", "March"]
    # Inclusive (low, high) bounds for the random monthly amount of each
    # category. Every category -- Net Income included -- is drawn
    # independently; Net Income is not derived from the other rows.
    value_ranges = {
        "Revenue": (80000, 120000),
        "Cost of Goods Sold": (30000, 50000),
        "Operating Expenses": (20000, 35000),
        "Net Income": (15000, 40000),
    }

    data = []
    for category, (low, high) in value_ranges.items():
        row: dict[str, str | int] = {"Category": category}
        for month in months:
            row[month] = random.randint(low, high)
        data.append(row)

    df = pd.DataFrame(data)

    # Write to Excel
    output_file = output_dir / "financial_report_q1.xlsx"
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name="Q1 Summary", index=False)

        # Style the header row: bold white text on a solid 4F81BD fill.
        worksheet = writer.sheets["Q1 Summary"]
        for cell in worksheet[1]:
            cell.fill = PatternFill(
                start_color="4F81BD", end_color="4F81BD", fill_type="solid"
            )
            cell.font = Font(color="FFFFFF", bold=True)

    print(f" ✅ Created {output_file}")
|
||||
|
||||
|
||||
def generate_workflow_2_data(output_dir: Path) -> None:
    """Generate one regional sales workbook per region for Workflow 2.

    For each region, 90 consecutive days starting 2024-01-01 are simulated
    with 3-8 random sales per day, and the rows are written to
    ``sales_<region>.xlsx`` (sheet "Sales") inside *output_dir*.

    Args:
        output_dir: Directory in which the workbooks are written.
    """
    print("\n📊 Generating Workflow 2: Regional sales data")

    product_names = ["Widget A", "Widget B", "Widget C", "Gadget X", "Gadget Y"]
    first_day = datetime(2024, 1, 1)

    for region_name in ("northeast", "southeast", "west"):
        records = []

        # Generate 90 days of sales data
        for offset in range(90):
            day_label = (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")

            # Random number of sales per day (3-8)
            sales_today = random.randint(3, 8)
            for _ in range(sales_today):
                item = random.choice(product_names)
                quantity = random.randint(1, 20)
                unit_price = random.randint(50, 200)
                records.append(
                    {
                        "Date": day_label,
                        "Product": item,
                        "Units_Sold": quantity,
                        "Revenue": quantity * unit_price,
                    }
                )

        frame = pd.DataFrame(records)

        # Write to Excel
        target = output_dir / f"sales_{region_name}.xlsx"
        frame.to_excel(target, sheet_name="Sales", index=False)
        print(f" ✅ Created {target} ({len(frame)} rows)")
|
||||
|
||||
|
||||
def generate_workflow_3_data(output_dir: Path) -> None:
    """Generate the formatted budget workbook used by Workflow 3.

    Writes ``company_budget_2024.xlsx`` with a single "Budget" sheet laid out
    as: a merged, centered title row; a bold header row ("Category", "Item",
    then one column per department); and, per category, a bold category row
    followed by one row per item whose department cells hold random amounts
    with a department-specific fill color and a currency number format.

    Args:
        output_dir: Directory in which the workbook is written.
    """
    # Local import: the file's top-level imports do not include this helper.
    from openpyxl.utils import get_column_letter

    print("\n📊 Generating Workflow 3: company_budget_2024.xlsx")

    wb = Workbook()
    ws = wb.active
    ws.title = "Budget"

    # Define departments with colors (hex RGB used as the cell fill)
    departments = {
        "Engineering": "C6E0B4",
        "Marketing": "FFD966",
        "Sales": "F4B084",
        "Operations": "B4C7E7",
    }

    # Define categories
    categories = {
        "Personnel": ["Salaries", "Benefits", "Training"],
        "Infrastructure": ["Office Rent", "Equipment", "Software Licenses"],
        "Operations": ["Travel", "Supplies", "Miscellaneous"],
    }

    # Styles
    header_font = Font(bold=True, size=12)
    category_font = Font(bold=True, size=11)

    # Columns 1-2 hold "Category"/"Item"; departments start at column 3.
    first_dept_col = 3
    last_col = first_dept_col + len(departments) - 1

    row = 1

    # Title: merge across the full table width so it is centered over every
    # column (the previous fixed "A:E" range left the last department out).
    ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=last_col)
    title_cell = ws.cell(row, 1, "2024 Annual Budget")
    title_cell.font = Font(bold=True, size=14)
    title_cell.alignment = Alignment(horizontal="center")
    row += 2

    # Headers
    for col, label in enumerate(["Category", "Item", *departments], start=1):
        ws.cell(row, col, label).font = header_font
    row += 1

    # Data
    for category, items in categories.items():
        # Category header (bold)
        ws.cell(row, 1, category).font = category_font
        row += 1

        # Items with department budgets
        for item in items:
            ws.cell(row, 1, "")
            ws.cell(row, 2, item)

            # Add budget amounts for each department (with color)
            for col, color in enumerate(departments.values(), start=first_dept_col):
                cell = ws.cell(row, col, random.randint(5000, 50000))
                cell.fill = PatternFill(
                    start_color=color, end_color=color, fill_type="solid"
                )
                cell.number_format = "$#,##0"

            row += 1

        row += 1  # Blank row between categories

    # Adjust column widths
    ws.column_dimensions["A"].width = 20
    ws.column_dimensions["B"].width = 25
    for col in range(first_dept_col, last_col + 1):
        ws.column_dimensions[get_column_letter(col)].width = 15

    output_file = output_dir / "company_budget_2024.xlsx"
    wb.save(output_file)
    print(f" ✅ Created {output_file}")
    print(" • Bold categories, colored departments, merged title cell")
|
||||
|
||||
|
||||
def generate_workflow_4_data(output_dir: Path) -> None:
    """Generate the weekly sales workbook used by Workflow 4.

    Simulates seven days starting 2024-11-04; every product gets 3-10 random
    transactions per day. The rows are written to ``sales_weekly.xlsx``
    (sheet "Weekly Sales") with a bold header row.

    Args:
        output_dir: Directory in which the workbook is written.
    """
    print("\n📊 Generating Workflow 4: sales_weekly.xlsx")

    product_names = [f"Product {letter}" for letter in "ABCDEFGH"]
    week_start = datetime(2024, 11, 4)  # Monday

    # Generate one week of data
    records = []
    for offset in range(7):
        day_label = (week_start + timedelta(days=offset)).strftime("%Y-%m-%d")

        # Each product has 3-10 transactions per day
        for name in product_names:
            transaction_count = random.randint(3, 10)
            for _ in range(transaction_count):
                quantity = random.randint(1, 15)
                unit_price = random.randint(20, 150)
                records.append(
                    {
                        "Date": day_label,
                        "Product": name,
                        "Units": quantity,
                        "Revenue": quantity * unit_price,
                    }
                )

    frame = pd.DataFrame(records)

    # Write to Excel with some formatting
    target = output_dir / "sales_weekly.xlsx"
    with pd.ExcelWriter(target, engine="openpyxl") as writer:
        frame.to_excel(writer, sheet_name="Weekly Sales", index=False)

        # Format header
        sheet = writer.sheets["Weekly Sales"]
        for header_cell in sheet[1]:
            header_cell.font = Font(bold=True)

    print(f" ✅ Created {target} ({len(frame)} rows)")
|
||||
|
||||
|
||||
def main() -> None:
    """Generate every sample workbook and print a summary of the results.

    Creates the ``input_data`` directory (relative to the current working
    directory) if it does not exist, runs the four workflow generators in
    order, and lists the files that belong to each workflow.
    """
    banner = "=" * 60
    print(banner)
    print("Generating Sample Data for LlamaSheets + Coding Agent Workflows")
    print(banner)

    # Create output directory
    output_dir = Path("input_data")
    output_dir.mkdir(exist_ok=True)

    # Generate data for each workflow
    for generate in (
        generate_workflow_1_data,
        generate_workflow_2_data,
        generate_workflow_3_data,
        generate_workflow_4_data,
    ):
        generate(output_dir)

    print("\n" + banner)
    print("✅ All sample data generated!")
    print(banner)
    print(f"\nFiles created in {output_dir.absolute()}:")

    # (heading, files) pairs, printed in workflow order.
    summary = (
        ("Workflow 1 (Understanding a New Spreadsheet)", ("financial_report_q1.xlsx",)),
        (
            "Workflow 2 (Generating Analysis Scripts)",
            ("sales_northeast.xlsx", "sales_southeast.xlsx", "sales_west.xlsx"),
        ),
        ("Workflow 3 (Using Cell Metadata)", ("company_budget_2024.xlsx",)),
        ("Workflow 4 (Complete Automation)", ("sales_weekly.xlsx",)),
    )
    for heading, filenames in summary:
        print(f"\n{heading}:")
        for filename in filenames:
            print(f" • {filename}")

    print("\nYou can now use these files with the workflows in the documentation!")
|
||||
|
||||
|
||||
# Script entry point: generate the sample files only when run directly.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,5 @@
|
||||
llama-cloud-services # LlamaSheets SDK
|
||||
pandas>=2.0.0
|
||||
pyarrow>=12.0.0
|
||||
openpyxl>=3.0.0 # For Excel file support
|
||||
matplotlib>=3.7.0 # For visualizations (optional)
|
||||
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
⚠️ DEPRECATION NOTICE:
|
||||
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
|
||||
Please migrate to: pip install llama-cloud>=1.0 (https://github.com/run-llama/llama-cloud-py)
|
||||
"""
|
||||
"""Helper script to extract spreadsheets using LlamaSheets."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import dotenv
|
||||
from pathlib import Path
|
||||
|
||||
from llama_cloud_services.beta.sheets import LlamaSheets
|
||||
from llama_cloud_services.beta.sheets.types import (
|
||||
SpreadsheetParsingConfig,
|
||||
SpreadsheetResultType,
|
||||
)
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
|
||||
async def extract_spreadsheet(
    file_path: str, output_dir: str = "data", generate_metadata: bool = True
) -> dict:
    """Extract a spreadsheet with LlamaSheets and save the results to disk.

    Submits *file_path* for region extraction (all sheets), then writes into
    *output_dir*:

    - ``<stem>_job_metadata.json``: the job result dumped as JSON
    - ``<stem>_region_<n>_<sheet>.parquet``: data of region *n*
    - ``<stem>_metadata_<n>_<sheet>.parquet``: cell metadata of region *n*

    where ``<stem>`` is the input file name without extension, *n* counts
    regions from 1, and spaces in the sheet name are replaced by underscores.

    Args:
        file_path: Path of the spreadsheet to extract.
        output_dir: Directory for the output files; created if missing.
        generate_metadata: Forwarded to the parsing config as
            ``generate_additional_metadata``.

    Returns:
        The job result as a JSON-compatible dict.
    """
    # The API key is read from the LLAMA_CLOUD_API_KEY environment variable.
    sheets_client = LlamaSheets(
        base_url="https://api.cloud.llamaindex.ai",
        api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
    )

    print(f"Extracting {file_path}...")

    # Extract regions
    parsing_config = SpreadsheetParsingConfig(
        sheet_names=None,  # Extract all sheets
        generate_additional_metadata=generate_metadata,
    )
    job = await sheets_client.aextract_regions(file_path, config=parsing_config)
    print(f"Extracted {len(job.regions)} region(s)")

    # Create output directory
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    # Base name shared by every output file
    stem = Path(file_path).stem

    # Save job metadata
    job_json_path = destination / f"{stem}_job_metadata.json"
    job_json_path.write_text(json.dumps(job.model_dump(mode="json"), indent=2))
    print(f"Saved job metadata to {job_json_path}")

    # Download each region, followed by its cell metadata
    for number, region in enumerate(job.regions, 1):
        sheet_label = region.sheet_name.replace(" ", "_")

        table_bytes = await sheets_client.adownload_region_result(
            job_id=job.id,
            region_id=region.region_id,
            result_type=region.region_type,
        )
        table_path = destination / f"{stem}_region_{number}_{sheet_label}.parquet"
        table_path.write_bytes(table_bytes)
        print(f" Table {number}: {table_path}")

        cell_metadata_bytes = await sheets_client.adownload_region_result(
            job_id=job.id,
            region_id=region.region_id,
            result_type=SpreadsheetResultType.CELL_METADATA,
        )
        cell_metadata_path = (
            destination / f"{stem}_metadata_{number}_{sheet_label}.parquet"
        )
        cell_metadata_path.write_bytes(cell_metadata_bytes)
        print(f" Metadata {number}: {cell_metadata_path}")

    print(f"\nAll files saved to {destination}/")

    return job.model_dump(mode="json")
|
||||
|
||||
|
||||
# Command-line entry point: extract the spreadsheet given as the first argument.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python scripts/extract.py <spreadsheet_file>")
        sys.exit(1)

    file_path = cli_args[0]
    if not Path(file_path).exists():
        print(f"❌ File not found: {file_path}")
        sys.exit(1)

    result = asyncio.run(extract_spreadsheet(file_path))
    job_id = result["id"]
    print(f"\n✅ Extraction complete! Job ID: {job_id}")
|
||||
@@ -0,0 +1,283 @@
|
||||
"""
|
||||
⚠️ DEPRECATION NOTICE:
|
||||
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
|
||||
Please migrate to: pip install llama-cloud>=1.0 (https://github.com/run-llama/llama-cloud-py)
|
||||
"""
|
||||
"""
|
||||
Generate sample spreadsheets for LlamaSheets + LlamaIndex Agent workflows.
|
||||
|
||||
This script creates example Excel files that demonstrate different use cases:
|
||||
1. Simple data table (for Workflow 1)
|
||||
2. Regional sales data (for Workflow 2)
|
||||
3. Complex budget with formatting (for Workflow 3)
|
||||
4. Weekly sales report (for Workflow 4)
|
||||
|
||||
Usage:
|
||||
python generate_sample_data.py
|
||||
"""
|
||||
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment
|
||||
|
||||
|
||||
def generate_workflow_1_data(output_dir: Path) -> None:
    """Generate the simple Q1 financial report used by Workflow 1.

    Builds one row per category and one column per month, fills every cell
    with a random integer, and writes the table to
    ``financial_report_q1.xlsx`` (sheet "Q1 Summary") with a styled header.

    Note that "Net Income" is drawn at random like the other rows; it is not
    derived from Revenue, Cost of Goods Sold and Operating Expenses.

    Args:
        output_dir: Directory in which the workbook is written.
    """
    print("📊 Generating Workflow 1: financial_report_q1.xlsx")

    # Create sample quarterly data
    months = ["January", "February", "March"]
    # Inclusive (low, high) bounds passed to random.randint for each category.
    value_ranges = {
        "Revenue": (80000, 120000),
        "Cost of Goods Sold": (30000, 50000),
        "Operating Expenses": (20000, 35000),
        "Net Income": (15000, 40000),
    }

    data = []
    for category, (low, high) in value_ranges.items():
        row: dict[str, str | int] = {"Category": category}
        for month in months:
            row[month] = random.randint(low, high)
        data.append(row)

    df = pd.DataFrame(data)

    # Header style: white bold text on a blue fill.
    header_font = Font(color="FFFFFF", bold=True)
    header_fill = PatternFill(
        start_color="4F81BD", end_color="4F81BD", fill_type="solid"
    )

    # Write to Excel
    output_file = output_dir / "financial_report_q1.xlsx"
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name="Q1 Summary", index=False)

        # Format the header row (row 1) while the writer is still open so the
        # styling is saved together with the data.
        worksheet = writer.sheets["Q1 Summary"]
        for cell in worksheet[1]:
            cell.fill = header_fill
            cell.font = header_font

    print(f" ✅ Created {output_file}")
|
||||
|
||||
|
||||
def generate_workflow_2_data(output_dir: Path) -> None:
    """Generate one regional sales workbook per region for Workflow 2.

    For each region, 90 consecutive days starting 2024-01-01 are simulated
    with 3-8 random sales per day, and the rows are written to
    ``sales_<region>.xlsx`` (sheet "Sales") inside *output_dir*.

    Args:
        output_dir: Directory in which the workbooks are written.
    """
    print("\n📊 Generating Workflow 2: Regional sales data")

    product_names = ["Widget A", "Widget B", "Widget C", "Gadget X", "Gadget Y"]
    first_day = datetime(2024, 1, 1)

    for region_name in ("northeast", "southeast", "west"):
        records = []

        # Generate 90 days of sales data
        for offset in range(90):
            day_label = (first_day + timedelta(days=offset)).strftime("%Y-%m-%d")

            # Random number of sales per day (3-8)
            sales_today = random.randint(3, 8)
            for _ in range(sales_today):
                item = random.choice(product_names)
                quantity = random.randint(1, 20)
                unit_price = random.randint(50, 200)
                records.append(
                    {
                        "Date": day_label,
                        "Product": item,
                        "Units_Sold": quantity,
                        "Revenue": quantity * unit_price,
                    }
                )

        frame = pd.DataFrame(records)

        # Write to Excel
        target = output_dir / f"sales_{region_name}.xlsx"
        frame.to_excel(target, sheet_name="Sales", index=False)
        print(f" ✅ Created {target} ({len(frame)} rows)")
|
||||
|
||||
|
||||
def generate_workflow_3_data(output_dir: Path) -> None:
    """Generate the formatted budget workbook used by Workflow 3.

    Writes ``company_budget_2024.xlsx`` with a single "Budget" sheet laid out
    as: a merged, centered title row; a bold header row ("Category", "Item",
    then one column per department); and, per category, a bold category row
    followed by one row per item whose department cells hold random amounts
    with a department-specific fill color and a currency number format.

    Args:
        output_dir: Directory in which the workbook is written.
    """
    # Local import: the file's top-level imports do not include this helper.
    from openpyxl.utils import get_column_letter

    print("\n📊 Generating Workflow 3: company_budget_2024.xlsx")

    wb = Workbook()
    ws = wb.active
    ws.title = "Budget"

    # Define departments with colors (hex RGB used as the cell fill)
    departments = {
        "Engineering": "C6E0B4",
        "Marketing": "FFD966",
        "Sales": "F4B084",
        "Operations": "B4C7E7",
    }

    # Define categories
    categories = {
        "Personnel": ["Salaries", "Benefits", "Training"],
        "Infrastructure": ["Office Rent", "Equipment", "Software Licenses"],
        "Operations": ["Travel", "Supplies", "Miscellaneous"],
    }

    # Styles
    header_font = Font(bold=True, size=12)
    category_font = Font(bold=True, size=11)

    # Columns 1-2 hold "Category"/"Item"; departments start at column 3.
    first_dept_col = 3
    last_col = first_dept_col + len(departments) - 1

    row = 1

    # Title: merge across the full table width so it is centered over every
    # column (the previous fixed "A:E" range left the last department out).
    ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=last_col)
    title_cell = ws.cell(row, 1, "2024 Annual Budget")
    title_cell.font = Font(bold=True, size=14)
    title_cell.alignment = Alignment(horizontal="center")
    row += 2

    # Headers
    for col, label in enumerate(["Category", "Item", *departments], start=1):
        ws.cell(row, col, label).font = header_font
    row += 1

    # Data
    for category, items in categories.items():
        # Category header (bold)
        ws.cell(row, 1, category).font = category_font
        row += 1

        # Items with department budgets
        for item in items:
            ws.cell(row, 1, "")
            ws.cell(row, 2, item)

            # Add budget amounts for each department (with color)
            for col, color in enumerate(departments.values(), start=first_dept_col):
                cell = ws.cell(row, col, random.randint(5000, 50000))
                cell.fill = PatternFill(
                    start_color=color, end_color=color, fill_type="solid"
                )
                cell.number_format = "$#,##0"

            row += 1

        row += 1  # Blank row between categories

    # Adjust column widths
    ws.column_dimensions["A"].width = 20
    ws.column_dimensions["B"].width = 25
    for col in range(first_dept_col, last_col + 1):
        ws.column_dimensions[get_column_letter(col)].width = 15

    output_file = output_dir / "company_budget_2024.xlsx"
    wb.save(output_file)
    print(f" ✅ Created {output_file}")
    print(" • Bold categories, colored departments, merged title cell")
|
||||
|
||||
|
||||
def generate_workflow_4_data(output_dir: Path) -> None:
    """Generate the weekly sales workbook used by Workflow 4.

    Simulates seven days starting 2024-11-04; every product gets 3-10 random
    transactions per day. The rows are written to ``sales_weekly.xlsx``
    (sheet "Weekly Sales") with a bold header row.

    Args:
        output_dir: Directory in which the workbook is written.
    """
    print("\n📊 Generating Workflow 4: sales_weekly.xlsx")

    product_names = [f"Product {letter}" for letter in "ABCDEFGH"]
    week_start = datetime(2024, 11, 4)  # Monday

    # Generate one week of data
    records = []
    for offset in range(7):
        day_label = (week_start + timedelta(days=offset)).strftime("%Y-%m-%d")

        # Each product has 3-10 transactions per day
        for name in product_names:
            transaction_count = random.randint(3, 10)
            for _ in range(transaction_count):
                quantity = random.randint(1, 15)
                unit_price = random.randint(20, 150)
                records.append(
                    {
                        "Date": day_label,
                        "Product": name,
                        "Units": quantity,
                        "Revenue": quantity * unit_price,
                    }
                )

    frame = pd.DataFrame(records)

    # Write to Excel with some formatting
    target = output_dir / "sales_weekly.xlsx"
    with pd.ExcelWriter(target, engine="openpyxl") as writer:
        frame.to_excel(writer, sheet_name="Weekly Sales", index=False)

        # Format header
        sheet = writer.sheets["Weekly Sales"]
        for header_cell in sheet[1]:
            header_cell.font = Font(bold=True)

    print(f" ✅ Created {target} ({len(frame)} rows)")
|
||||
|
||||
|
||||
def main() -> None:
    """Generate every sample workbook and print a summary of the results.

    Creates the ``input_data`` directory (relative to the current working
    directory) if it does not exist, runs the four workflow generators in
    order, and lists the files that belong to each workflow.
    """
    banner = "=" * 60
    print(banner)
    print("Generating Sample Data for LlamaSheets + Coding Agent Workflows")
    print(banner)

    # Create output directory
    output_dir = Path("input_data")
    output_dir.mkdir(exist_ok=True)

    # Generate data for each workflow
    for generate in (
        generate_workflow_1_data,
        generate_workflow_2_data,
        generate_workflow_3_data,
        generate_workflow_4_data,
    ):
        generate(output_dir)

    print("\n" + banner)
    print("✅ All sample data generated!")
    print(banner)
    print(f"\nFiles created in {output_dir.absolute()}:")

    # (heading, files) pairs, printed in workflow order.
    summary = (
        ("Workflow 1 (Understanding a New Spreadsheet)", ("financial_report_q1.xlsx",)),
        (
            "Workflow 2 (Generating Analysis Scripts)",
            ("sales_northeast.xlsx", "sales_southeast.xlsx", "sales_west.xlsx"),
        ),
        ("Workflow 3 (Using Cell Metadata)", ("company_budget_2024.xlsx",)),
        ("Workflow 4 (Complete Automation)", ("sales_weekly.xlsx",)),
    )
    for heading, filenames in summary:
        print(f"\n{heading}:")
        for filename in filenames:
            print(f" • {filename}")

    print("\nYou can now use these files with the workflows in the documentation!")
|
||||
|
||||
|
||||
# Script entry point: generate the sample files only when run directly.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,297 @@
|
||||
"""
|
||||
⚠️ DEPRECATION NOTICE:
|
||||
This example uses the deprecated llama-cloud-services package, which will be maintained until May 1, 2026.
|
||||
Please migrate to: pip install llama-cloud>=1.0 (https://github.com/run-llama/llama-cloud-py)
|
||||
"""
|
||||
"""
|
||||
LlamaSheets Agent with LlamaIndex
|
||||
|
||||
This example shows how to build an agent that can work with spreadsheet data
|
||||
extracted by LlamaSheets using Python code execution.
|
||||
|
||||
The agent has minimal tools but maximum flexibility - it can execute arbitrary
|
||||
pandas code against the extracted data, similar to a coding agent.
|
||||
|
||||
NOTE: Code execution should be handled safely in a sandboxed environment for security.
|
||||
"""
|
||||
|
||||
import io
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import dotenv
|
||||
import pandas as pd
|
||||
from llama_index.core.agent import FunctionAgent, ToolCall, ToolCallResult, AgentStream
|
||||
from llama_index.llms.openai import OpenAI
|
||||
from workflows import Context
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
# Global context for executed code
|
||||
_code_context: Dict[str, Any] = {}
|
||||
|
||||
|
||||
# Helper function for initial agent context
|
||||
def list_extracted_data(data_dir: str = "data") -> str:
    """List all region and metadata files extracted by LlamaSheets.

    This helps discover what data is available to work with. Files are
    reported in sorted path order so the listing is stable between runs.

    Args:
        data_dir: Directory containing extracted parquet files (default: "data")

    Returns:
        JSON string describing the available files. On success it contains
        ``data_directory``, ``num_regions``, ``regions`` (one entry per
        ``*_region_*.parquet`` file with its path, the matching metadata file
        or ``null``, its shape and its column labels) and
        ``job_metadata_files``. If *data_dir* does not exist, the JSON holds a
        single ``error`` message instead.
    """
    data_path = Path(data_dir)

    if not data_path.exists():
        return json.dumps({"error": f"Data directory '{data_dir}' not found"})

    # Find all parquet and metadata files. glob() yields paths in arbitrary
    # order, so sort to keep the listing deterministic.
    region_files = sorted(data_path.glob("*_region_*.parquet"))
    job_metadata_files = sorted(data_path.glob("*_job_metadata.json"))

    regions = []
    for region_file in region_files:
        # The whole file is read just to report its dimensions and columns.
        df = pd.read_parquet(region_file)

        # The metadata file shares the region file's name with "_region_"
        # swapped for "_metadata_".
        metadata_name = region_file.stem.replace("_region_", "_metadata_")
        metadata_path = region_file.with_name(f"{metadata_name}.parquet")

        regions.append(
            {
                "region_file": str(region_file),
                "metadata_file": str(metadata_path) if metadata_path.exists() else None,
                "shape": {"rows": len(df), "columns": len(df.columns)},
                # Stringify labels so non-string column names stay JSON-serializable.
                "columns": [str(column) for column in df.columns],
            }
        )

    result = {
        "data_directory": str(data_path.absolute()),
        "num_regions": len(regions),
        "regions": regions,
        "job_metadata_files": [str(f) for f in job_metadata_files],
    }

    return json.dumps(result, indent=2)
|
||||
|
||||
|
||||
# Agent tool for code execution against dataframes
|
||||
def execute_code(code: str) -> str:
    """
    Execute Python pandas code against LlamaSheets extracted data.

    This tool allows flexible data analysis by executing arbitrary pandas code.
    You can load parquet files, manipulate dataframes, and return results.

    The code executes in a context where:
    - pandas is available as 'pd'
    - json is available for formatting output
    - Path (from pathlib) is available
    - variables defined by earlier successful executions are still available,
      except 'result', which is reset before every execution

    Args:
        code: Python code to execute. Any print() statements or stdout/stderr
            will be captured and returned. Optionally set a 'result' variable
            for structured output.

    Returns:
        String containing:
        - Any stdout/stderr output from the code execution
        - The 'result' variable if it was set (formatted appropriately)
        - Error message if execution failed

    Example usage:
        code = '''
        # Load and inspect data
        df = pd.read_parquet("data/sales_region_1.parquet")
        print(f"Loaded {len(df)} rows")

        result = {
            "shape": df.shape,
            "columns": list(df.columns),
            "sample": df.head(3).to_dict(orient="records")
        }
        '''
    """
    # SECURITY: exec() runs the supplied code with full interpreter privileges.
    # Only use this with trusted input or inside a sandboxed environment.
    # NOTE(review): the docstring above is presumably what the agent framework
    # shows the LLM as the tool description - confirm before rewording it.

    # Names injected into every execution; never persisted as user variables.
    reserved_names = ("pd", "json", "Path")

    # Capture stdout and stderr
    stdout_capture = io.StringIO()
    stderr_capture = io.StringIO()
    old_stdout = sys.stdout
    old_stderr = sys.stderr

    try:
        # Create execution context with pandas, json, and previously defined
        # variables. 'result' is this call's output channel and is filtered out
        # so a value from an earlier call is never reported again.
        exec_context = {
            "pd": pd,
            "json": json,
            "Path": Path,
            **{k: v for k, v in _code_context.items() if k != "result"},
        }

        try:
            # Redirect stdout/stderr only while the user code runs
            sys.stdout = stdout_capture
            sys.stderr = stderr_capture
            exec(code, exec_context)
        finally:
            # Always restore the real streams, even when the code raises
            # something that is not an Exception (e.g. SystemExit).
            sys.stdout = old_stdout
            sys.stderr = old_stderr

        # Persist new variables for later calls (excluding private names,
        # the injected modules, and the per-call 'result')
        for key, value in exec_context.items():
            if key.startswith("_") or key in reserved_names or key == "result":
                continue
            _code_context[key] = value

        # Collect output
        stdout_output = stdout_capture.getvalue()
        stderr_output = stderr_capture.getvalue()

        output_parts = []

        # Add stdout if any
        if stdout_output:
            output_parts.append(f"<stdout>{stdout_output}</stdout>")

        # Add stderr if any
        if stderr_output:
            output_parts.append(f"<stderr>{stderr_output}</stderr>")

        # Report 'result' if the code set it during this call
        if "result" in exec_context:
            result = exec_context["result"]

            if isinstance(result, pd.DataFrame):
                # Convert DataFrame to readable format
                result_str = result.to_string()
            elif isinstance(result, (dict, list)):
                result_str = json.dumps(result, indent=2, default=str)
            else:
                result_str = str(result)

            # An empty rendering is treated as "no result"
            if result_str:
                output_parts.append(f"<result_var>{result_str}</result_var>")

        # Return combined output or success message
        if output_parts:
            return "\n\n".join(output_parts)
        return "Code executed successfully (no output or result)"

    except Exception as e:
        # Failures while running the code or formatting its result are turned
        # into a readable report instead of propagating to the caller.
        stdout_output = stdout_capture.getvalue()
        stderr_output = stderr_capture.getvalue()

        error_parts = []
        if stdout_output:
            error_parts.append(f"=== STDOUT (before error) ===\n{stdout_output}")
        if stderr_output:
            error_parts.append(f"=== STDERR (before error) ===\n{stderr_output}")

        # Include the exception type: messages such as KeyError's are often
        # just the offending key and are hard to interpret on their own.
        error_parts.append(f"=== ERROR ===\n{type(e).__name__}: {e}")
        error_parts.append(f"\n=== CODE ===\n{code}")

        return "\n\n".join(error_parts)
|
||||
|
||||
|
||||
def create_llamasheets_agent(
    llm_model: str = "gpt-4.1", api_key: Optional[str] = None
) -> FunctionAgent:
    """Build a FunctionAgent that analyzes LlamaSheets output by running code.

    The agent gets ``execute_code`` as its only tool and a system prompt that
    explains the region/metadata parquet layout and embeds the current
    listing returned by ``list_extracted_data()``.

    Args:
        llm_model: Name of the OpenAI model to use.
        api_key: API key passed to the OpenAI client.

    Returns:
        The configured agent.
    """
    # Snapshot of the extracted files, taken once when the agent is created.
    region_listing = list_extracted_data()

    # System prompt to guide the agent
    instructions = f"""You are an AI assistant that helps analyze spreadsheet data extracted by LlamaSheets.

LlamaSheets extracts messy spreadsheets into clean parquet files with two types of outputs:
1. Region files (*_region_*.parquet) - The actual data with columns and rows
2. Metadata files (*_metadata_*.parquet) - Rich cell-level metadata including:
- Formatting: font_bold, font_italic, font_size, background_color_rgb
- Position: row_number, column_number, coordinate
- Type detection: data_type, is_date_like, is_percentage, is_currency
- Layout: is_in_first_row, is_merged_cell, horizontal_alignment

You have access to tools that allow you to execute Python pandas code against these files.
Use these tools to load the parquet files, analyze the data, and return results.

Key tips:
- Bold cells in metadata often indicate headers
- Background colors often indicate groupings or departments
- Load both region and metadata files for complete analysis
- Write clear pandas code - you have full pandas functionality available
- Store results in variables for reuse across multiple code executions

Existing Processed Regions:
{region_listing}
"""

    # Configure agent
    return FunctionAgent(
        tools=[execute_code],
        llm=OpenAI(model=llm_model, api_key=api_key),
        system_prompt=instructions,
    )
|
||||
|
||||
|
||||
async def main():
    """Run a few example queries through the LlamaSheets agent.

    Creates the agent and one shared ``Context``, then runs each query in
    turn, printing tool calls and tool results (both cut to 500 characters)
    and the streamed response text as the events arrive.
    """

    def _clip(value, limit=500):
        """Return str(value), cut to *limit* characters plus " ..." if longer."""
        text = str(value)
        return text[:limit] + " ..." if len(text) > limit else text

    # Create the agent; the context is reused across all queries
    agent = create_llamasheets_agent()
    ctx = Context(agent)

    # Example queries the agent can handle:
    queries = [
        # Discovery
        "What spreadsheet data is available?",
        # Simple analysis
        "Load the sales data and show me the first few rows with column info",
        # Using metadata
        "Find all bold cells in the metadata - these are likely headers",
    ]

    for query in queries:
        print(f"\n=== Query: {query} ===")
        run = agent.run(query, ctx=ctx)

        async for event in run.stream_events():
            if isinstance(event, ToolCall):
                shown_args = _clip(event.tool_kwargs)
                print(f"\n[Tool Call] {event.tool_name} with args:\n{shown_args}\n\n")
            elif isinstance(event, ToolCallResult):
                shown_output = _clip(event.tool_output)
                print(f"\n[Tool Result] {event.tool_name}:\n{shown_output}\n\n")
            elif isinstance(event, AgentStream):
                print(event.delta, end="", flush=True)

        # Wait for the run to finish before starting the next query
        await run
        print("\n=== End Query ===\n")
|
||||
|
||||
|
||||
# Script entry point: run the example queries when executed directly.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
|
||||