mirror of
https://github.com/run-llama/llama_extract.git
synced 2026-07-01 01:37:54 -04:00
remove (#53)
This commit is contained in:
@@ -1,48 +0,0 @@
|
||||
name: Build Package
|
||||
|
||||
# Build package on its own without additional pip install
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
POETRY_VERSION: "1.6.1"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest]
|
||||
python-version: ["3.9"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
with:
|
||||
version: ${{ env.POETRY_VERSION }}
|
||||
- name: Install deps
|
||||
shell: bash
|
||||
run: poetry install
|
||||
- name: Ensure lock works
|
||||
shell: bash
|
||||
run: poetry lock
|
||||
- name: Build
|
||||
shell: bash
|
||||
run: poetry build
|
||||
- name: Test installing built package
|
||||
shell: bash
|
||||
run: python -m pip install .
|
||||
- name: Test import
|
||||
shell: bash
|
||||
working-directory: ${{ vars.RUNNER_TEMP }}
|
||||
run: python -c "import llama_extract"
|
||||
@@ -1,81 +0,0 @@
|
||||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ["main"]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: ["main"]
|
||||
schedule:
|
||||
- cron: "30 16 * * 4"
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
# Runner size impacts CodeQL analysis time. To learn more, please see:
|
||||
# - https://gh.io/recommended-hardware-resources-for-running-codeql
|
||||
# - https://gh.io/supported-runners-and-hardware-resources
|
||||
# - https://gh.io/using-larger-runners
|
||||
# Consider using larger runners for possible analysis time improvements.
|
||||
runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
|
||||
timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: ["python"]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby', 'swift' ]
|
||||
# Use only 'java' to analyze code written in Java, Kotlin or both
|
||||
# Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
|
||||
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v2
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
|
||||
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
|
||||
# queries: security-extended,security-and-quality
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v2
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
|
||||
|
||||
# If the Autobuild fails above, remove it and uncomment the following three lines.
|
||||
# Modify them (or add more) to build your code. Please refer to the EXAMPLE below for guidance.
|
||||
|
||||
# - run: |
|
||||
# echo "Run, Build Application using script"
|
||||
# ./location_of_script_within_repo/buildscript.sh
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v2
|
||||
with:
|
||||
category: "/language:${{matrix.language}}"
|
||||
@@ -1,37 +0,0 @@
|
||||
name: Linting
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
POETRY_VERSION: "1.6.1"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
python-version: ["3.9"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
|
||||
- name: Set up python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
with:
|
||||
version: ${{ env.POETRY_VERSION }}
|
||||
- name: Install pre-commit
|
||||
shell: bash
|
||||
run: poetry run pip install pre-commit
|
||||
- name: Run linter
|
||||
shell: bash
|
||||
run: poetry run make lint
|
||||
@@ -1,76 +0,0 @@
|
||||
name: Publish Release
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v*"
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: write # To trigger workflow
|
||||
contents: read # To checkout code
|
||||
if: github.repository == 'run-llama/llama_extract'
|
||||
steps:
|
||||
- name: Trigger Unit Tests
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const result = await github.rest.actions.createWorkflowDispatch({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
workflow_id: 'unit_test.yml',
|
||||
ref: 'main'
|
||||
});
|
||||
|
||||
- name: Wait for tests
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const TIMEOUT = 600000; // 10 minutes in milliseconds
|
||||
const START_TIME = Date.now();
|
||||
|
||||
while (Date.now() - START_TIME < TIMEOUT) {
|
||||
console.log('Checking test status...');
|
||||
const runs = await github.rest.actions.listWorkflowRuns({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
workflow_id: 'unit_test.yml',
|
||||
status: 'completed',
|
||||
branch: 'main'
|
||||
});
|
||||
|
||||
if(runs.data.workflow_runs.length > 0) {
|
||||
const run = runs.data.workflow_runs[0];
|
||||
if(run.conclusion === 'success') {
|
||||
console.log('Tests passed!');
|
||||
return;
|
||||
} else if(run.conclusion === 'failure') {
|
||||
throw new Error('Tests failed!');
|
||||
}
|
||||
}
|
||||
|
||||
console.log('...');
|
||||
await new Promise(r => setTimeout(r, 30000)); // Wait 30 seconds between checks
|
||||
}
|
||||
throw new Error('Tests did not complete within 10 minutes');
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.9"
|
||||
|
||||
- name: Install Poetry
|
||||
run: |
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
|
||||
- name: Build package
|
||||
run: poetry build
|
||||
|
||||
- name: Publish to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@v1.8.14
|
||||
with:
|
||||
password: ${{ secrets.LLAMA_EXTRACT_PYPI_TOKEN }}
|
||||
@@ -1,43 +0,0 @@
|
||||
name: Unit Testing
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'v*'
|
||||
pull_request:
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
POETRY_VERSION: "1.6.1"
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
environment: ${{ (startsWith(github.ref, 'refs/tags/v') || github.event_name == 'workflow_dispatch') && 'CI-prod' || 'CI-staging' }}
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
python-version: ["3.9", "3.10", "3.11"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
with:
|
||||
version: ${{ env.POETRY_VERSION }}
|
||||
- name: Install deps
|
||||
shell: bash
|
||||
run: poetry install --with dev
|
||||
- name: Run testing
|
||||
env:
|
||||
CI: true
|
||||
shell: bash
|
||||
run: poetry run pytest tests
|
||||
@@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 LlamaIndex
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -1,14 +0,0 @@
|
||||
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
|
||||
|
||||
help: ## Show all Makefile targets.
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
|
||||
|
||||
format: ## Run code autoformatters (ruff, via pre-commit).
|
||||
pre-commit install
|
||||
git ls-files | xargs pre-commit run ruff --files
|
||||
|
||||
lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
|
||||
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
|
||||
|
||||
test: ## Run tests via pytest
|
||||
pytest tests
|
||||
@@ -3,183 +3,3 @@
|
||||
> ⚠️ This project has been moved to [LlamaCloud Services](https://github.com/run-llama/llama_cloud_services/)
|
||||
> --------
|
||||
|
||||
LlamaExtract provides a simple API for extracting structured data from unstructured documents like PDFs, text files and images (upcoming).
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
from llama_extract import LlamaExtract
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Initialize client
|
||||
extractor = LlamaExtract()
|
||||
|
||||
|
||||
# Define schema using Pydantic
|
||||
class Resume(BaseModel):
|
||||
name: str = Field(description="Full name of candidate")
|
||||
email: str = Field(description="Email address")
|
||||
skills: list[str] = Field(description="Technical skills and technologies")
|
||||
|
||||
|
||||
# Create extraction agent
|
||||
agent = extractor.create_agent(name="resume-parser", data_schema=Resume)
|
||||
|
||||
# Extract data from document
|
||||
result = agent.extract("resume.pdf")
|
||||
print(result.data)
|
||||
```
|
||||
|
||||
## Core Concepts
|
||||
|
||||
- **Extraction Agents**: Reusable extractors configured with a specific schema and extraction settings.
|
||||
- **Data Schema**: Structure definition for the data you want to extract.
|
||||
- **Extraction Jobs**: Asynchronous extraction tasks that can be monitored.
|
||||
|
||||
## Defining Schemas
|
||||
|
||||
Schemas can be defined using either Pydantic models or JSON Schema:
|
||||
|
||||
### Using Pydantic (Recommended)
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class Experience(BaseModel):
|
||||
company: str = Field(description="Company name")
|
||||
title: str = Field(description="Job title")
|
||||
start_date: Optional[str] = Field(description="Start date of employment")
|
||||
end_date: Optional[str] = Field(description="End date of employment")
|
||||
|
||||
|
||||
class Resume(BaseModel):
|
||||
name: str = Field(description="Candidate name")
|
||||
experience: List[Experience] = Field(description="Work history")
|
||||
```
|
||||
|
||||
### Using JSON Schema
|
||||
|
||||
```python
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string", "description": "Candidate name"},
|
||||
"experience": {
|
||||
"type": "array",
|
||||
"description": "Work history",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {
|
||||
"type": "string",
|
||||
"description": "Company name",
|
||||
},
|
||||
"title": {"type": "string", "description": "Job title"},
|
||||
"start_date": {
|
||||
"anyOf": [{"type": "string"}, {"type": "null"}],
|
||||
"description": "Start date of employment",
|
||||
},
|
||||
"end_date": {
|
||||
"anyOf": [{"type": "string"}, {"type": "null"}],
|
||||
"description": "End date of employment",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
agent = extractor.create_agent(name="resume-parser", data_schema=schema)
|
||||
```
|
||||
|
||||
### Important restrictions on JSON/Pydantic Schema
|
||||
|
||||
*LlamaExtract only supports a subset of the JSON Schema specification.* While limited, it should
|
||||
be sufficient for a wide variety of use-cases.
|
||||
|
||||
- All fields are required by default. Nullable fields must be explicitly marked as such,
|
||||
using `"anyOf"` with a `"null"` type. See `"start_date"` field above.
|
||||
- Root node must be of type `"object"`.
|
||||
- Schema nesting must be limited to within 5 levels.
|
||||
- The important fields are key names/titles, type and description. Fields for
|
||||
formatting, default values, etc. are not supported.
|
||||
- There are other restrictions on number of keys, size of the schema, etc. that you may
|
||||
hit for complex extraction use cases. In such cases, it is worth thinking about how to restructure
|
||||
your extraction workflow to fit within these constraints, e.g. by extracting a subset of fields
|
||||
and later merging them together.
|
||||
|
||||
## Other Extraction APIs
|
||||
|
||||
### Batch Processing
|
||||
|
||||
Process multiple files asynchronously:
|
||||
|
||||
```python
|
||||
# Queue multiple files for extraction
|
||||
jobs = await agent.queue_extraction(["resume1.pdf", "resume2.pdf"])
|
||||
|
||||
# Check job status
|
||||
for job in jobs:
|
||||
status = agent.get_extraction_job(job.id).status
|
||||
print(f"Job {job.id}: {status}")
|
||||
|
||||
# Get results when complete
|
||||
results = [agent.get_extraction_run_for_job(job.id) for job in jobs]
|
||||
```
|
||||
|
||||
### Updating Schemas
|
||||
|
||||
Schemas can be modified and updated after creation:
|
||||
|
||||
```python
|
||||
# Update schema
|
||||
agent.data_schema = new_schema
|
||||
|
||||
# Save changes
|
||||
agent.save()
|
||||
```
|
||||
|
||||
### Managing Agents
|
||||
|
||||
```python
|
||||
# List all agents
|
||||
agents = extractor.list_agents()
|
||||
|
||||
# Get specific agent
|
||||
agent = extractor.get_agent(name="resume-parser")
|
||||
|
||||
# Delete agent
|
||||
extractor.delete_agent(agent.id)
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install llama-extract==0.1.0
|
||||
```
|
||||
|
||||
## Tips & Best Practices
|
||||
|
||||
1. **Schema Design**:
|
||||
- Try to limit schema nesting to 3-4 levels.
|
||||
- Make fields optional when data might not always be present. Having required fields may force the model
|
||||
to hallucinate when these fields are not present in the documents.
|
||||
- When you want to extract a variable number of entities, use an `array` type. Note that you cannot use
|
||||
an `array` type for the root node.
|
||||
- Use descriptive field names and detailed descriptions. Use descriptions to pass formatting
|
||||
instructions or few-shot examples.
|
||||
- Start simple and iteratively build your schema to incorporate requirements.
|
||||
|
||||
2. **Running Extractions**:
|
||||
- Note that resetting `agent.data_schema` will not save the schema to the database,
|
||||
until you call `agent.save()`, but it will be used for running extractions.
|
||||
- Check job status prior to accessing results. Any extraction error should be available as
|
||||
part of `job.error` or `extraction_run.error` fields for debugging.
|
||||
- Consider async operations (`queue_extraction`) for large-scale extraction once you have finalized your schema.
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Example Notebook](examples/resume_screening.ipynb) - Detailed walkthrough of resume parsing
|
||||
- [Discord Community](https://discord.com/invite/eN6D2HQ4aX) - Get help and share feedback
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,882 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Extracting data from resumes\n",
|
||||
"\n",
|
||||
"Let us assume that we are running a hiring process for a company and we have received a list of resumes from candidates. We want to extract structured data from the resumes so that we can run a screening process and shortlist candidates. \n",
|
||||
"\n",
|
||||
"Take a look at one of the resumes in the `data/resumes` directory. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"\n",
|
||||
" <iframe\n",
|
||||
" width=\"600\"\n",
|
||||
" height=\"400\"\n",
|
||||
" src=\"./data/resumes/ai_researcher.pdf\"\n",
|
||||
" frameborder=\"0\"\n",
|
||||
" allowfullscreen\n",
|
||||
" \n",
|
||||
" ></iframe>\n",
|
||||
" "
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.lib.display.IFrame at 0x103a7e950>"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from IPython.display import IFrame\n",
|
||||
"\n",
|
||||
"IFrame(src=\"./data/resumes/ai_researcher.pdf\", width=600, height=400)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You will notice that all the resumes have different layouts but contain common information like name, email, experience, education, etc. \n",
|
||||
"\n",
|
||||
"With LlamaExtract, we will show you how to:\n",
|
||||
"- *Define* a data schema to extract the information of interest. \n",
|
||||
"- *Iterate* over the data schema to generalize the schema for multiple resumes.\n",
|
||||
"- *Finalize* the schema and schedule extractions for multiple resumes.\n",
|
||||
"\n",
|
||||
"We will start by defining a `LlamaExtract` client which provides a Python interface to the LlamaExtract API. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from llama_extract import LlamaExtract\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Load environment variables (put LLAMA_CLOUD_API_KEY in your .env file)\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"\n",
|
||||
"# Optionally, add your project id/organization id\n",
|
||||
"llama_extract = LlamaExtract()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Defining the data schema\n",
|
||||
"\n",
|
||||
"Next, let us try to extract two fields from the resume: `name` and `email`. We can either use a Python dictionary structure to define the `data_schema` as a JSON Schema or use a Pydantic model instead, for brevity and convenience. In either case, our output is guaranteed to validate against this schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Resume(BaseModel):\n",
|
||||
" name: str = Field(description=\"The name of the candidate\")\n",
|
||||
" email: str = Field(description=\"The email address of the candidate\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_cloud.core.api_error import ApiError\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" existing_agent = llama_extract.get_agent(name=\"resume-screening\")\n",
|
||||
" if existing_agent:\n",
|
||||
" llama_extract.delete_agent(existing_agent.id)\n",
|
||||
"except ApiError as e:\n",
|
||||
" if e.status_code == 404:\n",
|
||||
" pass\n",
|
||||
" else:\n",
|
||||
" raise\n",
|
||||
"\n",
|
||||
"agent = llama_extract.create_agent(name=\"resume-screening\", data_schema=Resume) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[ExtractionAgent(id=ad801427-d06b-499d-bbe0-6109c5f0646b, name=resume-screening)]"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"llama_extract.list_agents()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.19it/s]\n",
|
||||
"Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00, 1.30s/it]\n",
|
||||
"Extracting files: 100%|██████████| 1/1 [00:03<00:00, 3.18s/it]\n",
|
||||
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.23it/s]\n",
|
||||
"Creating extraction jobs: 100%|██████████| 1/1 [00:03<00:00, 3.09s/it]\n",
|
||||
"Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.11s/it]\n",
|
||||
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.16it/s]\n",
|
||||
"Creating extraction jobs: 100%|██████████| 1/1 [00:03<00:00, 3.10s/it]\n",
|
||||
"Extracting files: 100%|██████████| 1/1 [00:09<00:00, 9.87s/it]\n",
|
||||
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.12it/s]\n",
|
||||
"Creating extraction jobs: 100%|██████████| 1/1 [00:05<00:00, 5.92s/it]\n",
|
||||
"Extracting files: 100%|██████████| 1/1 [00:12<00:00, 12.05s/it]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'name': 'Dr. Rachel Zhang', 'email': 'rachel.zhang@email.com'}"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"resume = agent.extract(\"./data/resumes/ai_researcher.pdf\")\n",
|
||||
"resume.data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Iterating over the data schema\n",
|
||||
"\n",
|
||||
"Now that we have created a data schema, let us add more fields to the schema. We will add `experience` and `education` fields to the schema. \n",
|
||||
"- We can create a new Pydantic model for each of these fields and represent `experience` and `education` as lists of these models. Doing this will allow us to extract multiple entities from the resume without having to pre-define how many experience or education entries the candidate has. \n",
|
||||
"- We have added a `description` parameter to provide more context for extraction. We can use `description` to provide example inputs/outputs for the extraction. \n",
|
||||
"- Note that we have annotated the `start_date` and `end_date` fields with `Optional[str]` to indicate that these fields are optional. This is *important* because the schema will be used to extract data from multiple resumes and not all resumes will have the same format. A field must only be required if it is guaranteed to be present in all the resumes. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import List, Optional\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Education(BaseModel):\n",
|
||||
" institution: str = Field(description=\"The institution of the candidate\")\n",
|
||||
" degree: str = Field(description=\"The degree of the candidate\")\n",
|
||||
" start_date: Optional[str] = Field(\n",
|
||||
" default=None, description=\"The start date of the candidate's education\"\n",
|
||||
" )\n",
|
||||
" end_date: Optional[str] = Field(\n",
|
||||
" default=None, description=\"The end date of the candidate's education\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Experience(BaseModel):\n",
|
||||
" company: str = Field(description=\"The name of the company\")\n",
|
||||
" title: str = Field(description=\"The title of the candidate\")\n",
|
||||
" description: Optional[str] = Field(\n",
|
||||
" default=None, description=\"The description of the candidate's experience\"\n",
|
||||
" )\n",
|
||||
" start_date: Optional[str] = Field(\n",
|
||||
" default=None, description=\"The start date of the candidate's experience\"\n",
|
||||
" )\n",
|
||||
" end_date: Optional[str] = Field(\n",
|
||||
" default=None, description=\"The end date of the candidate's experience\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Resume(BaseModel):\n",
|
||||
" name: str = Field(description=\"The name of the candidate\")\n",
|
||||
" email: str = Field(description=\"The email address of the candidate\")\n",
|
||||
" links: List[str] = Field(\n",
|
||||
" description=\"The links to the candidate's social media profiles\"\n",
|
||||
" )\n",
|
||||
" experience: List[Experience] = Field(description=\"The candidate's experience\")\n",
|
||||
" education: List[Education] = Field(description=\"The candidate's education\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, we will update the `data_schema` for the `resume-screening` agent to use the new `Resume` model. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'name': 'Dr. Rachel Zhang',\n",
|
||||
" 'email': 'rachel.zhang@email.com',\n",
|
||||
" 'links': ['linkedin.com/in/rachelzhang',\n",
|
||||
" 'github.com/rzhang-ai',\n",
|
||||
" 'scholar.google.com/rachelzhang'],\n",
|
||||
" 'experience': [{'company': 'DeepMind',\n",
|
||||
" 'title': 'Senior Research Scientist',\n",
|
||||
" 'description': '- Lead researcher on large-scale multi-task learning systems, developing novel architectures that improve cross-task generalization by 40%\\n- Pioneered new approach to zero-shot learning using contrastive training, published in NeurIPS 2023\\n- Built and led team of 6 researchers working on foundational ML models\\n- Developed novel regularization techniques for large language models, reducing catastrophic forgetting by 35%',\n",
|
||||
" 'start_date': '2019',\n",
|
||||
" 'end_date': 'Present'},\n",
|
||||
" {'company': 'Google Research',\n",
|
||||
" 'title': 'Research Scientist',\n",
|
||||
" 'description': '- Developed probabilistic frameworks for robust ML, published in ICML 2018\\n- Created novel attention mechanisms for computer vision models, improving accuracy by 25%\\n- Led collaboration with Google Brain team on efficient training methods for transformer models\\n- Mentored 4 PhD interns and collaborated with academic institutions',\n",
|
||||
" 'start_date': '2015',\n",
|
||||
" 'end_date': '2019'},\n",
|
||||
" {'company': 'Columbia University',\n",
|
||||
" 'title': 'Research Assistant Professor',\n",
|
||||
" 'description': '- Published seminal work on Bayesian optimization methods (cited 1000+ times)\\n- Taught graduate-level courses in Machine Learning and Statistical Learning Theory\\n- Supervised 5 PhD students and 3 MSc students\\n- Secured $500K in research grants for probabilistic ML research',\n",
|
||||
" 'start_date': '2011',\n",
|
||||
" 'end_date': '2015'}],\n",
|
||||
" 'education': [{'institution': 'Columbia University',\n",
|
||||
" 'degree': 'Ph.D. in Computer Science',\n",
|
||||
" 'start_date': '2007',\n",
|
||||
" 'end_date': '2011'},\n",
|
||||
" {'institution': 'Stanford University',\n",
|
||||
" 'degree': 'M.S. in Computer Science',\n",
|
||||
" 'start_date': '2005',\n",
|
||||
" 'end_date': '2007'}]}"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.data_schema = Resume\n",
|
||||
"resume = agent.extract(\"./data/resumes/ai_researcher.pdf\")\n",
|
||||
"resume.data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a good start. Let us add a few more fields to the schema and re-run the extraction. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class TechnicalSkills(BaseModel):\n",
|
||||
" programming_languages: List[str] = Field(\n",
|
||||
" description=\"The programming languages the candidate is proficient in.\"\n",
|
||||
" )\n",
|
||||
" frameworks: List[str] = Field(\n",
|
||||
" description=\"The tools/frameworks the candidate is proficient in, e.g. React, Django, PyTorch, etc.\"\n",
|
||||
" )\n",
|
||||
" skills: List[str] = Field(\n",
|
||||
" description=\"Other general skills the candidate is proficient in, e.g. Data Engineering, Machine Learning, etc.\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class Resume(BaseModel):\n",
|
||||
" name: str = Field(description=\"The name of the candidate\")\n",
|
||||
" email: str = Field(description=\"The email address of the candidate\")\n",
|
||||
" links: List[str] = Field(\n",
|
||||
" description=\"The links to the candidate's social media profiles\"\n",
|
||||
" )\n",
|
||||
" experience: List[Experience] = Field(description=\"The candidate's experience\")\n",
|
||||
" education: List[Education] = Field(description=\"The candidate's education\")\n",
|
||||
" technical_skills: TechnicalSkills = Field(\n",
|
||||
" description=\"The candidate's technical skills\"\n",
|
||||
" )\n",
|
||||
" key_accomplishments: str = Field(\n",
|
||||
" description=\"Summarize the candidates highest achievements.\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'name': 'Dr. Rachel Zhang',\n",
|
||||
" 'email': 'rachel.zhang@email.com',\n",
|
||||
" 'links': ['linkedin.com/in/rachelzhang',\n",
|
||||
" 'github.com/rzhang-ai',\n",
|
||||
" 'scholar.google.com/rachelzhang'],\n",
|
||||
" 'experience': [{'company': 'DeepMind',\n",
|
||||
" 'title': 'Senior Research Scientist',\n",
|
||||
" 'description': '- Lead researcher on large-scale multi-task learning systems, developing novel architectures that improve cross-task generalization by 40%\\n- Pioneered new approach to zero-shot learning using contrastive training, published in NeurIPS 2023\\n- Built and led team of 6 researchers working on foundational ML models\\n- Developed novel regularization techniques for large language models, reducing catastrophic forgetting by 35%',\n",
|
||||
" 'start_date': '2019',\n",
|
||||
" 'end_date': 'Present'},\n",
|
||||
" {'company': 'Google Research',\n",
|
||||
" 'title': 'Research Scientist',\n",
|
||||
" 'description': '- Developed probabilistic frameworks for robust ML, published in ICML 2018\\n- Created novel attention mechanisms for computer vision models, improving accuracy by 25%\\n- Led collaboration with Google Brain team on efficient training methods for transformer models\\n- Mentored 4 PhD interns and collaborated with academic institutions',\n",
|
||||
" 'start_date': '2015',\n",
|
||||
" 'end_date': '2019'},\n",
|
||||
" {'company': 'Columbia University',\n",
|
||||
" 'title': 'Research Assistant Professor',\n",
|
||||
" 'description': '- Published seminal work on Bayesian optimization methods (cited 1000+ times)\\n- Taught graduate-level courses in Machine Learning and Statistical Learning Theory\\n- Supervised 5 PhD students and 3 MSc students\\n- Secured $500K in research grants for probabilistic ML research',\n",
|
||||
" 'start_date': '2011',\n",
|
||||
" 'end_date': '2015'}],\n",
|
||||
" 'education': [{'institution': 'Columbia University',\n",
|
||||
" 'degree': 'Ph.D. in Computer Science',\n",
|
||||
" 'start_date': '2007',\n",
|
||||
" 'end_date': '2011'},\n",
|
||||
" {'institution': 'Stanford University',\n",
|
||||
" 'degree': 'M.S. in Computer Science',\n",
|
||||
" 'start_date': '2005',\n",
|
||||
" 'end_date': '2007'}],\n",
|
||||
" 'technical_skills': {'programming_languages': ['Python',\n",
|
||||
" 'C++',\n",
|
||||
" 'Julia',\n",
|
||||
" 'CUDA'],\n",
|
||||
" 'frameworks': ['PyTorch', 'TensorFlow', 'JAX', 'Ray'],\n",
|
||||
" 'skills': ['Deep Learning',\n",
|
||||
" 'Reinforcement Learning',\n",
|
||||
" 'Probabilistic Models',\n",
|
||||
" 'Multi-Task Learning',\n",
|
||||
" 'Zero-Shot Learning',\n",
|
||||
" 'Neural Architecture Search']},\n",
|
||||
" 'key_accomplishments': 'AI researcher with 12+ years of experience spanning classical machine learning, deep learning, and probabilistic modeling. Led groundbreaking research in reinforcement learning, generative models, and multi-task learning. Published 25+ papers in top-tier conferences (NeurIPS, ICML, ICLR). Strong track record of transitioning theoretical advances into practical applications in both academic and industrial settings.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.data_schema = Resume\n",
|
||||
"resume = agent.extract(\"./data/resumes/ai_researcher.pdf\")\n",
|
||||
"resume.data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Finalizing the schema\n",
|
||||
"\n",
|
||||
"This is great! We have extracted a lot of key information from the resume that is well-typed and can be used downstream for further processing. Until now, this data is ephemeral and will be lost if we close the session. Let us save the state of our extraction and use it to extract data from multiple resumes. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"agent.save()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'type': 'object',\n",
|
||||
" '$defs': {'Education': {'type': 'object',\n",
|
||||
" 'title': 'Education',\n",
|
||||
" 'required': ['institution', 'degree', 'start_date', 'end_date'],\n",
|
||||
" 'properties': {'degree': {'type': 'string',\n",
|
||||
" 'title': 'Degree',\n",
|
||||
" 'description': 'The degree of the candidate'},\n",
|
||||
" 'end_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
|
||||
" 'title': 'End Date',\n",
|
||||
" 'description': \"The end date of the candidate's education\"},\n",
|
||||
" 'start_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
|
||||
" 'title': 'Start Date',\n",
|
||||
" 'description': \"The start date of the candidate's education\"},\n",
|
||||
" 'institution': {'type': 'string',\n",
|
||||
" 'title': 'Institution',\n",
|
||||
" 'description': 'The institution of the candidate'}},\n",
|
||||
" 'additionalProperties': False},\n",
|
||||
" 'Experience': {'type': 'object',\n",
|
||||
" 'title': 'Experience',\n",
|
||||
" 'required': ['company', 'title', 'description', 'start_date', 'end_date'],\n",
|
||||
" 'properties': {'title': {'type': 'string',\n",
|
||||
" 'title': 'Title',\n",
|
||||
" 'description': 'The title of the candidate'},\n",
|
||||
" 'company': {'type': 'string',\n",
|
||||
" 'title': 'Company',\n",
|
||||
" 'description': 'The name of the company'},\n",
|
||||
" 'end_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
|
||||
" 'title': 'End Date',\n",
|
||||
" 'description': \"The end date of the candidate's experience\"},\n",
|
||||
" 'start_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
|
||||
" 'title': 'Start Date',\n",
|
||||
" 'description': \"The start date of the candidate's experience\"},\n",
|
||||
" 'description': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
|
||||
" 'title': 'Description',\n",
|
||||
" 'description': \"The description of the candidate's experience\"}},\n",
|
||||
" 'additionalProperties': False},\n",
|
||||
" 'TechnicalSkills': {'type': 'object',\n",
|
||||
" 'title': 'TechnicalSkills',\n",
|
||||
" 'required': ['programming_languages', 'frameworks', 'skills'],\n",
|
||||
" 'properties': {'skills': {'type': 'array',\n",
|
||||
" 'items': {'type': 'string'},\n",
|
||||
" 'title': 'Skills',\n",
|
||||
" 'description': 'Other general skills the candidate is proficient in, e.g. Data Engineering, Machine Learning, etc.'},\n",
|
||||
" 'frameworks': {'type': 'array',\n",
|
||||
" 'items': {'type': 'string'},\n",
|
||||
" 'title': 'Frameworks',\n",
|
||||
" 'description': 'The tools/frameworks the candidate is proficient in, e.g. React, Django, PyTorch, etc.'},\n",
|
||||
" 'programming_languages': {'type': 'array',\n",
|
||||
" 'items': {'type': 'string'},\n",
|
||||
" 'title': 'Programming Languages',\n",
|
||||
" 'description': 'The programming languages the candidate is proficient in.'}},\n",
|
||||
" 'additionalProperties': False}},\n",
|
||||
" 'title': 'Resume',\n",
|
||||
" 'required': ['name',\n",
|
||||
" 'email',\n",
|
||||
" 'links',\n",
|
||||
" 'experience',\n",
|
||||
" 'education',\n",
|
||||
" 'technical_skills',\n",
|
||||
" 'key_accomplishments'],\n",
|
||||
" 'properties': {'name': {'type': 'string',\n",
|
||||
" 'title': 'Name',\n",
|
||||
" 'description': 'The name of the candidate'},\n",
|
||||
" 'email': {'type': 'string',\n",
|
||||
" 'title': 'Email',\n",
|
||||
" 'description': 'The email address of the candidate'},\n",
|
||||
" 'links': {'type': 'array',\n",
|
||||
" 'items': {'type': 'string'},\n",
|
||||
" 'title': 'Links',\n",
|
||||
" 'description': \"The links to the candidate's social media profiles\"},\n",
|
||||
" 'education': {'type': 'array',\n",
|
||||
" 'items': {'$ref': '#/$defs/Education'},\n",
|
||||
" 'title': 'Education',\n",
|
||||
" 'description': \"The candidate's education\"},\n",
|
||||
" 'experience': {'type': 'array',\n",
|
||||
" 'items': {'$ref': '#/$defs/Experience'},\n",
|
||||
" 'title': 'Experience',\n",
|
||||
" 'description': \"The candidate's experience\"},\n",
|
||||
" 'technical_skills': {'type': 'object',\n",
|
||||
" 'title': 'TechnicalSkills',\n",
|
||||
" 'required': ['programming_languages', 'frameworks', 'skills'],\n",
|
||||
" 'properties': {'skills': {'type': 'array',\n",
|
||||
" 'items': {'type': 'string'},\n",
|
||||
" 'title': 'Skills',\n",
|
||||
" 'description': 'Other general skills the candidate is proficient in, e.g. Data Engineering, Machine Learning, etc.'},\n",
|
||||
" 'frameworks': {'type': 'array',\n",
|
||||
" 'items': {'type': 'string'},\n",
|
||||
" 'title': 'Frameworks',\n",
|
||||
" 'description': 'The tools/frameworks the candidate is proficient in, e.g. React, Django, PyTorch, etc.'},\n",
|
||||
" 'programming_languages': {'type': 'array',\n",
|
||||
" 'items': {'type': 'string'},\n",
|
||||
" 'title': 'Programming Languages',\n",
|
||||
" 'description': 'The programming languages the candidate is proficient in.'}},\n",
|
||||
" 'description': \"The candidate's technical skills\",\n",
|
||||
" 'additionalProperties': False},\n",
|
||||
" 'key_accomplishments': {'type': 'string',\n",
|
||||
" 'title': 'Key Accomplishments',\n",
|
||||
" 'description': 'Summarize the candidates highest achievements.'}},\n",
|
||||
" 'additionalProperties': False}"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent = llama_extract.get_agent(\"resume-screening\")\n",
|
||||
"agent.data_schema # Latest schema should be returned"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Queueing extractions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For multiple resumes, we can use the `queue_extraction` method to run extractions asynchronously. This is ideal for processing batch extraction jobs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Uploading files: 100%|██████████| 3/3 [00:01<00:00, 2.29it/s]\n",
|
||||
"Creating extraction jobs: 100%|██████████| 3/3 [00:04<00:00, 1.61s/it]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# All resumes in the data/resumes directory\n",
|
||||
"resumes = []\n",
|
||||
"\n",
|
||||
"with os.scandir(\"./data/resumes\") as entries:\n",
|
||||
" for entry in entries:\n",
|
||||
" if entry.is_file():\n",
|
||||
" resumes.append(entry.path)\n",
|
||||
"\n",
|
||||
"jobs = await agent.queue_extraction(resumes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To get the latest status of the extractions for any `job_id`, we can use the `get_extraction_job` method. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[<StatusEnum.PENDING: 'PENDING'>,\n",
|
||||
" <StatusEnum.PENDING: 'PENDING'>,\n",
|
||||
" <StatusEnum.PENDING: 'PENDING'>]"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[agent.get_extraction_job(job_id=job.id).status for job in jobs]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We notice that all extraction runs are in a PENDING state. We can check back again to see if the extractions have completed. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[<StatusEnum.SUCCESS: 'SUCCESS'>,\n",
|
||||
" <StatusEnum.SUCCESS: 'SUCCESS'>,\n",
|
||||
" <StatusEnum.SUCCESS: 'SUCCESS'>]"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[agent.get_extraction_job(job_id=job.id).status for job in jobs]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Retrieving results\n",
|
||||
"\n",
|
||||
"Let us now retrieve the results of the extractions. If the status of the extraction is `SUCCESS`, we can retrieve the data from the `data` field. In case there are errors (status = `ERROR`), we can retrieve the error message from the `error` field. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"results = []\n",
|
||||
"for job in jobs:\n",
|
||||
" extract_run = agent.list_extraction_runs(job_id=job.id)[0]\n",
|
||||
" if extract_run.status == \"SUCCESS\":\n",
|
||||
" results.append(extract_run.data)\n",
|
||||
" else:\n",
|
||||
" print(f\"Extraction status for job {job.id}: {extract_run.status}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'name': 'Dr. Rachel Zhang, Ph.D.',\n",
|
||||
" 'email': 'rachel.zhang@email.com',\n",
|
||||
" 'links': ['linkedin.com/in/rachelzhang',\n",
|
||||
" 'github.com/rzhang-ai',\n",
|
||||
" 'scholar.google.com/rachelzhang'],\n",
|
||||
" 'experience': [{'company': 'DeepMind',\n",
|
||||
" 'title': 'Senior Research Scientist',\n",
|
||||
" 'description': '- Lead researcher on large-scale multi-task learning systems, developing novel architectures that improve cross-task generalization by 40%\\n- Pioneered new approach to zero-shot learning using contrastive training, published in NeurIPS 2023\\n- Built and led team of 6 researchers working on foundational ML models\\n- Developed novel regularization techniques for large language models, reducing catastrophic forgetting by 35%',\n",
|
||||
" 'start_date': '2019',\n",
|
||||
" 'end_date': 'Present'},\n",
|
||||
" {'company': 'Google Research',\n",
|
||||
" 'title': 'Research Scientist',\n",
|
||||
" 'description': '- Developed probabilistic frameworks for robust ML, published in ICML 2018\\n- Created novel attention mechanisms for computer vision models, improving accuracy by 25%\\n- Led collaboration with Google Brain team on efficient training methods for transformer models\\n- Mentored 4 PhD interns and collaborated with academic institutions',\n",
|
||||
" 'start_date': '2015',\n",
|
||||
" 'end_date': '2019'},\n",
|
||||
" {'company': 'Columbia University',\n",
|
||||
" 'title': 'Research Assistant Professor',\n",
|
||||
" 'description': '- Published seminal work on Bayesian optimization methods (cited 1000+ times)\\n- Taught graduate-level courses in Machine Learning and Statistical Learning Theory\\n- Supervised 5 PhD students and 3 MSc students\\n- Secured $500K in research grants for probabilistic ML research',\n",
|
||||
" 'start_date': '2011',\n",
|
||||
" 'end_date': '2015'}],\n",
|
||||
" 'education': [{'institution': 'Columbia University',\n",
|
||||
" 'degree': 'Ph.D. in Computer Science',\n",
|
||||
" 'start_date': '2007',\n",
|
||||
" 'end_date': '2011'},\n",
|
||||
" {'institution': 'Stanford University',\n",
|
||||
" 'degree': 'M.S. in Computer Science',\n",
|
||||
" 'start_date': '2005',\n",
|
||||
" 'end_date': '2007'}],\n",
|
||||
" 'technical_skills': {'programming_languages': ['Python',\n",
|
||||
" 'C++',\n",
|
||||
" 'Julia',\n",
|
||||
" 'CUDA'],\n",
|
||||
" 'frameworks': ['PyTorch', 'TensorFlow', 'JAX', 'Ray'],\n",
|
||||
" 'skills': ['Deep Learning',\n",
|
||||
" 'Reinforcement Learning',\n",
|
||||
" 'Probabilistic Models',\n",
|
||||
" 'Multi-Task Learning',\n",
|
||||
" 'Zero-Shot Learning',\n",
|
||||
" 'Neural Architecture Search']},\n",
|
||||
" 'key_accomplishments': 'AI researcher with 12+ years of experience spanning classical machine learning, deep learning, and probabilistic modeling. Led groundbreaking research in reinforcement learning, generative models, and multi-task learning. Published 25+ papers in top-tier conferences (NeurIPS, ICML, ICLR). Strong track record of transitioning theoretical advances into practical applications in both academic and industrial settings.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"results[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'name': 'Alex Park',\n",
|
||||
" 'email': 'alex park@email.com',\n",
|
||||
" 'links': ['linkedin.com/in/alexpark'],\n",
|
||||
" 'experience': [{'company': 'SearchTech AI',\n",
|
||||
" 'title': 'Senior Machine Learning Engineer',\n",
|
||||
" 'description': 'Led development of next-generation learning-to-rank system using BER\\nArchitected and deployed real-time personalization system processing 10\\nIncreasing CTR by 15%\\nImproving search relevance by 24% (NDCG@10)',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': None},\n",
|
||||
" {'company': 'Commerce Corp',\n",
|
||||
" 'title': '',\n",
|
||||
" 'description': 'Developed semantic search system using transformer models and approximate nearest neighbors, reducing null search results by 35%',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': None},\n",
|
||||
" {'company': 'Tech Solutions Inc',\n",
|
||||
" 'title': 'Machine Learning Engineer',\n",
|
||||
" 'description': 'Implemented query understanding pipeline',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': None},\n",
|
||||
" {'company': '',\n",
|
||||
" 'title': 'Software Engineer',\n",
|
||||
" 'description': 'Built data pipelines and Flasticsearch',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': None}],\n",
|
||||
" 'education': [{'institution': 'University of California, Berkeley',\n",
|
||||
" 'degree': 'M.S. Computer Science',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': None},\n",
|
||||
" {'institution': 'University of California, Berkeley',\n",
|
||||
" 'degree': 'B.S. Computer Science',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': None},\n",
|
||||
" {'institution': 'University of Washington',\n",
|
||||
" 'degree': '',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': None}],\n",
|
||||
" 'technical_skills': {'programming_languages': ['Python',\n",
|
||||
" 'SQL',\n",
|
||||
" 'Java',\n",
|
||||
" 'Scala',\n",
|
||||
" 'Shell Scripting'],\n",
|
||||
" 'frameworks': ['PyTorch',\n",
|
||||
" 'TensorFlow',\n",
|
||||
" 'Scikit-learn',\n",
|
||||
" 'Elasticsearch',\n",
|
||||
" 'Solr',\n",
|
||||
" 'Lucene',\n",
|
||||
" 'BERT',\n",
|
||||
" 'Word2Vec',\n",
|
||||
" 'FastAI',\n",
|
||||
" 'BM25',\n",
|
||||
" 'FAISS',\n",
|
||||
" 'Docker',\n",
|
||||
" 'Kubernetes'],\n",
|
||||
" 'skills': []},\n",
|
||||
" 'key_accomplishments': 'Machine Learning Engineer with 5 years of experience building and deploying large-scale search and relevance systems: Specialized in developing personalized search algorithms, learning-to-rank models; and recommendation systems. Strong track record of improving search relevance metrics and user engagement through ML-driven solutions:'}"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"results[1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'name': 'Sarah Chen',\n",
|
||||
" 'email': 'sarah.chen@email.com',\n",
|
||||
" 'links': [],\n",
|
||||
" 'experience': [{'company': 'TechCorp Solutions',\n",
|
||||
" 'title': 'Senior Software Architect',\n",
|
||||
" 'description': '- Led architectural design and implementation of a cloud-native platform serving 2M+ users\\n- Established architectural guidelines and best practices adopted across 12 development teams\\n- Reduced system latency by 40% through implementation of event-driven architecture\\n- Mentored 15+ senior developers in cloud-native development practices',\n",
|
||||
" 'start_date': '2020',\n",
|
||||
" 'end_date': 'Present'},\n",
|
||||
" {'company': 'DataFlow Systems',\n",
|
||||
" 'title': 'Lead Software Engineer',\n",
|
||||
" 'description': '- Architected and led development of distributed data processing platform handling 5TB daily\\n- Designed microservices architecture reducing deployment time by 65%\\n- Led migration of legacy monolith to cloud-native architecture\\n- Managed team of 8 engineers across 3 international locations',\n",
|
||||
" 'start_date': '2016',\n",
|
||||
" 'end_date': '2020'},\n",
|
||||
" {'company': 'InnovateTech',\n",
|
||||
" 'title': 'Senior Software Engineer',\n",
|
||||
" 'description': '- Developed high-performance trading platform processing 100K transactions per second\\n- Implemented real-time analytics engine reducing processing latency by 75%\\n- Led adoption of container orchestration reducing deployment costs by 35%',\n",
|
||||
" 'start_date': '2013',\n",
|
||||
" 'end_date': '2016'}],\n",
|
||||
" 'education': [{'institution': 'Stanford University',\n",
|
||||
" 'degree': 'Master of Science in Computer Science',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': '2013'},\n",
|
||||
" {'institution': 'University of California, Berkeley',\n",
|
||||
" 'degree': 'Bachelor of Science in Computer Engineering',\n",
|
||||
" 'start_date': None,\n",
|
||||
" 'end_date': '2011'}],\n",
|
||||
" 'technical_skills': {'programming_languages': ['Java',\n",
|
||||
" 'Python',\n",
|
||||
" 'Go',\n",
|
||||
" 'JavaScript/TypeScript'],\n",
|
||||
" 'frameworks': [],\n",
|
||||
" 'skills': ['Architecture & Design',\n",
|
||||
" 'Microservices',\n",
|
||||
" 'Event-Driven Architecture',\n",
|
||||
" 'Domain-Driven Design',\n",
|
||||
" 'REST APIs',\n",
|
||||
" 'Cloud Platforms',\n",
|
||||
" 'AWS (Advanced)',\n",
|
||||
" 'Azure',\n",
|
||||
" 'Google Cloud Platform']},\n",
|
||||
" 'key_accomplishments': '- Co-inventor on three patents for distributed systems architecture\\n- Published paper on \"Scalable Microservices Architecture\" at IEEE Cloud Computing Conference 2022\\n- Keynote Speaker, CloudCon 2023: \"Future of Cloud-Native Architecture\"\\n- Regular presenter at local tech meetups and conferences'}"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"results[2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Congratulations! You now have an agent that can extract structured data from resumes. \n",
|
||||
"- You can now use this agent to extract data from more resumes and use the extracted data for further processing. \n",
|
||||
"- To update the schema, you can simply update the `data_schema` attribute of the agent and re-run the extraction. \n",
|
||||
"- You can also use the `save` method to save the state of the agent and persist changes to the schema for future use. \n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
from llama_extract.extract import LlamaExtract, ExtractionAgent
|
||||
|
||||
__all__ = ["LlamaExtract", "ExtractionAgent"]
|
||||
@@ -1,655 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from io import BufferedIOBase, BufferedReader, BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Type, Union, Tuple, Coroutine, Any, TypeVar
|
||||
import warnings
|
||||
import httpx
|
||||
from pydantic import BaseModel
|
||||
from llama_cloud import (
|
||||
ExtractAgent as CloudExtractAgent,
|
||||
ExtractConfig,
|
||||
ExtractJob,
|
||||
ExtractJobCreate,
|
||||
ExtractRun,
|
||||
File,
|
||||
ExtractMode,
|
||||
StatusEnum,
|
||||
Project,
|
||||
ExtractTarget,
|
||||
LlamaExtractSettings,
|
||||
)
|
||||
from llama_cloud.client import AsyncLlamaCloud
|
||||
from llama_extract.utils import JSONObjectType, augment_async_errors
|
||||
from llama_index.core.schema import BaseComponent
|
||||
from llama_index.core.async_utils import run_jobs
|
||||
from llama_index.core.bridge.pydantic import Field, PrivateAttr
|
||||
from llama_index.core.constants import DEFAULT_BASE_URL
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
# Generic result type used to annotate coroutines and their results.
T = TypeVar("T")

# A file may be given as a path (str or Path), raw bytes, or a binary buffer.
FileInput = Union[str, Path, bytes, BufferedIOBase]

# A schema may be given as a JSONObjectType value or a Pydantic BaseModel subclass.
SchemaInput = Union[JSONObjectType, Type[BaseModel]]

# Default extraction configuration: PER_DOC target with the ACCURATE mode.
DEFAULT_EXTRACT_CONFIG = ExtractConfig(
    extraction_mode=ExtractMode.ACCURATE,
    extraction_target=ExtractTarget.PER_DOC,
)
|
||||
|
||||
|
||||
class ExtractionAgent:
|
||||
"""Class representing a single extraction agent with methods for extraction operations."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
client: AsyncLlamaCloud,
|
||||
agent: CloudExtractAgent,
|
||||
project_id: Optional[str] = None,
|
||||
organization_id: Optional[str] = None,
|
||||
check_interval: int = 1,
|
||||
max_timeout: int = 2000,
|
||||
num_workers: int = 4,
|
||||
show_progress: bool = True,
|
||||
verbose: bool = False,
|
||||
):
|
||||
self._client = client
|
||||
self._agent = agent
|
||||
self._project_id = project_id
|
||||
self._organization_id = organization_id
|
||||
self.check_interval = check_interval
|
||||
self.max_timeout = max_timeout
|
||||
self.num_workers = num_workers
|
||||
self.show_progress = show_progress
|
||||
self._verbose = verbose
|
||||
self._data_schema: Union[JSONObjectType, None] = None
|
||||
self._config: Union[ExtractConfig, None] = None
|
||||
self._thread_pool = ThreadPoolExecutor(
|
||||
max_workers=min(10, (os.cpu_count() or 1) + 4)
|
||||
)
|
||||
|
||||
def _run_in_thread(self, coro: Coroutine[Any, Any, T]) -> T:
    """Run ``coro`` to completion on a pool thread and return its result.

    The coroutine is driven by ``asyncio.run`` inside a worker of
    ``self._thread_pool``, so it gets its own event loop. While it runs, the
    client wrapper's ``httpx_client`` is replaced by a fresh
    ``httpx.AsyncClient`` built with the same timeout; the previous client is
    put back afterwards, even on error. The calling thread blocks until the
    result is available.

    NOTE(review): the swap mutates state on the shared ``self._client``;
    overlapping calls presumably could observe each other's temporary
    client - confirm before relying on concurrent use.
    """

    async def _await_with_temporary_client() -> T:
        async with httpx.AsyncClient(
            timeout=self._client._client_wrapper.httpx_client.timeout,
        ) as temporary_client:
            previous_client = self._client._client_wrapper.httpx_client
            self._client._client_wrapper.httpx_client = temporary_client
            try:
                return await coro
            finally:
                # Always restore the client that was installed before the swap.
                self._client._client_wrapper.httpx_client = previous_client

    def _thread_entry() -> T:
        return asyncio.run(_await_with_temporary_client())

    pending = self._thread_pool.submit(_thread_entry)
    return pending.result()
|
||||
|
||||
@property
def id(self) -> str:
    """Identifier of the wrapped cloud extraction agent."""
    cloud_agent = self._agent
    return cloud_agent.id
|
||||
|
||||
@property
def name(self) -> str:
    """Name of the wrapped cloud extraction agent."""
    cloud_agent = self._agent
    return cloud_agent.name
|
||||
|
||||
@property
def data_schema(self) -> dict:
    """Schema currently in effect for this agent.

    Returns the locally assigned schema when one is set (and truthy);
    otherwise falls back to the schema stored on the wrapped agent.
    """
    local_schema = self._data_schema
    if local_schema:
        return local_schema
    return self._agent.data_schema


@data_schema.setter
def data_schema(self, data_schema: SchemaInput) -> None:
    """Validate a new schema through the API and keep it as a local override.

    Args:
        data_schema: Either a dict or a Pydantic ``BaseModel`` subclass; a
            model class is converted with ``model_json_schema()``.

    Raises:
        ValueError: If ``data_schema`` is neither a dict nor a model class.
    """
    if isinstance(data_schema, dict):
        # TODO: if we expose a get_validated JSON schema method, we can use it here
        schema_payload = data_schema  # type: ignore
    elif isinstance(data_schema, type) and issubclass(data_schema, BaseModel):
        schema_payload = data_schema.model_json_schema()
    else:
        raise ValueError(
            "data_schema must be either a dictionary or a Pydantic model"
        )

    validation_request = self._client.llama_extract.validate_extraction_schema(
        data_schema=schema_payload
    )
    validation_result = self._run_in_thread(validation_request)
    # Keep the schema as returned by the validation endpoint.
    self._data_schema = validation_result.data_schema
|
||||
|
||||
@property
def config(self) -> ExtractConfig:
    """Extraction config currently in effect for this agent.

    Returns the locally assigned config when one is set (and truthy);
    otherwise falls back to the config stored on the wrapped agent.
    """
    local_config = self._config
    if local_config:
        return local_config
    return self._agent.config


@config.setter
def config(self, config: ExtractConfig) -> None:
    """Store ``config`` as a local override; nothing is sent to the API here."""
    self._config = config
|
||||
|
||||
async def _upload_file(self, file_input: FileInput) -> File:
|
||||
"""Upload a file for extraction."""
|
||||
if isinstance(file_input, BufferedIOBase):
|
||||
upload_file = file_input
|
||||
elif isinstance(file_input, bytes):
|
||||
upload_file = BytesIO(file_input)
|
||||
elif isinstance(file_input, (str, Path)):
|
||||
upload_file = open(file_input, "rb")
|
||||
else:
|
||||
raise ValueError(
|
||||
"file_input must be either a file path string, file bytes, or buffer object"
|
||||
)
|
||||
|
||||
try:
|
||||
return await self._client.files.upload_file(
|
||||
project_id=self._project_id, upload_file=upload_file
|
||||
)
|
||||
finally:
|
||||
if isinstance(upload_file, BufferedReader):
|
||||
upload_file.close()
|
||||
|
||||
async def _wait_for_job_result(self, job_id: str) -> Optional[ExtractRun]:
    """Poll an extraction job until it leaves PENDING, then fetch its run.

    Sleeps ``self.check_interval`` seconds before each status check. A
    SUCCESS status returns the run for the job. Any other non-pending status
    emits a warning and still returns the run for the job.

    Raises:
        Exception: If the job is still pending after more than
            ``self.max_timeout`` seconds.
    """
    started_at = time.perf_counter()
    polls = 0
    while True:
        await asyncio.sleep(self.check_interval)
        polls += 1
        job = await self._client.llama_extract.get_job(job_id=job_id)

        if job.status == StatusEnum.PENDING:
            elapsed = time.perf_counter() - started_at
            if elapsed > self.max_timeout:
                raise Exception(f"Timeout while extracting the file: {job_id}")
            # In verbose mode, print one dot every tenth poll.
            if self._verbose and polls % 10 == 0:
                print(".", end="", flush=True)
            continue

        if job.status != StatusEnum.SUCCESS:
            warnings.warn(
                f"Failure in job: {job_id}, status: {job.status}, error: {job.error}"
            )
        return await self._client.llama_extract.get_run_by_job_id(job_id=job_id)
|
||||
|
||||
def save(self) -> None:
    """Persist the agent's current schema and config through the API.

    Sends ``self.data_schema`` and ``self.config`` to
    ``update_extraction_agent`` for this agent's id, and replaces
    ``self._agent`` with the agent returned by that call.

    Returns:
        None. The updated agent is stored on the instance, not returned.
    """
    update_request = self._client.llama_extract.update_extraction_agent(
        extraction_agent_id=self.id,
        data_schema=self.data_schema,
        config=self.config,
    )
    self._agent = self._run_in_thread(update_request)
|
||||
|
||||
async def _queue_extraction_test(
    self,
    files: Union[FileInput, List[FileInput]],
    extract_settings: LlamaExtractSettings,
) -> Union[ExtractJob, List[ExtractJob]]:
    """Upload files, start test-user extraction jobs, and wait for each one.

    Args:
        files: One file input or a list of them.
        extract_settings: Settings forwarded to ``run_job_test_user``.

    Returns:
        A single result when a single (non-list) input was given, otherwise
        a list with one result per file.

    NOTE(review): the annotation says ``ExtractJob``, but each result comes
    from ``_wait_for_job_result``, which is annotated as returning
    ``Optional[ExtractRun]`` - confirm which type callers expect.
    """
    single_file = not isinstance(files, list)
    if single_file:
        files = [files]

    upload_tasks = [self._upload_file(item) for item in files]
    with augment_async_errors():
        uploaded_files = await run_jobs(
            upload_tasks,
            workers=self.num_workers,
            desc="Uploading files",
            show_progress=self.show_progress,
        )

    async def _start_and_wait(uploaded: File) -> ExtractRun:
        # The agent's current schema and config are sent as per-job overrides.
        job_request = ExtractJobCreate(
            extraction_agent_id=self.id,
            file_id=uploaded.id,
            data_schema_override=self.data_schema,
            config_override=self.config,
        )
        queued = await self._client.llama_extract.run_job_test_user(
            job_create=job_request,
            extract_settings=extract_settings,
        )
        return await self._wait_for_job_result(queued.id)

    job_tasks = [_start_and_wait(uploaded) for uploaded in uploaded_files]
    with augment_async_errors():
        extract_jobs = await run_jobs(
            job_tasks,
            workers=self.num_workers,
            desc="Running extraction jobs",
            show_progress=self.show_progress,
        )

    if self._verbose:
        for source, result in zip(files, extract_jobs):
            # Only path-like inputs have a meaningful printable name.
            label = "<bytes/buffer>"
            if isinstance(source, (str, Path)):
                label = str(source)
            print(f"Queued file extraction for file {label} under job_id {result.id}")

    if single_file:
        return extract_jobs[0]
    return extract_jobs
|
||||
|
||||
async def queue_extraction(
    self,
    files: Union[FileInput, List[FileInput]],
) -> Union[ExtractJob, List[ExtractJob]]:
    """Queue one or more files for extraction concurrently.

    The files are uploaded first, then one extraction job is created per
    uploaded file using this agent's current ``data_schema`` and ``config``
    as overrides. The jobs are only created here; nothing waits for them.

    Args:
        files (Union[FileInput, List[FileInput]]): The files to extract

    Returns:
        Union[ExtractJob, List[ExtractJob]]: The queued extraction jobs. A
        single job is returned when ``files`` was not a list.
    """
    if not isinstance(files, list):
        files = [files]
        single_file = True
    else:
        single_file = False

    upload_tasks = [self._upload_file(file) for file in files]
    with augment_async_errors():
        uploaded_files = await run_jobs(
            upload_tasks,
            workers=self.num_workers,
            desc="Uploading files",
            show_progress=self.show_progress,
        )

    job_tasks = [
        self._client.llama_extract.run_job(
            request=ExtractJobCreate(
                extraction_agent_id=self.id,
                file_id=file.id,
                data_schema_override=self.data_schema,
                config_override=self.config,
            ),
        )
        for file in uploaded_files
    ]
    with augment_async_errors():
        extract_jobs = await run_jobs(
            job_tasks,
            workers=self.num_workers,
            desc="Creating extraction jobs",
            show_progress=self.show_progress,
        )

    if self._verbose:
        for file, job in zip(files, extract_jobs):
            # Only path-like inputs have a printable name.
            file_repr = (
                str(file) if isinstance(file, (str, Path)) else "<bytes/buffer>"
            )
            print(
                f"Queued file extraction for file {file_repr} under job_id {job.id}"
            )

    return extract_jobs[0] if single_file else extract_jobs
|
||||
|
||||
async def aextract(
    self, files: Union[FileInput, List[FileInput]]
) -> Union[ExtractRun, List[ExtractRun]]:
    """Asynchronously extract data from one or more files using this agent.

    Queues every file through ``queue_extraction`` and then waits for all of
    the resulting jobs concurrently.

    Args:
        files (Union[FileInput, List[FileInput]]): The files to extract

    Returns:
        Union[ExtractRun, List[ExtractRun]]: The extraction results. A single
        result is returned when ``files`` was not a list.
    """
    single_file = not isinstance(files, list)
    batch = [files] if single_file else files

    # A list is always passed on, so a list of jobs always comes back.
    jobs = await self.queue_extraction(batch)

    waiters = [self._wait_for_job_result(job.id) for job in jobs]
    with augment_async_errors():
        results = await run_jobs(
            waiters,
            workers=self.num_workers,
            desc="Extracting files",
            show_progress=self.show_progress,
        )

    if single_file:
        return results[0]
    return results
|
||||
|
||||
def extract(
    self, files: Union[FileInput, List[FileInput]]
) -> Union[ExtractRun, List[ExtractRun]]:
    """Synchronously extract data from one or more files using this agent.

    Blocking counterpart of ``aextract``: the coroutine is handed to
    ``_run_in_thread`` and its result is returned unchanged.

    Args:
        files (Union[FileInput, List[FileInput]]): The files to extract

    Returns:
        Union[ExtractRun, List[ExtractRun]]: The extraction results
    """
    extraction = self.aextract(files)
    return self._run_in_thread(extraction)
|
||||
|
||||
def get_extraction_job(self, job_id: str) -> ExtractJob:
    """Fetch a single extraction job by its identifier.

    Args:
        job_id (str): The job_id to get the extraction job for

    Returns:
        ExtractJob: The extraction job
    """
    job_lookup = self._client.llama_extract.get_job(job_id=job_id)
    return self._run_in_thread(job_lookup)
|
||||
|
||||
def get_extraction_run_for_job(self, job_id: str) -> ExtractRun:
    """Fetch the extraction run that belongs to a given job.

    Args:
        job_id (str): The job_id to get the extraction run for

    Returns:
        ExtractRun: The extraction run
    """
    run_lookup = self._client.llama_extract.get_run_by_job_id(job_id=job_id)
    return self._run_in_thread(run_lookup)
|
||||
|
||||
def list_extraction_runs(self) -> List[ExtractRun]:
    """List the extraction runs recorded for this extraction agent.

    Returns:
        List[ExtractRun]: List of extraction runs
    """
    runs_lookup = self._client.llama_extract.list_extract_runs(
        extraction_agent_id=self.id,
    )
    return self._run_in_thread(runs_lookup)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"ExtractionAgent(id={self.id}, name={self.name})"
|
||||
|
||||
|
||||
class LlamaExtract(BaseComponent):
    """Factory class for creating and managing extraction agents.

    Holds the API credentials together with the polling and concurrency
    settings, and passes those settings on to every ``ExtractionAgent`` it
    hands out. Remote calls go through one shared ``AsyncLlamaCloud`` client
    and are executed on a private thread pool (see ``_run_in_thread``).
    """

    api_key: str = Field(description="The API key for the LlamaExtract API.")
    base_url: str = Field(description="The base URL of the LlamaExtract API.")
    check_interval: int = Field(
        default=1,
        description="The interval in seconds to check if the extraction is done.",
    )
    max_timeout: int = Field(
        default=2000,
        description="The maximum timeout in seconds to wait for the extraction to finish.",
    )
    num_workers: int = Field(
        default=4,
        gt=0,
        lt=10,
        description="The number of workers to use sending API requests for extraction.",
    )
    show_progress: bool = Field(
        default=True, description="Show progress when extracting multiple files."
    )
    verbose: bool = Field(
        default=False, description="Show verbose output when extracting files."
    )
    _async_client: AsyncLlamaCloud = PrivateAttr()
    _thread_pool: ThreadPoolExecutor = PrivateAttr()
    _project_id: Optional[str] = PrivateAttr()
    _organization_id: Optional[str] = PrivateAttr()

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        check_interval: int = 1,
        max_timeout: int = 2000,
        num_workers: int = 4,
        show_progress: bool = True,
        project_id: Optional[str] = None,
        organization_id: Optional[str] = None,
        verbose: bool = False,
    ):
        """Create the factory and resolve credentials and project.

        Args:
            api_key (Optional[str]): API key; falls back to the
                ``LLAMA_CLOUD_API_KEY`` environment variable.
            base_url (Optional[str]): API base URL; falls back to
                ``LLAMA_CLOUD_BASE_URL`` and then ``DEFAULT_BASE_URL``.
            check_interval (int): Seconds between job status checks.
            max_timeout (int): Maximum seconds to wait for an extraction.
            num_workers (int): Number of concurrent API workers.
            show_progress (bool): Whether to show progress for batches.
            project_id (Optional[str]): Project to use; falls back to
                ``LLAMA_CLOUD_PROJECT_ID`` and then to the project flagged as
                default in the remote project list.
            organization_id (Optional[str]): Organization to use, if any.
            verbose (bool): Whether to print verbose output.

        Raises:
            ValueError: If no API key is available, or if no project id is
                given and no default project exists.
        """
        if not api_key:
            api_key = os.getenv("LLAMA_CLOUD_API_KEY", None)
            # An unset or empty variable cannot authenticate, so reject both.
            if not api_key:
                raise ValueError("The API key is required.")

        if not base_url:
            base_url = os.getenv("LLAMA_CLOUD_BASE_URL", None) or DEFAULT_BASE_URL

        super().__init__(
            api_key=api_key,
            base_url=base_url,
            check_interval=check_interval,
            max_timeout=max_timeout,
            num_workers=num_workers,
            show_progress=show_progress,
            verbose=verbose,
        )

        self._async_client = AsyncLlamaCloud(
            token=self.api_key, base_url=self.base_url, timeout=None
        )
        self._thread_pool = ThreadPoolExecutor(
            max_workers=min(10, (os.cpu_count() or 1) + 4)
        )
        # Fetch default project id if not provided
        if not project_id:
            project_id = os.getenv("LLAMA_CLOUD_PROJECT_ID", None)
            if not project_id:
                print("No project_id provided, fetching default project.")
                projects: List[Project] = self._run_in_thread(
                    self._async_client.projects.list_projects()
                )
                default_project = [p for p in projects if p.is_default]
                if not default_project:
                    raise ValueError(
                        "No default project found. Please provide a project_id."
                    )
                project_id = default_project[0].id

        self._project_id = project_id
        self._organization_id = organization_id

    def _run_in_thread(self, coro: Coroutine[Any, Any, T]) -> T:
        """Run coroutine in a separate thread to avoid event loop issues.

        The coroutine is executed with ``asyncio.run`` on a worker of the
        private thread pool. For the duration of the call the shared client's
        ``httpx_client`` is replaced by a fresh ``httpx.AsyncClient`` created
        with the same timeout, and the original is restored afterwards. The
        calling thread blocks until the result is available.

        Args:
            coro (Coroutine[Any, Any, T]): The coroutine to execute.

        Returns:
            T: The coroutine's result.
        """

        def run_coro() -> T:
            # Create a new client for this thread
            async def wrapped_coro() -> T:
                async with httpx.AsyncClient(
                    timeout=self._async_client._client_wrapper.httpx_client.timeout,
                ) as client:
                    # Replace the client in the coro's context
                    # NOTE(review): this swaps an attribute on the shared client
                    # wrapper; overlapping calls from several threads could
                    # interleave the swap and restore - confirm callers serialize.
                    original_client = self._async_client._client_wrapper.httpx_client
                    self._async_client._client_wrapper.httpx_client = client
                    try:
                        return await coro
                    finally:
                        self._async_client._client_wrapper.httpx_client = (
                            original_client
                        )

            return asyncio.run(wrapped_coro())

        return self._thread_pool.submit(run_coro).result()

    def _build_agent(self, agent: Any) -> ExtractionAgent:
        """Wrap a raw agent record in an ``ExtractionAgent`` carrying this factory's settings."""
        return ExtractionAgent(
            client=self._async_client,
            agent=agent,
            project_id=self._project_id,
            organization_id=self._organization_id,
            check_interval=self.check_interval,
            max_timeout=self.max_timeout,
            num_workers=self.num_workers,
            show_progress=self.show_progress,
            verbose=self.verbose,
        )

    def create_agent(
        self,
        name: str,
        data_schema: SchemaInput,
        config: Optional[ExtractConfig] = None,
    ) -> ExtractionAgent:
        """Create a new extraction agent.

        Args:
            name (str): The name of the extraction agent
            data_schema (SchemaInput): The data schema for the extraction agent,
                either a JSON-schema dictionary or a Pydantic model class
            config (Optional[ExtractConfig]): The extraction config for the agent;
                ``DEFAULT_EXTRACT_CONFIG`` is used when omitted

        Returns:
            ExtractionAgent: The created extraction agent

        Raises:
            ValueError: If ``data_schema`` is neither a dictionary nor a
                Pydantic model class.
        """
        if isinstance(data_schema, dict):
            schema = data_schema
        elif isinstance(data_schema, type) and issubclass(data_schema, BaseModel):
            # issubclass() raises TypeError for non-class values, so the
            # isinstance(type) guard routes those to the ValueError below.
            schema = data_schema.model_json_schema()
        else:
            raise ValueError(
                "data_schema must be either a dictionary or a Pydantic model"
            )

        agent = self._run_in_thread(
            self._async_client.llama_extract.create_extraction_agent(
                name=name,
                data_schema=schema,
                config=config or DEFAULT_EXTRACT_CONFIG,
                project_id=self._project_id,
                organization_id=self._organization_id,
            )
        )

        return self._build_agent(agent)

    def get_agent(
        self,
        name: Optional[str] = None,
        id: Optional[str] = None,
    ) -> ExtractionAgent:
        """Get an extraction agent by name or by extraction agent ID.

        When both are given, ``id`` takes precedence and a warning is emitted.

        Args:
            name (Optional[str]): Name of the agent to look up
            id (Optional[str]): Extraction agent ID to look up

        Returns:
            ExtractionAgent: The extraction agent

        Raises:
            ValueError: If neither ``name`` nor ``id`` is provided.
        """
        if id is not None and name is not None:
            warnings.warn(
                "Both name and extraction_agent_id are provided. Using extraction_agent_id."
            )

        if id:
            agent = self._run_in_thread(
                self._async_client.llama_extract.get_extraction_agent(
                    extraction_agent_id=id,
                )
            )
        elif name:
            agent = self._run_in_thread(
                self._async_client.llama_extract.get_extraction_agent_by_name(
                    name=name,
                    project_id=self._project_id,
                )
            )
        else:
            raise ValueError("Either name or extraction_agent_id must be provided.")

        return self._build_agent(agent)

    def list_agents(self) -> List[ExtractionAgent]:
        """List all available extraction agents in the configured project."""
        agents = self._run_in_thread(
            self._async_client.llama_extract.list_extraction_agents(
                project_id=self._project_id,
            )
        )

        return [self._build_agent(agent) for agent in agents]

    def delete_agent(self, agent_id: str) -> None:
        """Delete an extraction agent by ID.

        Args:
            agent_id (str): ID of the extraction agent to delete
        """
        self._run_in_thread(
            self._async_client.llama_extract.delete_extraction_agent(
                extraction_agent_id=agent_id
            )
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: fetch (or create) a throwaway agent, extract one
    # sample PDF from the test data, remove the agent, and show the result.
    from dotenv import load_dotenv

    load_dotenv()

    data_dir = Path(__file__).parent.parent / "tests" / "data"
    extractor = LlamaExtract()
    try:
        agent = extractor.get_agent(name="test-agent")
    except Exception:
        # Any lookup failure is treated as "agent does not exist yet".
        demo_schema = {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "summary": {"type": "string"},
            },
        }
        agent = extractor.create_agent("test-agent", demo_schema)
    sample_pdf = data_dir / "slide" / "conocophilips.pdf"
    results = agent.extract(sample_pdf)
    extractor.delete_agent(agent.id)
    print(results)
|
||||
@@ -1,36 +0,0 @@
|
||||
from typing import Any, Dict, List, Union, Generator
|
||||
import asyncio
|
||||
from llama_index.core.async_utils import asyncio_run
|
||||
from contextlib import contextmanager
|
||||
|
||||
# Asyncio error messages
# Substring searched for in a RuntimeError's text to recognise a nested event loop.
nest_asyncio_err = "cannot be called from a running event loop"
# Replacement message that tells the user how to work around the nested loop.
nest_asyncio_msg = (
    "The event loop is already running. "
    "Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue."
)
|
||||
|
||||
|
||||
def is_jupyter() -> bool:
    """Report whether the active IPython shell is a ``ZMQInteractiveShell``.

    Returns ``False`` when IPython cannot be imported or the lookup raises
    ``AttributeError``.
    """
    try:
        from IPython import get_ipython

        shell_name = get_ipython().__class__.__name__
    except (ImportError, AttributeError):
        return False
    return shell_name == "ZMQInteractiveShell"
|
||||
|
||||
|
||||
@contextmanager
def augment_async_errors() -> Generator[None, None, None]:
    """Context manager to add helpful information for errors due to nested event loops.

    A ``RuntimeError`` whose text contains ``nest_asyncio_err`` is replaced by
    a ``RuntimeError`` carrying ``nest_asyncio_msg``, chained to the original
    error. Every other exception propagates unchanged.

    Raises:
        RuntimeError: With the nest_asyncio hint when a nested event loop is
            detected; otherwise the original ``RuntimeError`` is re-raised.
    """
    try:
        yield
    except RuntimeError as e:
        if nest_asyncio_err in str(e):
            # Chain explicitly so the original error is recorded as the cause.
            raise RuntimeError(nest_asyncio_msg) from e
        raise
|
||||
|
||||
|
||||
# Any value that can appear in a decoded JSON document.
JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]
# A JSON object: string keys mapped to arbitrary JSON values.
JSONObjectType = Dict[str, JSONType]
|
||||
Generated
-4315
File diff suppressed because it is too large
Load Diff
@@ -1,44 +0,0 @@
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.mypy]
|
||||
files = ["llama_extract"]
|
||||
python_version = "3.9"
|
||||
|
||||
[tool.poetry]
|
||||
name = "llama-extract"
|
||||
version = "0.1.1"
|
||||
description = "Structured data extraction from files."
|
||||
authors = ["Logan Markewich <logan@runllama.ai>", "Neeraj Pradhan <neeraj@llamaindex.ai>"]
|
||||
license = "MIT"
|
||||
readme = "README.md"
|
||||
packages = [{include = "llama_extract"}]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.9,<4.0"
|
||||
llama-index-core = "^0.11.0"
|
||||
llama-cloud = "0.1.13"
|
||||
python-dotenv = "^1.0.1"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^8.0.0"
|
||||
ipykernel = "^6.29.0"
|
||||
pre-commit = "3.2.0"
|
||||
autoevals = "^0.0.114"
|
||||
deepdiff = "^8.1.1"
|
||||
ipython = "^8.12.3"
|
||||
jupyter = "^1.1.1"
|
||||
pytest-asyncio = {version = "^0.25.2", python = ">=3.9,<4.0"}
|
||||
mypy = "^1.14.1"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "strict"
|
||||
asyncio_default_fixture_loop_scope = "function"
|
||||
|
||||
[tool.ruff.format]
|
||||
line-ending = "auto"
|
||||
skip-magic-trailing-comma = false
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = ["W292"]
|
||||
Binary file not shown.
@@ -1,37 +0,0 @@
|
||||
{
|
||||
"receiptNumber": "27215058",
|
||||
"invoiceNumber": "87B37C90152",
|
||||
"datePaid": "2024-07-19",
|
||||
"paymentMethod": {
|
||||
"type": "visa",
|
||||
"lastFourDigits": "7267"
|
||||
},
|
||||
"merchant": {
|
||||
"name": "Noisebridge",
|
||||
"address": {
|
||||
"street": "272 Capp St",
|
||||
"city": "San Francisco",
|
||||
"state": "California",
|
||||
"postalCode": "94110",
|
||||
"country": "United States"
|
||||
},
|
||||
"phone": "1 6507017829",
|
||||
"email": "treasurer+stripe@noisebridge.net"
|
||||
},
|
||||
"billTo": "noisebridge@seldo.com",
|
||||
"items": [
|
||||
{
|
||||
"description": "$10 / month",
|
||||
"quantity": 1,
|
||||
"unitPrice": 10.0,
|
||||
"amount": 10.0,
|
||||
"period": {
|
||||
"start": "2024-07-19",
|
||||
"end": "2024-08-19"
|
||||
}
|
||||
}
|
||||
],
|
||||
"subtotal": 10.0,
|
||||
"total": 10.0,
|
||||
"amountPaid": 10.0
|
||||
}
|
||||
@@ -1,135 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"receiptNumber",
|
||||
"datePaid",
|
||||
"total",
|
||||
"items"
|
||||
],
|
||||
"properties": {
|
||||
"receiptNumber": {
|
||||
"type": "string"
|
||||
},
|
||||
"invoiceNumber": {
|
||||
"type": "string"
|
||||
},
|
||||
"datePaid": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
},
|
||||
"paymentMethod": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"visa",
|
||||
"mastercard",
|
||||
"amex",
|
||||
"cash",
|
||||
"other"
|
||||
]
|
||||
},
|
||||
"lastFourDigits": {
|
||||
"type": "string",
|
||||
"pattern": "^[0-9]{4}$"
|
||||
}
|
||||
}
|
||||
},
|
||||
"merchant": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"address": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"street": {
|
||||
"type": "string"
|
||||
},
|
||||
"city": {
|
||||
"type": "string"
|
||||
},
|
||||
"state": {
|
||||
"type": "string"
|
||||
},
|
||||
"postalCode": {
|
||||
"type": "string"
|
||||
},
|
||||
"country": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"phone": {
|
||||
"type": "string"
|
||||
},
|
||||
"email": {
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
}
|
||||
}
|
||||
},
|
||||
"billTo": {
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
},
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"description",
|
||||
"quantity",
|
||||
"unitPrice",
|
||||
"amount",
|
||||
"period"
|
||||
],
|
||||
"properties": {
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"quantity": {
|
||||
"type": "integer",
|
||||
"minimum": 1
|
||||
},
|
||||
"unitPrice": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"amount": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"period": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
},
|
||||
"end": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"subtotal": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"total": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
},
|
||||
"amountPaid": {
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,200 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"title": "Resume Schema",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"basics",
|
||||
"skills",
|
||||
"experience"
|
||||
],
|
||||
"properties": {
|
||||
"basics": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"name",
|
||||
"email"
|
||||
],
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"email": {
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
},
|
||||
"phone": {
|
||||
"type": "string"
|
||||
},
|
||||
"location": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string"
|
||||
},
|
||||
"region": {
|
||||
"type": "string"
|
||||
},
|
||||
"country": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"profiles": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"network": {
|
||||
"type": "string"
|
||||
},
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"summary": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"skills": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"category": {
|
||||
"type": "string"
|
||||
},
|
||||
"keywords": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"level": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"beginner",
|
||||
"intermediate",
|
||||
"advanced",
|
||||
"expert"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"experience": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"company",
|
||||
"position",
|
||||
"startDate"
|
||||
],
|
||||
"properties": {
|
||||
"company": {
|
||||
"type": "string"
|
||||
},
|
||||
"position": {
|
||||
"type": "string"
|
||||
},
|
||||
"startDate": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
},
|
||||
"endDate": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
},
|
||||
"highlights": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"technologies": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"education": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"institution",
|
||||
"degree"
|
||||
],
|
||||
"properties": {
|
||||
"institution": {
|
||||
"type": "string"
|
||||
},
|
||||
"degree": {
|
||||
"type": "string"
|
||||
},
|
||||
"field": {
|
||||
"type": "string"
|
||||
},
|
||||
"graduationDate": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
},
|
||||
"gpa": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"certifications": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"issuer": {
|
||||
"type": "string"
|
||||
},
|
||||
"date": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
},
|
||||
"validUntil": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"publications": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"publisher": {
|
||||
"type": "string"
|
||||
},
|
||||
"date": {
|
||||
"type": "string",
|
||||
"format": "date"
|
||||
},
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,300 +0,0 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<style>
|
||||
body {
|
||||
font-family: "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
background: #fff;
|
||||
color: #333;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
.container {
|
||||
display: flex;
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
box-shadow: 0 0 20px rgba(0, 0, 0, 0.1);
|
||||
min-height: 100vh;
|
||||
}
|
||||
|
||||
.sidebar {
|
||||
background: #2c3e50;
|
||||
color: white;
|
||||
padding: 2rem;
|
||||
width: 300px;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
padding: 2rem;
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.profile-name {
|
||||
font-size: 2.5rem;
|
||||
margin: 0;
|
||||
color: #2c3e50;
|
||||
border-bottom: 3px solid #3498db;
|
||||
padding-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.profile-title {
|
||||
font-size: 1.5rem;
|
||||
color: #7f8c8d;
|
||||
margin: 0.5rem 0 2rem 0;
|
||||
}
|
||||
|
||||
.contact-info {
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
.section-title {
|
||||
font-size: 1.2rem;
|
||||
text-transform: uppercase;
|
||||
color: #3498db;
|
||||
margin-bottom: 1rem;
|
||||
letter-spacing: 1px;
|
||||
}
|
||||
|
||||
.sidebar .section-title {
|
||||
color: white;
|
||||
border-bottom: 2px solid #3498db;
|
||||
padding-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.skill-category {
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.skill-list {
|
||||
list-style: none;
|
||||
padding: 0;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.skill-list li {
|
||||
margin-bottom: 0.5rem;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
.experience-item {
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
.company-name {
|
||||
font-weight: bold;
|
||||
color: #2c3e50;
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
|
||||
.job-title {
|
||||
color: #3498db;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.date {
|
||||
color: #7f8c8d;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
.achievements {
|
||||
list-style: disc;
|
||||
padding-left: 1.2rem;
|
||||
margin-top: 0.5rem;
|
||||
}
|
||||
|
||||
.contact-info a {
|
||||
color: white;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.education-item {
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="sidebar">
|
||||
<div class="contact-info">
|
||||
<h2 class="section-title">Contact</h2>
|
||||
<p>sarah.chen@email.com</p>
|
||||
<p>(555) 123-4567</p>
|
||||
<p>San Francisco, CA</p>
|
||||
<p><a href="#">LinkedIn Profile</a></p>
|
||||
</div>
|
||||
|
||||
<div class="skills-section">
|
||||
<h2 class="section-title">Technical Skills</h2>
|
||||
|
||||
<div class="skill-category">
|
||||
<h3>Architecture & Design</h3>
|
||||
<ul class="skill-list">
|
||||
<li>Microservices</li>
|
||||
<li>Event-Driven Architecture</li>
|
||||
<li>Domain-Driven Design</li>
|
||||
<li>REST APIs</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="skill-category">
|
||||
<h3>Cloud Platforms</h3>
|
||||
<ul class="skill-list">
|
||||
<li>AWS (Advanced)</li>
|
||||
<li>Azure</li>
|
||||
<li>Google Cloud Platform</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="skill-category">
|
||||
<h3>Programming</h3>
|
||||
<ul class="skill-list">
|
||||
<li>Java</li>
|
||||
<li>Python</li>
|
||||
<li>Go</li>
|
||||
<li>JavaScript/TypeScript</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="skill-category">
|
||||
<h3>Certifications</h3>
|
||||
<ul class="skill-list">
|
||||
<li>AWS Solutions Architect - Professional</li>
|
||||
<li>Google Cloud Architect</li>
|
||||
<li>Certified Kubernetes Administrator</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="main-content">
|
||||
<h1 class="profile-name">Sarah Chen</h1>
|
||||
<div class="profile-title">Senior Software Architect</div>
|
||||
|
||||
<div class="section">
|
||||
<h2 class="section-title">Professional Summary</h2>
|
||||
<p>
|
||||
Innovative Software Architect with over 12 years of experience
|
||||
designing and implementing large-scale distributed systems. Proven
|
||||
track record of leading technical teams and delivering robust
|
||||
enterprise solutions. Expert in cloud architecture, microservices,
|
||||
and emerging technologies with a focus on scalable, maintainable
|
||||
systems.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2 class="section-title">Professional Experience</h2>
|
||||
|
||||
<div class="experience-item">
|
||||
<div class="company-name">TechCorp Solutions</div>
|
||||
<div class="job-title">Senior Software Architect</div>
|
||||
<div class="date">2020 - Present</div>
|
||||
<ul class="achievements">
|
||||
<li>
|
||||
Led architectural design and implementation of a cloud-native
|
||||
platform serving 2M+ users
|
||||
</li>
|
||||
<li>
|
||||
Established architectural guidelines and best practices adopted
|
||||
across 12 development teams
|
||||
</li>
|
||||
<li>
|
||||
Reduced system latency by 40% through implementation of
|
||||
event-driven architecture
|
||||
</li>
|
||||
<li>
|
||||
Mentored 15+ senior developers in cloud-native development
|
||||
practices
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="experience-item">
|
||||
<div class="company-name">DataFlow Systems</div>
|
||||
<div class="job-title">Lead Software Engineer</div>
|
||||
<div class="date">2016 - 2020</div>
|
||||
<ul class="achievements">
|
||||
<li>
|
||||
Architected and led development of distributed data processing
|
||||
platform handling 5TB daily
|
||||
</li>
|
||||
<li>
|
||||
Designed microservices architecture reducing deployment time by
|
||||
65%
|
||||
</li>
|
||||
<li>
|
||||
Led migration of legacy monolith to cloud-native architecture
|
||||
</li>
|
||||
<li>
|
||||
Managed team of 8 engineers across 3 international locations
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="experience-item">
|
||||
<div class="company-name">InnovateTech</div>
|
||||
<div class="job-title">Senior Software Engineer</div>
|
||||
<div class="date">2013 - 2016</div>
|
||||
<ul class="achievements">
|
||||
<li>
|
||||
Developed high-performance trading platform processing 100K
|
||||
transactions per second
|
||||
</li>
|
||||
<li>
|
||||
Implemented real-time analytics engine reducing processing
|
||||
latency by 75%
|
||||
</li>
|
||||
<li>
|
||||
Led adoption of container orchestration reducing deployment
|
||||
costs by 35%
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2 class="section-title">Education</h2>
|
||||
|
||||
<div class="education-item">
|
||||
<div class="company-name">Stanford University</div>
|
||||
<div class="job-title">Master of Science in Computer Science</div>
|
||||
<div class="date">2013</div>
|
||||
<p>Focus: Distributed Systems and Machine Learning</p>
|
||||
</div>
|
||||
|
||||
<div class="education-item">
|
||||
<div class="company-name">University of California, Berkeley</div>
|
||||
<div class="job-title">
|
||||
Bachelor of Science in Computer Engineering
|
||||
</div>
|
||||
<div class="date">2011</div>
|
||||
<p>Magna Cum Laude</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2 class="section-title">Patents & Speaking</h2>
|
||||
<ul class="achievements">
|
||||
<li>
|
||||
Co-inventor on three patents for distributed systems architecture
|
||||
</li>
|
||||
<li>
|
||||
Published paper on "Scalable Microservices Architecture" at IEEE
|
||||
Cloud Computing Conference 2022
|
||||
</li>
|
||||
<li>
|
||||
Keynote Speaker, CloudCon 2023: "Future of Cloud-Native
|
||||
Architecture"
|
||||
</li>
|
||||
<li>Regular presenter at local tech meetups and conferences</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
@@ -1,104 +0,0 @@
|
||||
{
|
||||
"basics": {
|
||||
"name": "Sarah Chen",
|
||||
"email": "san.francisco@email.com",
|
||||
"phone": "(555) 123-4567",
|
||||
"location": {
|
||||
"city": "San Francisco",
|
||||
"region": "CA",
|
||||
"country": "USA"
|
||||
}
|
||||
},
|
||||
"skills": [
|
||||
{
|
||||
"category": "Architecture & Design",
|
||||
"keywords": [
|
||||
"Microservices",
|
||||
"Event-Driven Architecture",
|
||||
"Domain-Driven Design",
|
||||
"REST APIs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"category": "Cloud Platforms",
|
||||
"keywords": [
|
||||
"AWS",
|
||||
"Azure",
|
||||
"Google Cloud Platform"
|
||||
]
|
||||
},
|
||||
{
|
||||
"category": "Programming Languages",
|
||||
"keywords": [
|
||||
"Java",
|
||||
"Python",
|
||||
"Go",
|
||||
"JavaScript",
|
||||
"TypeScript"
|
||||
]
|
||||
}
|
||||
],
|
||||
"experience": [
|
||||
{
|
||||
"company": "TechCorp Solutions",
|
||||
"position": "Senior Software Architect",
|
||||
"startDate": "2020-01-01",
|
||||
"endDate": "2024-01-10"
|
||||
},
|
||||
{
|
||||
"company": "DataFlow Systems",
|
||||
"position": "Lead Software Engineer",
|
||||
"startDate": "2016-01-01",
|
||||
"endDate": "2019-12-31",
|
||||
"technologies": [
|
||||
"Distributed Systems",
|
||||
"Microservices",
|
||||
"Cloud Migration"
|
||||
]
|
||||
},
|
||||
{
|
||||
"company": "InnovateTech",
|
||||
"position": "Senior Software Engineer",
|
||||
"startDate": "2013-01-01",
|
||||
"endDate": "2015-12-31",
|
||||
"technologies": [
|
||||
"High-performance Computing",
|
||||
"Real-time Analytics",
|
||||
"Container Orchestration"
|
||||
]
|
||||
}
|
||||
],
|
||||
"education": [
|
||||
{
|
||||
"institution": "Stanford University",
|
||||
"degree": "Master of Science",
|
||||
"field": "Computer Science",
|
||||
"graduationDate": "2013-01-01",
|
||||
"specialization": "Distributed Systems and Machine Learning"
|
||||
},
|
||||
{
|
||||
"institution": "University of California, Berkeley",
|
||||
"degree": "Bachelor of Science",
|
||||
"field": "Computer Engineering",
|
||||
"graduationDate": "2011-01-01"
|
||||
}
|
||||
],
|
||||
"certifications": [
|
||||
{
|
||||
"name": "AWS Solutions Architect - Professional"
|
||||
},
|
||||
{
|
||||
"name": "Google Cloud Architect"
|
||||
},
|
||||
{
|
||||
"name": "Certified Kubernetes Administrator"
|
||||
}
|
||||
],
|
||||
"publications": [
|
||||
{
|
||||
"title": "Scalable Microservices Architecture",
|
||||
"publisher": "IEEE Cloud Computing Conference",
|
||||
"date": "2022-01-01"
|
||||
}
|
||||
]
|
||||
}
|
||||
Binary file not shown.
@@ -1,48 +0,0 @@
|
||||
{
|
||||
"companyInfo": {
|
||||
"name": "CloudFlow Analytics",
|
||||
"fundingStage": "Series A",
|
||||
"foundedYear": null,
|
||||
"industry": null,
|
||||
"location": null
|
||||
},
|
||||
"financialMetrics": {
|
||||
"mrr": {
|
||||
"value": 580000,
|
||||
"currency": "USD",
|
||||
"growthRate": 27
|
||||
},
|
||||
"grossMargin": 88
|
||||
},
|
||||
"growthMetrics": {
|
||||
"customers": {
|
||||
"total": 1247,
|
||||
"growth": 142,
|
||||
"enterprisePercent": null
|
||||
},
|
||||
"nrr": 147
|
||||
},
|
||||
"marketMetrics": {
|
||||
"tam": 50000000000,
|
||||
"sam": null,
|
||||
"marketShare": null,
|
||||
"competitors": null
|
||||
},
|
||||
"differentiators": [
|
||||
{
|
||||
"claim": "Processing Speed",
|
||||
"metric": "5x faster",
|
||||
"comparisonTarget": "competitors"
|
||||
},
|
||||
{
|
||||
"claim": "ML Accuracy",
|
||||
"metric": "99.9%",
|
||||
"comparisonTarget": null
|
||||
},
|
||||
{
|
||||
"claim": "Market Potential",
|
||||
"metric": "80%",
|
||||
"comparisonTarget": "Fortune 500"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,151 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"companyInfo",
|
||||
"financialMetrics",
|
||||
"growthMetrics"
|
||||
],
|
||||
"properties": {
|
||||
"companyInfo": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"name",
|
||||
"fundingStage"
|
||||
],
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"fundingStage": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"Pre-seed",
|
||||
"Seed",
|
||||
"Series A",
|
||||
"Series B",
|
||||
"Series C+"
|
||||
]
|
||||
},
|
||||
"foundedYear": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "integer"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"industry": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"location": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"financialMetrics": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"mrr",
|
||||
"grossMargin"
|
||||
],
|
||||
"properties": {
|
||||
"mrr": {
|
||||
"type": "object",
|
||||
"description": "Monthly Recurring Revenue",
|
||||
"required": [
|
||||
"value",
|
||||
"currency",
|
||||
"growthRate"
|
||||
],
|
||||
"properties": {
|
||||
"value": {
|
||||
"type": "number"
|
||||
},
|
||||
"currency": {
|
||||
"type": "string"
|
||||
},
|
||||
"growthRate": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"grossMargin": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"growthMetrics": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"customers",
|
||||
"nrr"
|
||||
],
|
||||
"properties": {
|
||||
"customers": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"total",
|
||||
"growth"
|
||||
],
|
||||
"properties": {
|
||||
"total": {
|
||||
"type": "integer"
|
||||
},
|
||||
"growth": {
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nrr": {
|
||||
"description": "Net Revenue Retention",
|
||||
"type": "number"
|
||||
}
|
||||
}
|
||||
},
|
||||
"differentiators": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"claim",
|
||||
"metric"
|
||||
],
|
||||
"properties": {
|
||||
"claim": {
|
||||
"type": "string"
|
||||
},
|
||||
"metric": {
|
||||
"type": "string"
|
||||
},
|
||||
"comparisonTarget": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,149 +0,0 @@
|
||||
import os
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from llama_extract import LlamaExtract, ExtractionAgent
|
||||
from dotenv import load_dotenv
|
||||
from time import perf_counter
|
||||
from collections import namedtuple
|
||||
import json
|
||||
import uuid
|
||||
from llama_cloud.core.api_error import ApiError
|
||||
from llama_cloud.types import (
|
||||
ExtractConfig,
|
||||
ExtractMode,
|
||||
LlamaParseParameters,
|
||||
LlamaExtractSettings,
|
||||
)
|
||||
|
||||
load_dotenv(Path(__file__).parent.parent / ".env.dev", override=True)
|
||||
|
||||
|
||||
TEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
# Get configuration from environment
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL")
|
||||
LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID")
|
||||
|
||||
TestCase = namedtuple(
|
||||
"TestCase", ["name", "schema_path", "config", "input_file", "expected_output"]
|
||||
)
|
||||
|
||||
|
||||
def get_test_cases():
    """Get all test cases from TEST_DIR.

    Every sub-directory of TEST_DIR that contains a ``schema.json`` is treated
    as one data type. Each other regular file in it is an input document and
    is paired with ``<stem>.test.json`` as its expected output; inputs without
    such a file are skipped. One test case is produced per input file and per
    extraction mode (FAST, then ACCURATE).

    Returns:
        List[TestCase]: List of test cases, ordered by data type, then input
        file, then extraction mode.
    """
    # The same two configs apply to every input file, so build them once.
    settings = [
        ExtractConfig(extraction_mode=ExtractMode.FAST),
        ExtractConfig(extraction_mode=ExtractMode.ACCURATE),
    ]
    test_cases = []

    # os.listdir() returns entries in arbitrary order; sort so that the
    # generated parametrization is the same on every run and machine.
    for data_type in sorted(os.listdir(TEST_DIR)):
        data_type_dir = os.path.join(TEST_DIR, data_type)
        if not os.path.isdir(data_type_dir):
            continue

        schema_path = os.path.join(data_type_dir, "schema.json")
        if not os.path.exists(schema_path):
            continue

        # Inputs are all regular files except the schema and expected outputs.
        input_files = [
            os.path.join(data_type_dir, file)
            for file in os.listdir(data_type_dir)
            if file != "schema.json"
            and not file.endswith(".test.json")
            and os.path.isfile(os.path.join(data_type_dir, file))
        ]

        for input_file in sorted(input_files):
            base_name = os.path.splitext(os.path.basename(input_file))[0]
            expected_output = os.path.join(data_type_dir, f"{base_name}.test.json")

            # An input without an expected-output file is not a test case.
            if not os.path.exists(expected_output):
                continue

            test_name = f"{data_type}/{os.path.basename(input_file)}"
            for setting in settings:
                test_cases.append(
                    TestCase(
                        name=test_name,
                        schema_path=schema_path,
                        input_file=input_file,
                        config=setting,
                        expected_output=expected_output,
                    )
                )

    return test_cases
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def extractor():
    """Provide one LlamaExtract client that is shared by the whole test session."""
    client_kwargs = dict(
        api_key=LLAMA_CLOUD_API_KEY,
        base_url=LLAMA_CLOUD_BASE_URL,
        project_id=LLAMA_CLOUD_PROJECT_ID,
        verbose=True,
    )
    client = LlamaExtract(**client_kwargs)
    yield client
    # Teardown, once the session is over: shut down the client's thread pool.
    client._thread_pool.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture
def extraction_agent(test_case: TestCase, extractor: LlamaExtract):
    """Fixture to create and cleanup extraction agent for each test.

    The agent is built from the test case's JSON schema and extraction config,
    handed to the test, and deleted again once the test has finished.
    """
    # Create unique name with random UUID (important for CI to avoid conflicts)
    unique_id = uuid.uuid4().hex[:8]
    agent_name = f"{test_case.name}_{unique_id}"

    with open(test_case.schema_path, "r") as f:
        schema = json.load(f)

    # Clean up any existing agents with this name
    try:
        agents = extractor.list_agents()
        for agent in agents:
            if agent.name == agent_name:
                extractor.delete_agent(agent.id)
    except Exception as e:
        # Best effort: a failed pre-cleanup must not prevent the test itself.
        print(f"Warning: Failed to cleanup existing agent: {str(e)}")

    # Create new agent
    agent = extractor.create_agent(agent_name, schema, config=test_case.config)
    yield agent

    # Cleanup after test, so that benchmark runs do not leave agents behind.
    # NOTE(review): assumes the awaited benchmark call has completed by the
    # time the test returns - confirm before relying on this teardown.
    try:
        extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {str(e)}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    "CI" in os.environ,
    reason="CI environment is not suitable for benchmarking",
)
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda x: x.name)
@pytest.mark.asyncio(loop_scope="session")
async def test_extraction(
    test_case: TestCase, extraction_agent: ExtractionAgent
) -> None:
    """Benchmark one extraction of the test case's input file.

    Prints the elapsed wall-clock time and the raw result; nothing is asserted.
    """
    started = perf_counter()

    # Both cache flags are set; presumably this forces a fresh parse for every
    # measurement - confirm against the LlamaParseParameters documentation.
    parse_params = LlamaParseParameters(
        invalidate_cache=True,
        do_not_cache=True,
    )
    settings = LlamaExtractSettings(llama_parse_params=parse_params)
    result = await extraction_agent._queue_extraction_test(
        test_case.input_file,
        extract_settings=settings,
    )

    finished = perf_counter()
    print(f"Time taken: {finished - started} seconds")
    print(result)
|
||||
@@ -1,190 +0,0 @@
|
||||
import os
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from pydantic import BaseModel
|
||||
from dotenv import load_dotenv
|
||||
from llama_cloud.core.api_error import ApiError
|
||||
|
||||
from llama_extract import LlamaExtract, ExtractionAgent
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv(Path(__file__).parent.parent / ".env.dev", override=True)
|
||||
|
||||
# Get configuration from environment
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL")
|
||||
LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID")
|
||||
|
||||
# Skip all tests if API key is not set
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not LLAMA_CLOUD_API_KEY, reason="LLAMA_CLOUD_API_KEY not set"
|
||||
)
|
||||
|
||||
|
||||
# Test data
|
||||
# Pydantic model with two string fields, mirroring the dict returned by the
# ``test_schema_dict`` fixture. It is passed through the ``agent_schema``
# marker to create an agent from a model class instead of a plain dict.
# NOTE(review): the ``Test`` prefix may make pytest consider this class for
# collection - confirm it does not produce a collection warning.
class TestSchema(BaseModel):
    title: str
    summary: str
|
||||
|
||||
|
||||
# Test data paths
|
||||
TEST_DIR = Path(__file__).parent / "data"
|
||||
TEST_PDF = TEST_DIR / "slide" / "saas_slide.pdf"
|
||||
|
||||
|
||||
@pytest.fixture
def llama_extract():
    """Build a LlamaExtract client from the environment-provided settings."""
    client_kwargs = dict(
        api_key=LLAMA_CLOUD_API_KEY,
        base_url=LLAMA_CLOUD_BASE_URL,
        project_id=LLAMA_CLOUD_PROJECT_ID,
        verbose=True,
    )
    client = LlamaExtract(**client_kwargs)
    return client
|
||||
|
||||
|
||||
@pytest.fixture
def test_agent_name():
    """Base name used for test agents when no ``agent_name`` marker is set."""
    default_name = "test-api-agent"
    return default_name
|
||||
|
||||
|
||||
@pytest.fixture
def test_schema_dict():
    """JSON schema (as a dict) of an object with string ``title`` and ``summary``."""
    field_names = ("title", "summary")
    properties = {field: {"type": "string"} for field in field_names}
    return {"type": "object", "properties": properties}
|
||||
|
||||
|
||||
@pytest.fixture
def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
    """Yield a freshly created agent for the requesting test and delete it afterwards.

    The agent name is ``<base>_<suffix>``: the base comes from the first
    ``agent_name`` marker on the test (default: ``test_agent_name``), the
    suffix is derived from a hash of the test's node id. The schema comes from
    the first ``agent_schema`` marker (default: ``test_schema_dict``); a tuple
    argument is unwrapped to its first element.
    """
    node = request.node
    # NOTE(review): str hashes are salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so this suffix normally differs between runs.
    suffix = hex(hash(node.nodeid))[-8:]

    # First ``agent_name`` marker wins; otherwise keep the default base name.
    base_name = test_agent_name
    for marker in node.iter_markers("agent_name"):
        base_name = marker.args[0]
        break
    name = f"{base_name}_{suffix}"

    # Same lookup for the schema, unwrapping a one-element tuple if given.
    schema = test_schema_dict
    for marker in node.iter_markers("agent_schema"):
        payload = marker.args[0]
        schema = payload[0] if isinstance(payload, tuple) else payload
        break

    # Remove any agent that already carries this name (best effort).
    try:
        for existing in llama_extract.list_agents():
            if existing.name != name:
                continue
            llama_extract.delete_agent(existing.id)
    except Exception as e:
        print(f"Warning: Failed to cleanup existing agent: {e}")

    agent = llama_extract.create_agent(name=name, data_schema=schema)
    yield agent

    # Teardown: delete the agent created above (best effort).
    try:
        llama_extract.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {e}")
|
||||
|
||||
|
||||
class TestLlamaExtract:
    """Tests for the LlamaExtract client: construction and agent create/get/list."""

    def test_init_without_api_key(self):
        """Constructing the client without any API key raises ValueError."""
        # pop() rather than del: no KeyError if the variable is already absent.
        env_backup = os.environ.pop("LLAMA_CLOUD_API_KEY", None)
        try:
            with pytest.raises(ValueError, match="The API key is required"):
                LlamaExtract(api_key=None, base_url=LLAMA_CLOUD_BASE_URL)
        finally:
            # Restore even when the assertion above fails; otherwise every
            # later test in the session would run without the key.
            if env_backup is not None:
                os.environ["LLAMA_CLOUD_API_KEY"] = env_backup

    @pytest.mark.agent_name("test-dict-schema-agent")
    def test_create_agent_with_dict_schema(self, test_agent):
        """An agent created from the default dict schema is an ExtractionAgent."""
        assert isinstance(test_agent, ExtractionAgent)

    @pytest.mark.agent_name("test-pydantic-schema-agent")
    @pytest.mark.agent_schema((TestSchema,))
    def test_create_agent_with_pydantic_schema(self, test_agent):
        """An agent created from a pydantic model class is an ExtractionAgent."""
        assert isinstance(test_agent, ExtractionAgent)

    def test_get_agent_by_name(self, llama_extract, test_agent):
        """Looking an agent up by name returns the same id, name and schema."""
        agent = llama_extract.get_agent(name=test_agent.name)
        assert isinstance(agent, ExtractionAgent)
        assert agent.name == test_agent.name
        assert agent.id == test_agent.id
        assert agent.data_schema == test_agent.data_schema

    def test_get_agent_by_id(self, llama_extract, test_agent):
        """Looking an agent up by id returns the same id, name and schema."""
        agent = llama_extract.get_agent(id=test_agent.id)
        assert isinstance(agent, ExtractionAgent)
        assert agent.id == test_agent.id
        assert agent.name == test_agent.name
        assert agent.data_schema == test_agent.data_schema

    def test_list_agents(self, llama_extract, test_agent):
        """The agent created for this test appears in the list of agents."""
        agents = llama_extract.list_agents()
        assert isinstance(agents, list)
        assert any(a.id == test_agent.id for a in agents)
|
||||
|
||||
|
||||
class TestExtractionAgent:
    """Extraction and update tests run against the agent from ``test_agent``."""

    @staticmethod
    def _check_successful(result):
        """Assert that ``result`` succeeded and its data dict has title and summary."""
        assert result.status == "SUCCESS"
        assert result.data is not None
        assert isinstance(result.data, dict)
        for expected_key in ("title", "summary"):
            assert expected_key in result.data

    @pytest.mark.asyncio
    async def test_extract_single_file(self, test_agent):
        """Async extraction of one PDF yields a successful result."""
        self._check_successful(await test_agent.aextract(TEST_PDF))

    def test_sync_extract_single_file(self, test_agent):
        """Sync extraction of one PDF yields a successful result."""
        self._check_successful(test_agent.extract(TEST_PDF))

    @pytest.mark.asyncio
    async def test_extract_multiple_files(self, test_agent):
        """Async extraction of a list of files yields one successful result per file."""
        # The same file is submitted twice; only the batch handling matters here.
        batch = [TEST_PDF] * 2
        response = await test_agent.aextract(batch)

        assert len(response) == 2
        for item in response:
            self._check_successful(item)

    def test_save_agent_updates(
        self, test_agent: ExtractionAgent, llama_extract: LlamaExtract
    ):
        """A schema change saved on the agent is visible on a freshly fetched agent."""
        field_names = ("new_field", "title", "summary")
        test_agent.data_schema = {
            "type": "object",
            "properties": {field: {"type": "string"} for field in field_names},
        }
        test_agent.save()

        # Fetch the agent again instead of trusting the local object.
        refreshed = llama_extract.get_agent(name=test_agent.name)
        assert "new_field" in refreshed.data_schema["properties"]

    def test_list_extraction_runs(self, test_agent: ExtractionAgent):
        """A new agent has no runs; after one extraction it has at least one."""
        runs_before = test_agent.list_extraction_runs()
        assert len(runs_before) == 0
        test_agent.extract(TEST_PDF)
        runs_after = test_agent.list_extraction_runs()
        assert len(runs_after) > 0
|
||||
@@ -1,142 +0,0 @@
|
||||
import os
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from llama_extract import LlamaExtract, ExtractionAgent
|
||||
from dotenv import load_dotenv
|
||||
from collections import namedtuple
|
||||
import json
|
||||
import uuid
|
||||
from llama_cloud.core.api_error import ApiError
|
||||
from llama_cloud.types import ExtractConfig, ExtractMode, ExtractConfig
|
||||
from deepdiff import DeepDiff
|
||||
from tests.util import json_subset_match_score
|
||||
|
||||
load_dotenv(Path(__file__).parent.parent / ".env.dev", override=True)
|
||||
|
||||
|
||||
TEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
|
||||
# Get configuration from environment
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL")
|
||||
LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID")
|
||||
|
||||
TestCase = namedtuple(
|
||||
"TestCase", ["name", "schema_path", "config", "input_file", "expected_output"]
|
||||
)
|
||||
|
||||
|
||||
def get_test_cases():
    """Get all test cases from TEST_DIR.

    Every sub-directory of TEST_DIR that contains a ``schema.json`` is treated
    as one data type. Each other regular file in it is an input document and
    is paired with ``<stem>.test.json`` as its expected output; inputs without
    such a file are skipped. One test case is produced per input file and per
    extraction mode (FAST, then ACCURATE).

    Returns:
        List[TestCase]: List of test cases, ordered by data type, then input
        file, then extraction mode.
    """
    # The same two configs apply to every input file, so build them once.
    settings = [
        ExtractConfig(extraction_mode=ExtractMode.FAST),
        ExtractConfig(extraction_mode=ExtractMode.ACCURATE),
    ]
    test_cases = []

    # os.listdir() returns entries in arbitrary order; sort so that the
    # generated parametrization is the same on every run and machine.
    for data_type in sorted(os.listdir(TEST_DIR)):
        data_type_dir = os.path.join(TEST_DIR, data_type)
        if not os.path.isdir(data_type_dir):
            continue

        schema_path = os.path.join(data_type_dir, "schema.json")
        if not os.path.exists(schema_path):
            continue

        # Inputs are all regular files except the schema and expected outputs.
        input_files = [
            os.path.join(data_type_dir, file)
            for file in os.listdir(data_type_dir)
            if file != "schema.json"
            and not file.endswith(".test.json")
            and os.path.isfile(os.path.join(data_type_dir, file))
        ]

        for input_file in sorted(input_files):
            base_name = os.path.splitext(os.path.basename(input_file))[0]
            expected_output = os.path.join(data_type_dir, f"{base_name}.test.json")

            # An input without an expected-output file is not a test case.
            if not os.path.exists(expected_output):
                continue

            test_name = f"{data_type}/{os.path.basename(input_file)}"
            for setting in settings:
                test_cases.append(
                    TestCase(
                        name=test_name,
                        schema_path=schema_path,
                        input_file=input_file,
                        config=setting,
                        expected_output=expected_output,
                    )
                )

    return test_cases
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def extractor():
    """Provide one LlamaExtract client that is shared by the whole test session."""
    client_kwargs = dict(
        api_key=LLAMA_CLOUD_API_KEY,
        base_url=LLAMA_CLOUD_BASE_URL,
        project_id=LLAMA_CLOUD_PROJECT_ID,
        verbose=True,
    )
    client = LlamaExtract(**client_kwargs)
    yield client
    # Teardown, once the session is over: shut down the client's thread pool.
    client._thread_pool.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture
def extraction_agent(test_case: TestCase, extractor: LlamaExtract):
    """Create an agent for one test case, hand it to the test, then delete it."""
    # A random suffix keeps names unique, so parallel CI runs do not collide.
    agent_name = f"{test_case.name}_{uuid.uuid4().hex[:8]}"

    with open(test_case.schema_path, "r") as schema_file:
        schema = json.load(schema_file)

    # Best effort: drop any agent that already carries this name.
    try:
        for existing in extractor.list_agents():
            if existing.name != agent_name:
                continue
            extractor.delete_agent(existing.id)
    except Exception as e:
        print(f"Warning: Failed to cleanup existing agent: {str(e)}")

    agent = extractor.create_agent(agent_name, schema, config=test_case.config)
    yield agent

    # Teardown: delete the agent; a failure is reported but does not fail the test.
    try:
        extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {str(e)}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not os.environ.get("LLAMA_CLOUD_API_KEY", ""),
    reason="LLAMA_CLOUD_API_KEY not set",
)
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda x: x.name)
def test_extraction(test_case: TestCase, extraction_agent: ExtractionAgent) -> None:
    """Extract the input file and compare the data with the expected JSON.

    Only keys present in the expected output are scored; the test passes when
    the match score is above 0.3. On failure the assertion message is the
    DeepDiff between expected and actual data.
    """
    actual = extraction_agent.extract(test_case.input_file).data

    with open(test_case.expected_output, "r") as expected_file:
        expected = json.load(expected_file)

    score = json_subset_match_score(expected, actual)
    # TODO: fix the saas_slide test
    assert score > 0.3, DeepDiff(expected, actual, ignore_order=True)
|
||||
@@ -1,37 +0,0 @@
|
||||
from typing import Any
|
||||
|
||||
from autoevals.string import Levenshtein
|
||||
from autoevals.number import NumericDiff
|
||||
|
||||
|
||||
def json_subset_match_score(expected: Any, actual: Any) -> float:
    """
    Adapted from autoevals.JsonDiff to only test on the subset of keys within the expected json.

    Args:
        expected: Reference JSON value (dict, list, str, number or None).
        actual: Value to score against ``expected``; keys that appear only in
            ``actual`` are ignored.

    Returns:
        The match score:
        - dicts: mean score over the keys of ``expected`` (a key missing from
          ``actual`` is compared against None); an empty ``expected`` dict
          scores 1 because nothing is required.
        - lists: element-wise scores of the zipped pairs, summed and divided
          by the longer length, so missing or surplus items lower the score.
        - strings / numbers: the Levenshtein / NumericDiff scorer result.
        - both None: 1; any other combination, including mismatched types: 0.
    """
    if isinstance(expected, dict) and isinstance(actual, dict):
        if len(expected) == 0:
            # Nothing is expected, so there is nothing to miss. Without this
            # guard an empty expected dict against a non-empty actual dict
            # would divide by zero below.
            return 1.0
        scores = [
            json_subset_match_score(value, actual.get(key))
            for key, value in expected.items()
        ]
        scores = [s for s in scores if s is not None]
        if not scores:
            # Every comparison was unscorable; avoid ZeroDivisionError.
            return 1.0
        return sum(scores) / len(scores)
    elif isinstance(expected, list) and isinstance(actual, list):
        if len(expected) == 0 and len(actual) == 0:
            return 1.0
        scores = [json_subset_match_score(e1, e2) for (e1, e2) in zip(expected, actual)]
        scores = [s for s in scores if s is not None]
        return sum(scores) / max(len(expected), len(actual))
    elif isinstance(expected, str) and isinstance(actual, str):
        # Scorers are created only where needed rather than on every
        # recursive call.
        return Levenshtein().eval(expected, actual).score
    elif isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
        return NumericDiff().eval(expected, actual).score
    elif expected is None and actual is None:
        return 1.0
    else:
        # Exactly one side is None, or the two types do not match.
        return 0.0
|
||||
Reference in New Issue
Block a user