This commit is contained in:
Logan
2025-02-27 18:53:39 -06:00
committed by GitHub
parent 4940131c1d
commit 5c66f2f234
31 changed files with 0 additions and 7928 deletions
-48
View File
@@ -1,48 +0,0 @@
name: Build Package

# Build package on its own without additional pip install

on:
  push:
    branches:
      - main
  pull_request:

env:
  # Poetry release installed by the snok/install-poetry step below.
  POETRY_VERSION: "1.6.1"

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      # You can use PyPy versions in python-version.
      # For example, pypy-2.7 and pypy-3.8
      matrix:
        os: [ubuntu-latest, windows-latest]
        python-version: ["3.9"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          version: ${{ env.POETRY_VERSION }}
      - name: Install deps
        shell: bash
        run: poetry install
      - name: Ensure lock works
        shell: bash
        run: poetry lock
      - name: Build
        shell: bash
        run: poetry build
      - name: Test installing built package
        shell: bash
        run: python -m pip install .
      - name: Test import
        shell: bash
        # Run the import from the runner's temporary directory rather than the
        # checkout, so that `import llama_extract` has to resolve to the
        # pip-installed package and not to the source tree in the workspace.
        # The temp directory is exposed through the `runner` context; the
        # `vars` context only holds user-defined configuration variables.
        working-directory: ${{ runner.temp }}
        run: python -c "import llama_extract"
-81
View File
@@ -1,81 +0,0 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: ["main"]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: ["main"]
  schedule:
    - cron: "30 16 * * 4"

jobs:
  analyze:
    name: Analyze
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners
    # Consider using larger runners for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      fail-fast: false
      matrix:
        language: ["python"]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby', 'swift' ]
        # Use only 'java' to analyze code written in Java, Kotlin or both
        # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # Initializes the CodeQL tools for scanning.
      # The codeql-action steps below are pinned to v3 because v2 has been
      # deprecated by GitHub.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: ${{ matrix.language }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.

          # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
          # queries: security-extended,security-and-quality

      # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
      # If this step fails, then you should remove it and run the build manually (see below)
      - name: Autobuild
        uses: github/codeql-action/autobuild@v3

      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun

      # If the Autobuild fails above, remove it and uncomment the following three lines.
      # Modify them (or add more) to build your code; refer to the example below for guidance.

      # - run: |
      #     echo "Run, Build Application using script"
      #     ./location_of_script_within_repo/buildscript.sh

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:${{matrix.language}}"
-37
View File
@@ -1,37 +0,0 @@
name: Linting

# Runs `make lint` inside the Poetry environment for pushes to main and for
# every pull request.
on:
  pull_request:
  push:
    branches: [main]

env:
  # Poetry release installed by the snok/install-poetry step below.
  POETRY_VERSION: "1.6.1"

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      # You can use PyPy versions in python-version.
      # For example, pypy-2.7 and pypy-3.8
      matrix:
        python-version:
          - "3.9"
    steps:
      - uses: actions/checkout@v3
        with:
          # Pull requests fetch the last two commits; any other event fetches
          # the full history (fetch-depth 0).
          fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}

      - name: Set up python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          version: ${{ env.POETRY_VERSION }}

      - name: Install pre-commit
        shell: bash
        run: poetry run pip install pre-commit

      - name: Run linter
        shell: bash
        run: poetry run make lint
-76
View File
@@ -1,76 +0,0 @@
name: Publish Release

# Publishes the package to PyPI when a `v*` tag is pushed (or on manual
# dispatch), after a freshly dispatched unit-test run on main has succeeded.
on:
  push:
    tags:
      - "v*"
  workflow_dispatch:

jobs:
  publish:
    runs-on: ubuntu-latest
    permissions:
      actions: write # To trigger workflow
      contents: read # To checkout code
    if: github.repository == 'run-llama/llama_extract'
    steps:
      - name: Trigger Unit Tests
        id: trigger_tests
        uses: actions/github-script@v7
        with:
          script: |
            // Record the time *before* dispatching so the wait step can tell
            // the run started here apart from older runs on main.
            core.setOutput('dispatched_at', new Date().toISOString());
            await github.rest.actions.createWorkflowDispatch({
              owner: context.repo.owner,
              repo: context.repo.repo,
              workflow_id: 'unit_test.yml',
              ref: 'main'
            });

      - name: Wait for tests
        uses: actions/github-script@v7
        env:
          DISPATCHED_AT: ${{ steps.trigger_tests.outputs.dispatched_at }}
        with:
          script: |
            const TIMEOUT = 600000; // 10 minutes in milliseconds
            const POLL_INTERVAL = 30000; // Wait 30 seconds between checks
            const CLOCK_SKEW = 10000; // Tolerance between runner and API clocks
            const START_TIME = Date.now();
            // Only runs created at or after this instant can be the run
            // dispatched by the previous step.
            const notBefore = Date.parse(process.env.DISPATCHED_AT) - CLOCK_SKEW;

            while (Date.now() - START_TIME < TIMEOUT) {
              console.log('Checking test status...');
              const runs = await github.rest.actions.listWorkflowRuns({
                owner: context.repo.owner,
                repo: context.repo.repo,
                workflow_id: 'unit_test.yml',
                event: 'workflow_dispatch',
                branch: 'main'
              });

              // Ignore anything that predates our dispatch; an older green run
              // must not be allowed to gate this release.
              const run = runs.data.workflow_runs.find(
                r => Date.parse(r.created_at) >= notBefore
              );

              if (run && run.status === 'completed') {
                if (run.conclusion === 'success') {
                  console.log('Tests passed!');
                  return;
                }
                // failure, cancelled, timed_out, ... all block the release
                // immediately instead of polling until the timeout.
                throw new Error(`Tests did not pass (conclusion: ${run.conclusion})`);
              }

              console.log('...');
              await new Promise(r => setTimeout(r, POLL_INTERVAL));
            }
            throw new Error('Tests did not complete within 10 minutes');

      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python3 -

      - name: Build package
        run: poetry build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@v1.8.14
        with:
          password: ${{ secrets.LLAMA_EXTRACT_PYPI_TOKEN }}
-43
View File
@@ -1,43 +0,0 @@
name: Unit Testing

# Runs the pytest suite for pushes to main, `v*` tags, pull requests and
# manual dispatches.
on:
  workflow_dispatch:
  pull_request:
  push:
    branches: [main]
    tags: ["v*"]

env:
  # Poetry release installed by the snok/install-poetry step below.
  POETRY_VERSION: "1.6.1"

jobs:
  test:
    runs-on: ubuntu-latest
    # Version tags (refs/tags/v*) and manual dispatches select the CI-prod
    # environment; every other trigger selects CI-staging.
    environment: ${{ (startsWith(github.ref, 'refs/tags/v') || github.event_name == 'workflow_dispatch') && 'CI-prod' || 'CI-staging' }}
    strategy:
      # You can use PyPy versions in python-version.
      # For example, pypy-2.7 and pypy-3.8
      matrix:
        python-version:
          - "3.9"
          - "3.10"
          - "3.11"
    steps:
      - uses: actions/checkout@v3
        with:
          # 0 fetches the complete history instead of a shallow clone.
          fetch-depth: 0

      - name: Set up python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          version: ${{ env.POETRY_VERSION }}

      - name: Install deps
        shell: bash
        run: poetry install --with dev

      - name: Run testing
        shell: bash
        env:
          CI: true
        run: poetry run pytest tests
-21
View File
@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 LlamaIndex
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-14
View File
@@ -1,14 +0,0 @@
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)

# None of these targets produce a file named after themselves. Declaring them
# phony keeps them runnable even if a file or directory called `test`, `lint`,
# `format` or `help` appears in the repository.
.PHONY: help format lint test

# Prints every target annotated with a trailing `## description` comment.
help:	## Show all Makefile targets.
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format:	## Run the ruff pre-commit hook over all tracked files.
	pre-commit install
	git ls-files | xargs pre-commit run ruff --files

lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test:	## Run tests via pytest
	pytest tests
-180
View File
@@ -3,183 +3,3 @@
> ⚠️ This project has been moved to [LlamaCloud Services](https://github.com/run-llama/llama_cloud_services/)
> --------
LlamaExtract provides a simple API for extracting structured data from unstructured documents like PDFs, text files and images (upcoming).
## Quick Start
```python
from llama_extract import LlamaExtract
from pydantic import BaseModel, Field
# Initialize client
extractor = LlamaExtract()
# Define schema using Pydantic
class Resume(BaseModel):
name: str = Field(description="Full name of candidate")
email: str = Field(description="Email address")
skills: list[str] = Field(description="Technical skills and technologies")
# Create extraction agent
agent = extractor.create_agent(name="resume-parser", data_schema=Resume)
# Extract data from document
result = agent.extract("resume.pdf")
print(result.data)
```
## Core Concepts
- **Extraction Agents**: Reusable extractors configured with a specific schema and extraction settings.
- **Data Schema**: Structure definition for the data you want to extract.
- **Extraction Jobs**: Asynchronous extraction tasks that can be monitored.
## Defining Schemas
Schemas can be defined using either Pydantic models or JSON Schema:
### Using Pydantic (Recommended)
```python
from pydantic import BaseModel, Field
from typing import List, Optional
class Experience(BaseModel):
company: str = Field(description="Company name")
title: str = Field(description="Job title")
start_date: Optional[str] = Field(description="Start date of employment")
end_date: Optional[str] = Field(description="End date of employment")
class Resume(BaseModel):
name: str = Field(description="Candidate name")
experience: List[Experience] = Field(description="Work history")
```
### Using JSON Schema
```python
schema = {
"type": "object",
"properties": {
"name": {"type": "string", "description": "Candidate name"},
"experience": {
"type": "array",
"description": "Work history",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string",
"description": "Company name",
},
"title": {"type": "string", "description": "Job title"},
"start_date": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"description": "Start date of employment",
},
"end_date": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"description": "End date of employment",
},
},
},
},
},
}
agent = extractor.create_agent(name="resume-parser", data_schema=schema)
```
### Important restrictions on JSON/Pydantic Schema
*LlamaExtract only supports a subset of the JSON Schema specification.* While limited, it should
be sufficient for a wide variety of use-cases.
- All fields are required by default. Nullable fields must be explicitly marked as such,
using `"anyOf"` with a `"null"` type. See `"start_date"` field above.
- Root node must be of type `"object"`.
- Schema nesting must be limited to within 5 levels.
- The important fields are key names/titles, type and description. Fields for
formatting, default values, etc. are not supported.
- There are other restrictions on the number of keys, size of the schema, etc. that you may
hit for complex extraction use cases. In such cases, it is worth thinking about how to restructure
your extraction workflow to fit within these constraints, e.g. by extracting a subset of fields
and later merging them together.
## Other Extraction APIs
### Batch Processing
Process multiple files asynchronously:
```python
# Queue multiple files for extraction
jobs = await agent.queue_extraction(["resume1.pdf", "resume2.pdf"])
# Check job status
for job in jobs:
status = agent.get_extraction_job(job.id).status
print(f"Job {job.id}: {status}")
# Get results when complete
results = [agent.get_extraction_run_for_job(job.id) for job in jobs]
```
### Updating Schemas
Schemas can be modified and updated after creation:
```python
# Update schema
agent.data_schema = new_schema
# Save changes
agent.save()
```
### Managing Agents
```python
# List all agents
agents = extractor.list_agents()
# Get specific agent
agent = extractor.get_agent(name="resume-parser")
# Delete agent
extractor.delete_agent(agent.id)
```
## Installation
```bash
pip install llama-extract==0.1.0
```
## Tips & Best Practices
1. **Schema Design**:
- Try to limit schema nesting to 3-4 levels.
- Make fields optional when data might not always be present. Having required fields may force the model
to hallucinate when these fields are not present in the documents.
- When you want to extract a variable number of entities, use an `array` type. Note that you cannot use
an `array` type for the root node.
- Use descriptive field names and detailed descriptions. Use descriptions to pass formatting
instructions or few-shot examples.
- Start simple and iteratively build your schema to incorporate requirements.
2. **Running Extractions**:
- Note that resetting `agent.data_schema` will not save the schema to the database
until you call `agent.save()`, but it will be used for running extractions.
- Check job status prior to accessing results. Any extraction error should be available as
part of `job.error` or `extraction_run.error` fields for debugging.
- Consider async operations (`queue_extraction`) for large-scale extraction once you have finalized your schema.
## Additional Resources
- [Example Notebook](examples/resume_screening.ipynb) - Detailed walkthrough of resume parsing
- [Discord Community](https://discord.com/invite/eN6D2HQ4aX) - Get help and share feedback
Binary file not shown.
Binary file not shown.
Binary file not shown.
-882
View File
@@ -1,882 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extracting data from resumes\n",
"\n",
"Let us assume that we are running a hiring process for a company and we have received a list of resumes from candidates. We want to extract structured data from the resumes so that we can run a screening process and shortlist candidates. \n",
"\n",
"Take a look at one of the resumes in the `data/resumes` directory. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"600\"\n",
" height=\"400\"\n",
" src=\"./data/resumes/ai_researcher.pdf\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x103a7e950>"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import IFrame\n",
"\n",
"IFrame(src=\"./data/resumes/ai_researcher.pdf\", width=600, height=400)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will notice that all the resumes have different layouts but contain common information like name, email, experience, education, etc. \n",
"\n",
"With LlamaExtract, we will show you how to:\n",
"- *Define* a data schema to extract the information of interest. \n",
"- *Iterate* over the data schema to generalize the schema for multiple resumes.\n",
"- *Finalize* the schema and schedule extractions for multiple resumes.\n",
"\n",
"We will start by defining a `LlamaExtract` client which provides a Python interface to the LlamaExtract API. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"from llama_extract import LlamaExtract\n",
"\n",
"\n",
"# Load environment variables (put LLAMA_CLOUD_API_KEY in your .env file)\n",
"load_dotenv(override=True)\n",
"\n",
"# Optionally, add your project id/organization id\n",
"llama_extract = LlamaExtract()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Defining the data schema\n",
"\n",
"Next, let us try to extract two fields from the resume: `name` and `email`. We can either use a Python dictionary structure to define the `data_schema` as a JSON or use a Pydantic model instead, for brevity and convenience. In either case, our output is guaranteed to validate against this schema."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pydantic import BaseModel, Field\n",
"\n",
"\n",
"class Resume(BaseModel):\n",
" name: str = Field(description=\"The name of the candidate\")\n",
" email: str = Field(description=\"The email address of the candidate\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_cloud.core.api_error import ApiError\n",
"\n",
"try:\n",
" existing_agent = llama_extract.get_agent(name=\"resume-screening\")\n",
" if existing_agent:\n",
" llama_extract.delete_agent(existing_agent.id)\n",
"except ApiError as e:\n",
" if e.status_code == 404:\n",
" pass\n",
" else:\n",
" raise\n",
"\n",
"agent = llama_extract.create_agent(name=\"resume-screening\", data_schema=Resume) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[ExtractionAgent(id=ad801427-d06b-499d-bbe0-6109c5f0646b, name=resume-screening)]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"llama_extract.list_agents()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.19it/s]\n",
"Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00, 1.30s/it]\n",
"Extracting files: 100%|██████████| 1/1 [00:03<00:00, 3.18s/it]\n",
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.23it/s]\n",
"Creating extraction jobs: 100%|██████████| 1/1 [00:03<00:00, 3.09s/it]\n",
"Extracting files: 100%|██████████| 1/1 [00:11<00:00, 11.11s/it]\n",
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.16it/s]\n",
"Creating extraction jobs: 100%|██████████| 1/1 [00:03<00:00, 3.10s/it]\n",
"Extracting files: 100%|██████████| 1/1 [00:09<00:00, 9.87s/it]\n",
"Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.12it/s]\n",
"Creating extraction jobs: 100%|██████████| 1/1 [00:05<00:00, 5.92s/it]\n",
"Extracting files: 100%|██████████| 1/1 [00:12<00:00, 12.05s/it]\n"
]
},
{
"data": {
"text/plain": [
"{'name': 'Dr. Rachel Zhang', 'email': 'rachel.zhang@email.com'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resume = agent.extract(\"./data/resumes/ai_researcher.pdf\")\n",
"resume.data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Iterating over the data schema\n",
"\n",
"Now that we have created a data schema, let us add more fields to the schema. We will add `experience` and `education` fields to the schema. \n",
"- We can create a new Pydantic model for each of these fields and represent `experience` and `education` as lists of these models. Doing this will allow us to extract multiple entities from the resume without having to pre-define how many experiences or education the candidate has. \n",
"- We have added a `description` parameter to provide more context for extraction. We can use `description` to provide example inputs/outputs for the extraction. \n",
"- Note that we have annotated the `start_date` and `end_date` fields with `Optional[str]` to indicate that these fields are optional. This is *important* because the schema will be used to extract data from multiple resumes and not all resumes will have the same format. A field must only be required if it is guaranteed to be present in all the resumes. \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import List, Optional\n",
"\n",
"\n",
"class Education(BaseModel):\n",
" institution: str = Field(description=\"The institution of the candidate\")\n",
" degree: str = Field(description=\"The degree of the candidate\")\n",
" start_date: Optional[str] = Field(\n",
" default=None, description=\"The start date of the candidate's education\"\n",
" )\n",
" end_date: Optional[str] = Field(\n",
" default=None, description=\"The end date of the candidate's education\"\n",
" )\n",
"\n",
"\n",
"class Experience(BaseModel):\n",
" company: str = Field(description=\"The name of the company\")\n",
" title: str = Field(description=\"The title of the candidate\")\n",
" description: Optional[str] = Field(\n",
" default=None, description=\"The description of the candidate's experience\"\n",
" )\n",
" start_date: Optional[str] = Field(\n",
" default=None, description=\"The start date of the candidate's experience\"\n",
" )\n",
" end_date: Optional[str] = Field(\n",
" default=None, description=\"The end date of the candidate's experience\"\n",
" )\n",
"\n",
"\n",
"class Resume(BaseModel):\n",
" name: str = Field(description=\"The name of the candidate\")\n",
" email: str = Field(description=\"The email address of the candidate\")\n",
" links: List[str] = Field(\n",
" description=\"The links to the candidate's social media profiles\"\n",
" )\n",
" experience: List[Experience] = Field(description=\"The candidate's experience\")\n",
" education: List[Education] = Field(description=\"The candidate's education\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will update the `data_schema` for the `resume-screening` agent to use the new `Resume` model. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Dr. Rachel Zhang',\n",
" 'email': 'rachel.zhang@email.com',\n",
" 'links': ['linkedin.com/in/rachelzhang',\n",
" 'github.com/rzhang-ai',\n",
" 'scholar.google.com/rachelzhang'],\n",
" 'experience': [{'company': 'DeepMind',\n",
" 'title': 'Senior Research Scientist',\n",
" 'description': '- Lead researcher on large-scale multi-task learning systems, developing novel architectures that improve cross-task generalization by 40%\\n- Pioneered new approach to zero-shot learning using contrastive training, published in NeurIPS 2023\\n- Built and led team of 6 researchers working on foundational ML models\\n- Developed novel regularization techniques for large language models, reducing catastrophic forgetting by 35%',\n",
" 'start_date': '2019',\n",
" 'end_date': 'Present'},\n",
" {'company': 'Google Research',\n",
" 'title': 'Research Scientist',\n",
" 'description': '- Developed probabilistic frameworks for robust ML, published in ICML 2018\\n- Created novel attention mechanisms for computer vision models, improving accuracy by 25%\\n- Led collaboration with Google Brain team on efficient training methods for transformer models\\n- Mentored 4 PhD interns and collaborated with academic institutions',\n",
" 'start_date': '2015',\n",
" 'end_date': '2019'},\n",
" {'company': 'Columbia University',\n",
" 'title': 'Research Assistant Professor',\n",
" 'description': '- Published seminal work on Bayesian optimization methods (cited 1000+ times)\\n- Taught graduate-level courses in Machine Learning and Statistical Learning Theory\\n- Supervised 5 PhD students and 3 MSc students\\n- Secured $500K in research grants for probabilistic ML research',\n",
" 'start_date': '2011',\n",
" 'end_date': '2015'}],\n",
" 'education': [{'institution': 'Columbia University',\n",
" 'degree': 'Ph.D. in Computer Science',\n",
" 'start_date': '2007',\n",
" 'end_date': '2011'},\n",
" {'institution': 'Stanford University',\n",
" 'degree': 'M.S. in Computer Science',\n",
" 'start_date': '2005',\n",
" 'end_date': '2007'}]}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.data_schema = Resume\n",
"resume = agent.extract(\"./data/resumes/ai_researcher.pdf\")\n",
"resume.data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a good start. Let us add a few more fields to the schema and re-run the extraction. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class TechnicalSkills(BaseModel):\n",
" programming_languages: List[str] = Field(\n",
" description=\"The programming languages the candidate is proficient in.\"\n",
" )\n",
" frameworks: List[str] = Field(\n",
" description=\"The tools/frameworks the candidate is proficient in, e.g. React, Django, PyTorch, etc.\"\n",
" )\n",
" skills: List[str] = Field(\n",
" description=\"Other general skills the candidate is proficient in, e.g. Data Engineering, Machine Learning, etc.\"\n",
" )\n",
"\n",
"\n",
"class Resume(BaseModel):\n",
" name: str = Field(description=\"The name of the candidate\")\n",
" email: str = Field(description=\"The email address of the candidate\")\n",
" links: List[str] = Field(\n",
" description=\"The links to the candidate's social media profiles\"\n",
" )\n",
" experience: List[Experience] = Field(description=\"The candidate's experience\")\n",
" education: List[Education] = Field(description=\"The candidate's education\")\n",
" technical_skills: TechnicalSkills = Field(\n",
" description=\"The candidate's technical skills\"\n",
" )\n",
" key_accomplishments: str = Field(\n",
" description=\"Summarize the candidates highest achievements.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Dr. Rachel Zhang',\n",
" 'email': 'rachel.zhang@email.com',\n",
" 'links': ['linkedin.com/in/rachelzhang',\n",
" 'github.com/rzhang-ai',\n",
" 'scholar.google.com/rachelzhang'],\n",
" 'experience': [{'company': 'DeepMind',\n",
" 'title': 'Senior Research Scientist',\n",
" 'description': '- Lead researcher on large-scale multi-task learning systems, developing novel architectures that improve cross-task generalization by 40%\\n- Pioneered new approach to zero-shot learning using contrastive training, published in NeurIPS 2023\\n- Built and led team of 6 researchers working on foundational ML models\\n- Developed novel regularization techniques for large language models, reducing catastrophic forgetting by 35%',\n",
" 'start_date': '2019',\n",
" 'end_date': 'Present'},\n",
" {'company': 'Google Research',\n",
" 'title': 'Research Scientist',\n",
" 'description': '- Developed probabilistic frameworks for robust ML, published in ICML 2018\\n- Created novel attention mechanisms for computer vision models, improving accuracy by 25%\\n- Led collaboration with Google Brain team on efficient training methods for transformer models\\n- Mentored 4 PhD interns and collaborated with academic institutions',\n",
" 'start_date': '2015',\n",
" 'end_date': '2019'},\n",
" {'company': 'Columbia University',\n",
" 'title': 'Research Assistant Professor',\n",
" 'description': '- Published seminal work on Bayesian optimization methods (cited 1000+ times)\\n- Taught graduate-level courses in Machine Learning and Statistical Learning Theory\\n- Supervised 5 PhD students and 3 MSc students\\n- Secured $500K in research grants for probabilistic ML research',\n",
" 'start_date': '2011',\n",
" 'end_date': '2015'}],\n",
" 'education': [{'institution': 'Columbia University',\n",
" 'degree': 'Ph.D. in Computer Science',\n",
" 'start_date': '2007',\n",
" 'end_date': '2011'},\n",
" {'institution': 'Stanford University',\n",
" 'degree': 'M.S. in Computer Science',\n",
" 'start_date': '2005',\n",
" 'end_date': '2007'}],\n",
" 'technical_skills': {'programming_languages': ['Python',\n",
" 'C++',\n",
" 'Julia',\n",
" 'CUDA'],\n",
" 'frameworks': ['PyTorch', 'TensorFlow', 'JAX', 'Ray'],\n",
" 'skills': ['Deep Learning',\n",
" 'Reinforcement Learning',\n",
" 'Probabilistic Models',\n",
" 'Multi-Task Learning',\n",
" 'Zero-Shot Learning',\n",
" 'Neural Architecture Search']},\n",
" 'key_accomplishments': 'AI researcher with 12+ years of experience spanning classical machine learning, deep learning, and probabilistic modeling. Led groundbreaking research in reinforcement learning, generative models, and multi-task learning. Published 25+ papers in top-tier conferences (NeurIPS, ICML, ICLR). Strong track record of transitioning theoretical advances into practical applications in both academic and industrial settings.'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.data_schema = Resume\n",
"resume = agent.extract(\"./data/resumes/ai_researcher.pdf\")\n",
"resume.data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Finalizing the schema\n",
"\n",
"This is great! We have extracted a lot of key information from the resume that is well-typed and can be used downstream for further processing. Until now, this data is ephemeral and will be lost if we close the session. Let us save the state of our extraction and use it to extract data from multiple resumes. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"agent.save()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'type': 'object',\n",
" '$defs': {'Education': {'type': 'object',\n",
" 'title': 'Education',\n",
" 'required': ['institution', 'degree', 'start_date', 'end_date'],\n",
" 'properties': {'degree': {'type': 'string',\n",
" 'title': 'Degree',\n",
" 'description': 'The degree of the candidate'},\n",
" 'end_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
" 'title': 'End Date',\n",
" 'description': \"The end date of the candidate's education\"},\n",
" 'start_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
" 'title': 'Start Date',\n",
" 'description': \"The start date of the candidate's education\"},\n",
" 'institution': {'type': 'string',\n",
" 'title': 'Institution',\n",
" 'description': 'The institution of the candidate'}},\n",
" 'additionalProperties': False},\n",
" 'Experience': {'type': 'object',\n",
" 'title': 'Experience',\n",
" 'required': ['company', 'title', 'description', 'start_date', 'end_date'],\n",
" 'properties': {'title': {'type': 'string',\n",
" 'title': 'Title',\n",
" 'description': 'The title of the candidate'},\n",
" 'company': {'type': 'string',\n",
" 'title': 'Company',\n",
" 'description': 'The name of the company'},\n",
" 'end_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
" 'title': 'End Date',\n",
" 'description': \"The end date of the candidate's experience\"},\n",
" 'start_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
" 'title': 'Start Date',\n",
" 'description': \"The start date of the candidate's experience\"},\n",
" 'description': {'anyOf': [{'type': 'string'}, {'type': 'null'}],\n",
" 'title': 'Description',\n",
" 'description': \"The description of the candidate's experience\"}},\n",
" 'additionalProperties': False},\n",
" 'TechnicalSkills': {'type': 'object',\n",
" 'title': 'TechnicalSkills',\n",
" 'required': ['programming_languages', 'frameworks', 'skills'],\n",
" 'properties': {'skills': {'type': 'array',\n",
" 'items': {'type': 'string'},\n",
" 'title': 'Skills',\n",
" 'description': 'Other general skills the candidate is proficient in, e.g. Data Engineering, Machine Learning, etc.'},\n",
" 'frameworks': {'type': 'array',\n",
" 'items': {'type': 'string'},\n",
" 'title': 'Frameworks',\n",
" 'description': 'The tools/frameworks the candidate is proficient in, e.g. React, Django, PyTorch, etc.'},\n",
" 'programming_languages': {'type': 'array',\n",
" 'items': {'type': 'string'},\n",
" 'title': 'Programming Languages',\n",
" 'description': 'The programming languages the candidate is proficient in.'}},\n",
" 'additionalProperties': False}},\n",
" 'title': 'Resume',\n",
" 'required': ['name',\n",
" 'email',\n",
" 'links',\n",
" 'experience',\n",
" 'education',\n",
" 'technical_skills',\n",
" 'key_accomplishments'],\n",
" 'properties': {'name': {'type': 'string',\n",
" 'title': 'Name',\n",
" 'description': 'The name of the candidate'},\n",
" 'email': {'type': 'string',\n",
" 'title': 'Email',\n",
" 'description': 'The email address of the candidate'},\n",
" 'links': {'type': 'array',\n",
" 'items': {'type': 'string'},\n",
" 'title': 'Links',\n",
" 'description': \"The links to the candidate's social media profiles\"},\n",
" 'education': {'type': 'array',\n",
" 'items': {'$ref': '#/$defs/Education'},\n",
" 'title': 'Education',\n",
" 'description': \"The candidate's education\"},\n",
" 'experience': {'type': 'array',\n",
" 'items': {'$ref': '#/$defs/Experience'},\n",
" 'title': 'Experience',\n",
" 'description': \"The candidate's experience\"},\n",
" 'technical_skills': {'type': 'object',\n",
" 'title': 'TechnicalSkills',\n",
" 'required': ['programming_languages', 'frameworks', 'skills'],\n",
" 'properties': {'skills': {'type': 'array',\n",
" 'items': {'type': 'string'},\n",
" 'title': 'Skills',\n",
" 'description': 'Other general skills the candidate is proficient in, e.g. Data Engineering, Machine Learning, etc.'},\n",
" 'frameworks': {'type': 'array',\n",
" 'items': {'type': 'string'},\n",
" 'title': 'Frameworks',\n",
" 'description': 'The tools/frameworks the candidate is proficient in, e.g. React, Django, PyTorch, etc.'},\n",
" 'programming_languages': {'type': 'array',\n",
" 'items': {'type': 'string'},\n",
" 'title': 'Programming Languages',\n",
" 'description': 'The programming languages the candidate is proficient in.'}},\n",
" 'description': \"The candidate's technical skills\",\n",
" 'additionalProperties': False},\n",
" 'key_accomplishments': {'type': 'string',\n",
" 'title': 'Key Accomplishments',\n",
" 'description': 'Summarize the candidates highest achievements.'}},\n",
" 'additionalProperties': False}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent = llama_extract.get_agent(\"resume-screening\")\n",
"agent.data_schema # Latest schema should be returned"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Queueing extractions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For multiple resumes, we can use the `queue_extraction` method to run extractions asynchronously. This is ideal for processing batch extraction jobs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Uploading files: 100%|██████████| 3/3 [00:01<00:00, 2.29it/s]\n",
"Creating extraction jobs: 100%|██████████| 3/3 [00:04<00:00, 1.61s/it]\n"
]
}
],
"source": [
"import os\n",
"\n",
"# All resumes in the data/resumes directory\n",
"resumes = []\n",
"\n",
"with os.scandir(\"./data/resumes\") as entries:\n",
" for entry in entries:\n",
" if entry.is_file():\n",
" resumes.append(entry.path)\n",
"\n",
"jobs = await agent.queue_extraction(resumes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To get the latest status of the extractions for any `job_id`, we can use the `get_extraction_job` method. \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<StatusEnum.PENDING: 'PENDING'>,\n",
" <StatusEnum.PENDING: 'PENDING'>,\n",
" <StatusEnum.PENDING: 'PENDING'>]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[agent.get_extraction_job(job_id=job.id).status for job in jobs]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We notice that all extraction runs are in a PENDING state. We can check back again to see if the extractions have completed. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<StatusEnum.SUCCESS: 'SUCCESS'>,\n",
" <StatusEnum.SUCCESS: 'SUCCESS'>,\n",
" <StatusEnum.SUCCESS: 'SUCCESS'>]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[agent.get_extraction_job(job_id=job.id).status for job in jobs]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Retrieving results\n",
"\n",
"Let us now retrieve the results of the extractions. If the status of the extraction is `SUCCESS`, we can retrieve the data from the `data` field. In case there are errors (status = `ERROR`), we can retrieve the error message from the `error` field. \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"for job in jobs:\n",
" extract_run = agent.get_extraction_run_for_job(job.id)\n",
" if extract_run.status == \"SUCCESS\":\n",
" results.append(extract_run.data)\n",
" else:\n",
" print(f\"Extraction status for job {job.id}: {extract_run.status}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Dr. Rachel Zhang, Ph.D.',\n",
" 'email': 'rachel.zhang@email.com',\n",
" 'links': ['linkedin.com/in/rachelzhang',\n",
" 'github.com/rzhang-ai',\n",
" 'scholar.google.com/rachelzhang'],\n",
" 'experience': [{'company': 'DeepMind',\n",
" 'title': 'Senior Research Scientist',\n",
" 'description': '- Lead researcher on large-scale multi-task learning systems, developing novel architectures that improve cross-task generalization by 40%\\n- Pioneered new approach to zero-shot learning using contrastive training, published in NeurIPS 2023\\n- Built and led team of 6 researchers working on foundational ML models\\n- Developed novel regularization techniques for large language models, reducing catastrophic forgetting by 35%',\n",
" 'start_date': '2019',\n",
" 'end_date': 'Present'},\n",
" {'company': 'Google Research',\n",
" 'title': 'Research Scientist',\n",
" 'description': '- Developed probabilistic frameworks for robust ML, published in ICML 2018\\n- Created novel attention mechanisms for computer vision models, improving accuracy by 25%\\n- Led collaboration with Google Brain team on efficient training methods for transformer models\\n- Mentored 4 PhD interns and collaborated with academic institutions',\n",
" 'start_date': '2015',\n",
" 'end_date': '2019'},\n",
" {'company': 'Columbia University',\n",
" 'title': 'Research Assistant Professor',\n",
" 'description': '- Published seminal work on Bayesian optimization methods (cited 1000+ times)\\n- Taught graduate-level courses in Machine Learning and Statistical Learning Theory\\n- Supervised 5 PhD students and 3 MSc students\\n- Secured $500K in research grants for probabilistic ML research',\n",
" 'start_date': '2011',\n",
" 'end_date': '2015'}],\n",
" 'education': [{'institution': 'Columbia University',\n",
" 'degree': 'Ph.D. in Computer Science',\n",
" 'start_date': '2007',\n",
" 'end_date': '2011'},\n",
" {'institution': 'Stanford University',\n",
" 'degree': 'M.S. in Computer Science',\n",
" 'start_date': '2005',\n",
" 'end_date': '2007'}],\n",
" 'technical_skills': {'programming_languages': ['Python',\n",
" 'C++',\n",
" 'Julia',\n",
" 'CUDA'],\n",
" 'frameworks': ['PyTorch', 'TensorFlow', 'JAX', 'Ray'],\n",
" 'skills': ['Deep Learning',\n",
" 'Reinforcement Learning',\n",
" 'Probabilistic Models',\n",
" 'Multi-Task Learning',\n",
" 'Zero-Shot Learning',\n",
" 'Neural Architecture Search']},\n",
" 'key_accomplishments': 'AI researcher with 12+ years of experience spanning classical machine learning, deep learning, and probabilistic modeling. Led groundbreaking research in reinforcement learning, generative models, and multi-task learning. Published 25+ papers in top-tier conferences (NeurIPS, ICML, ICLR). Strong track record of transitioning theoretical advances into practical applications in both academic and industrial settings.'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Alex Park',\n",
" 'email': 'alex park@email.com',\n",
" 'links': ['linkedin.com/in/alexpark'],\n",
" 'experience': [{'company': 'SearchTech AI',\n",
" 'title': 'Senior Machine Learning Engineer',\n",
" 'description': 'Led development of next-generation learning-to-rank system using BER\\nArchitected and deployed real-time personalization system processing 10\\nIncreasing CTR by 15%\\nImproving search relevance by 24% (NDCG@10)',\n",
" 'start_date': None,\n",
" 'end_date': None},\n",
" {'company': 'Commerce Corp',\n",
" 'title': '',\n",
" 'description': 'Developed semantic search system using transformer models and approximate nearest neighbors, reducing null search results by 35%',\n",
" 'start_date': None,\n",
" 'end_date': None},\n",
" {'company': 'Tech Solutions Inc',\n",
" 'title': 'Machine Learning Engineer',\n",
" 'description': 'Implemented query understanding pipeline',\n",
" 'start_date': None,\n",
" 'end_date': None},\n",
" {'company': '',\n",
" 'title': 'Software Engineer',\n",
" 'description': 'Built data pipelines and Flasticsearch',\n",
" 'start_date': None,\n",
" 'end_date': None}],\n",
" 'education': [{'institution': 'University of California, Berkeley',\n",
" 'degree': 'M.S. Computer Science',\n",
" 'start_date': None,\n",
" 'end_date': None},\n",
" {'institution': 'University of California, Berkeley',\n",
" 'degree': 'B.S. Computer Science',\n",
" 'start_date': None,\n",
" 'end_date': None},\n",
" {'institution': 'University of Washington',\n",
" 'degree': '',\n",
" 'start_date': None,\n",
" 'end_date': None}],\n",
" 'technical_skills': {'programming_languages': ['Python',\n",
" 'SQL',\n",
" 'Java',\n",
" 'Scala',\n",
" 'Shell Scripting'],\n",
" 'frameworks': ['PyTorch',\n",
" 'TensorFlow',\n",
" 'Scikit-learn',\n",
" 'Elasticsearch',\n",
" 'Solr',\n",
" 'Lucene',\n",
" 'BERT',\n",
" 'Word2Vec',\n",
" 'FastAI',\n",
" 'BM25',\n",
" 'FAISS',\n",
" 'Docker',\n",
" 'Kubernetes'],\n",
" 'skills': []},\n",
" 'key_accomplishments': 'Machine Learning Engineer with 5 years of experience building and deploying large-scale search and relevance systems: Specialized in developing personalized search algorithms, learning-to-rank models; and recommendation systems. Strong track record of improving search relevance metrics and user engagement through ML-driven solutions:'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'name': 'Sarah Chen',\n",
" 'email': 'sarah.chen@email.com',\n",
" 'links': [],\n",
" 'experience': [{'company': 'TechCorp Solutions',\n",
" 'title': 'Senior Software Architect',\n",
" 'description': '- Led architectural design and implementation of a cloud-native platform serving 2M+ users\\n- Established architectural guidelines and best practices adopted across 12 development teams\\n- Reduced system latency by 40% through implementation of event-driven architecture\\n- Mentored 15+ senior developers in cloud-native development practices',\n",
" 'start_date': '2020',\n",
" 'end_date': 'Present'},\n",
" {'company': 'DataFlow Systems',\n",
" 'title': 'Lead Software Engineer',\n",
" 'description': '- Architected and led development of distributed data processing platform handling 5TB daily\\n- Designed microservices architecture reducing deployment time by 65%\\n- Led migration of legacy monolith to cloud-native architecture\\n- Managed team of 8 engineers across 3 international locations',\n",
" 'start_date': '2016',\n",
" 'end_date': '2020'},\n",
" {'company': 'InnovateTech',\n",
" 'title': 'Senior Software Engineer',\n",
" 'description': '- Developed high-performance trading platform processing 100K transactions per second\\n- Implemented real-time analytics engine reducing processing latency by 75%\\n- Led adoption of container orchestration reducing deployment costs by 35%',\n",
" 'start_date': '2013',\n",
" 'end_date': '2016'}],\n",
" 'education': [{'institution': 'Stanford University',\n",
" 'degree': 'Master of Science in Computer Science',\n",
" 'start_date': None,\n",
" 'end_date': '2013'},\n",
" {'institution': 'University of California, Berkeley',\n",
" 'degree': 'Bachelor of Science in Computer Engineering',\n",
" 'start_date': None,\n",
" 'end_date': '2011'}],\n",
" 'technical_skills': {'programming_languages': ['Java',\n",
" 'Python',\n",
" 'Go',\n",
" 'JavaScript/TypeScript'],\n",
" 'frameworks': [],\n",
" 'skills': ['Architecture & Design',\n",
" 'Microservices',\n",
" 'Event-Driven Architecture',\n",
" 'Domain-Driven Design',\n",
" 'REST APIs',\n",
" 'Cloud Platforms',\n",
" 'AWS (Advanced)',\n",
" 'Azure',\n",
" 'Google Cloud Platform']},\n",
" 'key_accomplishments': '- Co-inventor on three patents for distributed systems architecture\\n- Published paper on \"Scalable Microservices Architecture\" at IEEE Cloud Computing Conference 2022\\n- Keynote Speaker, CloudCon 2023: \"Future of Cloud-Native Architecture\"\\n- Regular presenter at local tech meetups and conferences'}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results[2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Congratulations! You now have an agent that can extract structured data from resumes. \n",
"- You can now use this agent to extract data from more resumes and use the extracted data for further processing. \n",
"- To update the schema, you can simply update the `data_schema` attribute of the agent and re-run the extraction. \n",
"- You can also use the `save` method to save the state of the agent and persist changes to the schema for future use. \n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
-3
View File
@@ -1,3 +0,0 @@
from llama_extract.extract import LlamaExtract, ExtractionAgent
# Names re-exported as the package's public API.
__all__ = ["LlamaExtract", "ExtractionAgent"]
-655
View File
@@ -1,655 +0,0 @@
import asyncio
import os
import time
from io import BufferedIOBase, BufferedReader, BytesIO
from pathlib import Path
from typing import List, Optional, Type, Union, Tuple, Coroutine, Any, TypeVar
import warnings
import httpx
from pydantic import BaseModel
from llama_cloud import (
ExtractAgent as CloudExtractAgent,
ExtractConfig,
ExtractJob,
ExtractJobCreate,
ExtractRun,
File,
ExtractMode,
StatusEnum,
Project,
ExtractTarget,
LlamaExtractSettings,
)
from llama_cloud.client import AsyncLlamaCloud
from llama_extract.utils import JSONObjectType, augment_async_errors
from llama_index.core.schema import BaseComponent
from llama_index.core.async_utils import run_jobs
from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.constants import DEFAULT_BASE_URL
from concurrent.futures import ThreadPoolExecutor
# Result type of the coroutines executed through `_run_in_thread`.
T = TypeVar("T")
# Accepted file inputs: a path (str or Path), raw bytes, or a binary buffer.
FileInput = Union[str, Path, bytes, BufferedIOBase]
# A schema is given either as a JSON-schema dict or as a Pydantic model class.
SchemaInput = Union[JSONObjectType, Type[BaseModel]]
# Config used by `LlamaExtract.create_agent` when the caller passes none.
DEFAULT_EXTRACT_CONFIG = ExtractConfig(
extraction_target=ExtractTarget.PER_DOC,
extraction_mode=ExtractMode.ACCURATE,
)
class ExtractionAgent:
    """A single extraction agent and the extraction operations available on it.

    Wraps a ``CloudExtractAgent`` record together with the shared
    ``AsyncLlamaCloud`` client. A schema or config assigned through the
    ``data_schema`` / ``config`` setters is kept locally and sent as a per-job
    override; call ``save`` to persist it on the agent.
    """

    def __init__(
        self,
        client: AsyncLlamaCloud,
        agent: CloudExtractAgent,
        project_id: Optional[str] = None,
        organization_id: Optional[str] = None,
        check_interval: int = 1,
        max_timeout: int = 2000,
        num_workers: int = 4,
        show_progress: bool = True,
        verbose: bool = False,
    ):
        """Bind an agent record to a client.

        Args:
            client: Async LlamaCloud client used for every API call.
            agent: The agent record this object operates on.
            project_id: Project that uploaded files are attached to.
            organization_id: Organization identifier (stored only).
            check_interval: Seconds slept between two job status polls.
            max_timeout: Seconds a job may stay pending before polling raises.
            num_workers: Worker count handed to ``run_jobs``.
            show_progress: Whether ``run_jobs`` displays progress bars.
            verbose: Whether to print polling dots and queued-job messages.
        """
        self._client = client
        self._agent = agent
        self._project_id = project_id
        self._organization_id = organization_id
        self.check_interval = check_interval
        self.max_timeout = max_timeout
        self.num_workers = num_workers
        self.show_progress = show_progress
        self._verbose = verbose
        # Local, not-yet-saved overrides; None means "use the agent's value".
        self._data_schema: Union[JSONObjectType, None] = None
        self._config: Union[ExtractConfig, None] = None
        self._thread_pool = ThreadPoolExecutor(
            max_workers=min(10, (os.cpu_count() or 1) + 4)
        )

    def _run_in_thread(self, coro: Coroutine[Any, Any, T]) -> T:
        """Run ``coro`` to completion on a pool thread and return its result.

        The coroutine is driven by ``asyncio.run`` on a worker thread so that
        synchronous callers are not affected by an event loop running in the
        calling thread. While it runs, the shared client's ``httpx_client`` is
        replaced by a fresh ``httpx.AsyncClient`` with the same timeout, and
        the original client is restored afterwards.
        """

        def run_coro() -> T:
            async def wrapped_coro() -> T:
                async with httpx.AsyncClient(
                    timeout=self._client._client_wrapper.httpx_client.timeout,
                ) as client:
                    original_client = self._client._client_wrapper.httpx_client
                    self._client._client_wrapper.httpx_client = client
                    try:
                        return await coro
                    finally:
                        self._client._client_wrapper.httpx_client = original_client

            return asyncio.run(wrapped_coro())

        return self._thread_pool.submit(run_coro).result()

    @property
    def id(self) -> str:
        """Identifier of the wrapped agent."""
        return self._agent.id

    @property
    def name(self) -> str:
        """Name of the wrapped agent."""
        return self._agent.name

    @property
    def data_schema(self) -> dict:
        """The locally assigned schema if there is one, else the agent's schema."""
        return self._agent.data_schema if not self._data_schema else self._data_schema

    @data_schema.setter
    def data_schema(self, data_schema: SchemaInput) -> None:
        """Validate ``data_schema`` through the API and keep it as a local override.

        Raises:
            ValueError: If ``data_schema`` is neither a dict nor a Pydantic
                model class.
        """
        processed_schema: JSONObjectType
        if isinstance(data_schema, dict):
            # TODO: if we expose a get_validated JSON schema method, we can use it here
            processed_schema = data_schema  # type: ignore
        elif isinstance(data_schema, type) and issubclass(data_schema, BaseModel):
            processed_schema = data_schema.model_json_schema()
        else:
            raise ValueError(
                "data_schema must be either a dictionary or a Pydantic model"
            )
        validated_schema = self._run_in_thread(
            self._client.llama_extract.validate_extraction_schema(
                data_schema=processed_schema
            )
        )
        self._data_schema = validated_schema.data_schema

    @property
    def config(self) -> ExtractConfig:
        """The locally assigned config if there is one, else the agent's config."""
        return self._agent.config if not self._config else self._config

    @config.setter
    def config(self, config: ExtractConfig) -> None:
        """Keep ``config`` as a local override (not persisted until ``save``)."""
        self._config = config

    @staticmethod
    def _normalize_files(
        files: Union[FileInput, List[FileInput]]
    ) -> Tuple[List[FileInput], bool]:
        """Return ``(file_list, single_file)`` for a single input or a list."""
        if isinstance(files, list):
            return files, False
        return [files], True

    async def _upload_file(self, file_input: FileInput) -> File:
        """Upload one file to the project and return the created ``File``.

        Args:
            file_input: A path (``str``/``Path``), raw ``bytes``, or an open
                binary buffer.

        Raises:
            ValueError: If ``file_input`` is none of the supported types.
        """
        # Only a handle opened here (from a path) is ours to close. A buffer
        # supplied by the caller stays open so the caller keeps control of it.
        owns_handle = False
        if isinstance(file_input, BufferedIOBase):
            upload_file = file_input
        elif isinstance(file_input, bytes):
            upload_file = BytesIO(file_input)
        elif isinstance(file_input, (str, Path)):
            upload_file = open(file_input, "rb")
            owns_handle = True
        else:
            raise ValueError(
                "file_input must be either a file path string, file bytes, or buffer object"
            )
        try:
            return await self._client.files.upload_file(
                project_id=self._project_id, upload_file=upload_file
            )
        finally:
            if owns_handle:
                upload_file.close()

    async def _upload_files(self, files: List[FileInput]) -> List[File]:
        """Upload every entry of ``files`` through ``run_jobs``."""
        upload_tasks = [self._upload_file(file) for file in files]
        with augment_async_errors():
            return await run_jobs(
                upload_tasks,
                workers=self.num_workers,
                desc="Uploading files",
                show_progress=self.show_progress,
            )

    def _report_queued(self, files: List[FileInput], jobs: List[Any]) -> None:
        """In verbose mode, print one line per file with the id of its job."""
        if not self._verbose:
            return
        for file, job in zip(files, jobs):
            file_repr = (
                str(file) if isinstance(file, (str, Path)) else "<bytes/buffer>"
            )
            print(
                f"Queued file extraction for file {file_repr} under job_id {job.id}"
            )

    async def _wait_for_job_result(self, job_id: str) -> Optional[ExtractRun]:
        """Poll a job until it leaves the pending state, then return its run.

        Sleeps ``check_interval`` seconds before each status check. A failed
        job emits a warning and its run is still fetched and returned.

        Raises:
            Exception: If the job is still pending after ``max_timeout`` seconds.
        """
        start = time.perf_counter()
        tries = 0
        while True:
            await asyncio.sleep(self.check_interval)
            tries += 1
            job = await self._client.llama_extract.get_job(
                job_id=job_id,
            )
            if job.status == StatusEnum.SUCCESS:
                return await self._client.llama_extract.get_run_by_job_id(
                    job_id=job_id,
                )
            elif job.status == StatusEnum.PENDING:
                end = time.perf_counter()
                if end - start > self.max_timeout:
                    raise Exception(f"Timeout while extracting the file: {job_id}")
                # Progress indicator: one dot every tenth poll.
                if self._verbose and tries % 10 == 0:
                    print(".", end="", flush=True)
                continue
            else:
                warnings.warn(
                    f"Failure in job: {job_id}, status: {job.status}, error: {job.error}"
                )
                return await self._client.llama_extract.get_run_by_job_id(
                    job_id=job_id,
                )

    def save(self) -> None:
        """Persist the agent's current schema and config through the API.

        The wrapped agent record is replaced by the one returned from the
        update call; nothing is returned to the caller.
        """
        self._agent = self._run_in_thread(
            self._client.llama_extract.update_extraction_agent(
                extraction_agent_id=self.id,
                data_schema=self.data_schema,
                config=self.config,
            )
        )

    async def _queue_extraction_test(
        self,
        files: Union[FileInput, List[FileInput]],
        extract_settings: LlamaExtractSettings,
    ) -> Union[ExtractRun, List[ExtractRun]]:
        """Run extraction via the ``run_job_test_user`` endpoint and wait for it.

        Unlike ``queue_extraction`` this waits for every job to finish, so the
        returned objects are extraction runs rather than queued jobs.

        Args:
            files: One file input or a list of them.
            extract_settings: Settings forwarded to ``run_job_test_user``.

        Returns:
            A single run when one file was given, otherwise a list of runs.
        """
        file_list, single_file = self._normalize_files(files)
        uploaded_files = await self._upload_files(file_list)

        async def run_job(file: File) -> Optional[ExtractRun]:
            job_queued = await self._client.llama_extract.run_job_test_user(
                job_create=ExtractJobCreate(
                    extraction_agent_id=self.id,
                    file_id=file.id,
                    data_schema_override=self.data_schema,
                    config_override=self.config,
                ),
                extract_settings=extract_settings,
            )
            return await self._wait_for_job_result(job_queued.id)

        job_tasks = [run_job(file) for file in uploaded_files]
        with augment_async_errors():
            extract_runs = await run_jobs(
                job_tasks,
                workers=self.num_workers,
                desc="Running extraction jobs",
                show_progress=self.show_progress,
            )
        self._report_queued(file_list, extract_runs)
        return extract_runs[0] if single_file else extract_runs

    async def queue_extraction(
        self,
        files: Union[FileInput, List[FileInput]],
    ) -> Union[ExtractJob, List[ExtractJob]]:
        """Queue one or more files for extraction without waiting for results.

        Files are uploaded first, then one extraction job is created per
        uploaded file using the agent's current schema and config as overrides.

        Args:
            files (Union[FileInput, List[FileInput]]): The files to extract

        Returns:
            Union[ExtractJob, List[ExtractJob]]: The queued extraction jobs
                (a single job when a single file was given)
        """
        file_list, single_file = self._normalize_files(files)
        uploaded_files = await self._upload_files(file_list)
        job_tasks = [
            self._client.llama_extract.run_job(
                request=ExtractJobCreate(
                    extraction_agent_id=self.id,
                    file_id=file.id,
                    data_schema_override=self.data_schema,
                    config_override=self.config,
                ),
            )
            for file in uploaded_files
        ]
        with augment_async_errors():
            extract_jobs = await run_jobs(
                job_tasks,
                workers=self.num_workers,
                desc="Creating extraction jobs",
                show_progress=self.show_progress,
            )
        self._report_queued(file_list, extract_jobs)
        return extract_jobs[0] if single_file else extract_jobs

    async def aextract(
        self, files: Union[FileInput, List[FileInput]]
    ) -> Union[ExtractRun, List[ExtractRun]]:
        """Asynchronously extract data from one or more files using this agent.

        Args:
            files (Union[FileInput, List[FileInput]]): The files to extract

        Returns:
            Union[ExtractRun, List[ExtractRun]]: The extraction results
                (a single run when a single file was given)
        """
        file_list, single_file = self._normalize_files(files)
        # Queue all files for extraction; a list in always yields a list out.
        jobs = await self.queue_extraction(file_list)
        # Wait for all results concurrently
        result_tasks = [self._wait_for_job_result(job.id) for job in jobs]
        with augment_async_errors():
            results = await run_jobs(
                result_tasks,
                workers=self.num_workers,
                desc="Extracting files",
                show_progress=self.show_progress,
            )
        return results[0] if single_file else results

    def extract(
        self, files: Union[FileInput, List[FileInput]]
    ) -> Union[ExtractRun, List[ExtractRun]]:
        """Synchronously extract data from one or more files using this agent.

        Args:
            files (Union[FileInput, List[FileInput]]): The files to extract

        Returns:
            Union[ExtractRun, List[ExtractRun]]: The extraction results
        """
        return self._run_in_thread(self.aextract(files))

    def get_extraction_job(self, job_id: str) -> ExtractJob:
        """Get the extraction job for a given job_id.

        Args:
            job_id (str): The job_id to get the extraction job for

        Returns:
            ExtractJob: The extraction job
        """
        return self._run_in_thread(self._client.llama_extract.get_job(job_id=job_id))

    def get_extraction_run_for_job(self, job_id: str) -> ExtractRun:
        """Get the extraction run for a given job_id.

        Args:
            job_id (str): The job_id to get the extraction run for

        Returns:
            ExtractRun: The extraction run
        """
        return self._run_in_thread(
            self._client.llama_extract.get_run_by_job_id(
                job_id=job_id,
            )
        )

    def list_extraction_runs(self) -> List[ExtractRun]:
        """List extraction runs for the extraction agent.

        Returns:
            List[ExtractRun]: List of extraction runs
        """
        return self._run_in_thread(
            self._client.llama_extract.list_extract_runs(
                extraction_agent_id=self.id,
            )
        )

    def __repr__(self) -> str:
        return f"ExtractionAgent(id={self.id}, name={self.name})"
class LlamaExtract(BaseComponent):
    """Factory class for creating and managing extraction agents.

    Holds the API credentials and the polling/concurrency settings, and hands
    them to every ``ExtractionAgent`` it creates or looks up.
    """

    api_key: str = Field(description="The API key for the LlamaExtract API.")
    base_url: str = Field(description="The base URL of the LlamaExtract API.")
    check_interval: int = Field(
        default=1,
        description="The interval in seconds to check if the extraction is done.",
    )
    max_timeout: int = Field(
        default=2000,
        description="The maximum timeout in seconds to wait for the extraction to finish.",
    )
    num_workers: int = Field(
        default=4,
        gt=0,
        lt=10,
        description="The number of workers to use sending API requests for extraction.",
    )
    show_progress: bool = Field(
        default=True, description="Show progress when extracting multiple files."
    )
    verbose: bool = Field(
        default=False, description="Show verbose output when extracting files."
    )
    _async_client: AsyncLlamaCloud = PrivateAttr()
    _thread_pool: ThreadPoolExecutor = PrivateAttr()
    _project_id: Optional[str] = PrivateAttr()
    _organization_id: Optional[str] = PrivateAttr()

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        check_interval: int = 1,
        max_timeout: int = 2000,
        num_workers: int = 4,
        show_progress: bool = True,
        project_id: Optional[str] = None,
        organization_id: Optional[str] = None,
        verbose: bool = False,
    ):
        """Configure the client and resolve the project to work in.

        Args:
            api_key: API key; falls back to ``LLAMA_CLOUD_API_KEY``.
            base_url: API base URL; falls back to ``LLAMA_CLOUD_BASE_URL``,
                then to ``DEFAULT_BASE_URL``.
            check_interval: Passed on to created agents.
            max_timeout: Passed on to created agents.
            num_workers: Passed on to created agents.
            show_progress: Passed on to created agents.
            project_id: Project to use; falls back to
                ``LLAMA_CLOUD_PROJECT_ID``, then to the account's default
                project (fetched from the API).
            organization_id: Organization to create agents under.
            verbose: Passed on to created agents.

        Raises:
            ValueError: If no API key is available, or no project id was given
                and the account has no default project.
        """
        if not api_key:
            api_key = os.getenv("LLAMA_CLOUD_API_KEY", None)
        # Covers both an unset and an empty key, so the failure surfaces here
        # rather than on the first API request.
        if not api_key:
            raise ValueError("The API key is required.")
        if not base_url:
            base_url = os.getenv("LLAMA_CLOUD_BASE_URL", None) or DEFAULT_BASE_URL
        super().__init__(
            api_key=api_key,
            base_url=base_url,
            check_interval=check_interval,
            max_timeout=max_timeout,
            num_workers=num_workers,
            show_progress=show_progress,
            verbose=verbose,
        )
        self._async_client = AsyncLlamaCloud(
            token=self.api_key, base_url=self.base_url, timeout=None
        )
        self._thread_pool = ThreadPoolExecutor(
            max_workers=min(10, (os.cpu_count() or 1) + 4)
        )
        # Fetch default project id if not provided
        if not project_id:
            project_id = os.getenv("LLAMA_CLOUD_PROJECT_ID", None)
            if not project_id:
                print("No project_id provided, fetching default project.")
                projects: List[Project] = self._run_in_thread(
                    self._async_client.projects.list_projects()
                )
                default_project = [p for p in projects if p.is_default]
                if not default_project:
                    raise ValueError(
                        "No default project found. Please provide a project_id."
                    )
                project_id = default_project[0].id
        self._project_id = project_id
        self._organization_id = organization_id

    def _run_in_thread(self, coro: Coroutine[Any, Any, T]) -> T:
        """Run ``coro`` to completion on a pool thread and return its result.

        The coroutine is driven by ``asyncio.run`` on a worker thread. While
        it runs, the async client's ``httpx_client`` is replaced by a fresh
        ``httpx.AsyncClient`` with the same timeout and restored afterwards.
        """

        def run_coro() -> T:
            # Create a new client for this thread
            async def wrapped_coro() -> T:
                async with httpx.AsyncClient(
                    timeout=self._async_client._client_wrapper.httpx_client.timeout,
                ) as client:
                    # Replace the client in the coro's context
                    original_client = self._async_client._client_wrapper.httpx_client
                    self._async_client._client_wrapper.httpx_client = client
                    try:
                        return await coro
                    finally:
                        self._async_client._client_wrapper.httpx_client = (
                            original_client
                        )

            return asyncio.run(wrapped_coro())

        return self._thread_pool.submit(run_coro).result()

    def _wrap_agent(self, agent: CloudExtractAgent) -> ExtractionAgent:
        """Build an ``ExtractionAgent`` for ``agent`` carrying this factory's settings."""
        return ExtractionAgent(
            client=self._async_client,
            agent=agent,
            project_id=self._project_id,
            organization_id=self._organization_id,
            check_interval=self.check_interval,
            max_timeout=self.max_timeout,
            num_workers=self.num_workers,
            show_progress=self.show_progress,
            verbose=self.verbose,
        )

    def create_agent(
        self,
        name: str,
        data_schema: SchemaInput,
        config: Optional[ExtractConfig] = None,
    ) -> ExtractionAgent:
        """Create a new extraction agent.

        Args:
            name (str): The name of the extraction agent
            data_schema (SchemaInput): The data schema for the extraction agent
            config (Optional[ExtractConfig]): The extraction config for the
                agent; ``DEFAULT_EXTRACT_CONFIG`` is used when omitted

        Returns:
            ExtractionAgent: The created extraction agent

        Raises:
            ValueError: If ``data_schema`` is neither a dictionary nor a
                Pydantic model class.
        """
        if isinstance(data_schema, dict):
            schema = data_schema
        # The isinstance(type) guard keeps issubclass from raising TypeError
        # on non-class input, so such input reaches the ValueError below.
        elif isinstance(data_schema, type) and issubclass(data_schema, BaseModel):
            schema = data_schema.model_json_schema()
        else:
            raise ValueError(
                "data_schema must be either a dictionary or a Pydantic model"
            )
        agent = self._run_in_thread(
            self._async_client.llama_extract.create_extraction_agent(
                name=name,
                data_schema=schema,
                config=config or DEFAULT_EXTRACT_CONFIG,
                project_id=self._project_id,
                organization_id=self._organization_id,
            )
        )
        return self._wrap_agent(agent)

    def get_agent(
        self,
        name: Optional[str] = None,
        id: Optional[str] = None,
    ) -> ExtractionAgent:
        """Get an extraction agent by name or by extraction agent ID.

        When both are given, a warning is emitted and ``id`` is used.

        Args:
            name (Optional[str]): Name of the agent, looked up in this
                factory's project
            id (Optional[str]): Extraction agent ID

        Returns:
            ExtractionAgent: The extraction agent

        Raises:
            ValueError: If neither ``name`` nor ``id`` is provided.
        """
        if id is not None and name is not None:
            warnings.warn(
                "Both name and extraction_agent_id are provided. Using extraction_agent_id."
            )
        if id:
            agent = self._run_in_thread(
                self._async_client.llama_extract.get_extraction_agent(
                    extraction_agent_id=id,
                )
            )
        elif name:
            agent = self._run_in_thread(
                self._async_client.llama_extract.get_extraction_agent_by_name(
                    name=name,
                    project_id=self._project_id,
                )
            )
        else:
            raise ValueError("Either name or extraction_agent_id must be provided.")
        return self._wrap_agent(agent)

    def list_agents(self) -> List[ExtractionAgent]:
        """List all available extraction agents in this factory's project."""
        agents = self._run_in_thread(
            self._async_client.llama_extract.list_extraction_agents(
                project_id=self._project_id,
            )
        )
        return [self._wrap_agent(agent) for agent in agents]

    def delete_agent(self, agent_id: str) -> None:
        """Delete an extraction agent by ID.

        Args:
            agent_id (str): ID of the extraction agent to delete
        """
        self._run_in_thread(
            self._async_client.llama_extract.delete_extraction_agent(
                extraction_agent_id=agent_id
            )
        )
if __name__ == "__main__":
    # Manual smoke test: extract from a sample PDF with a throwaway agent.
    from dotenv import load_dotenv

    load_dotenv()
    data_dir = Path(__file__).parent.parent / "tests" / "data"
    extractor = LlamaExtract()
    try:
        agent = extractor.get_agent(name="test-agent")
    except Exception:
        # Lookup failed (for example the agent does not exist yet): create it.
        agent = extractor.create_agent(
            "test-agent",
            {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "summary": {"type": "string"},
                },
            },
        )
    try:
        results = agent.extract(data_dir / "slide" / "conocophilips.pdf")
    finally:
        # Always remove the test agent, even when the extraction raises.
        extractor.delete_agent(agent.id)
    print(results)
-36
View File
@@ -1,36 +0,0 @@
from typing import Any, Dict, List, Union, Generator
import asyncio
from llama_index.core.async_utils import asyncio_run
from contextlib import contextmanager
# Asyncio error messages.
# Substring that `augment_async_errors` searches for in a RuntimeError message.
nest_asyncio_err = "cannot be called from a running event loop"
# Message of the RuntimeError that `augment_async_errors` raises in its place.
nest_asyncio_msg = (
"The event loop is already running. "
"Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue."
)
def is_jupyter() -> bool:
    """Tell whether the active IPython shell is a ``ZMQInteractiveShell``.

    Returns:
        bool: True only when IPython is importable and the class name of the
        object returned by ``get_ipython()`` is ``"ZMQInteractiveShell"``.
        False when the import fails or the lookup raises AttributeError.
    """
    try:
        from IPython import get_ipython

        shell_class_name = get_ipython().__class__.__name__
    except (ImportError, AttributeError):
        return False
    return shell_class_name == "ZMQInteractiveShell"
@contextmanager
def augment_async_errors() -> Generator[None, None, None]:
    """Context manager that rewrites nested-event-loop errors with a hint.

    A RuntimeError whose text contains ``nest_asyncio_err`` is replaced by a
    RuntimeError carrying ``nest_asyncio_msg``. Any other exception, including
    an unrelated RuntimeError, propagates unchanged.
    """
    try:
        yield
    except RuntimeError as err:
        if nest_asyncio_err not in str(err):
            # Not the nested-loop failure: re-raise the original as-is.
            raise
        raise RuntimeError(nest_asyncio_msg)
# A JSON value: object, array, string, number, boolean or null.
# Nested container contents are typed as Any rather than recursively.
JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]
# A JSON object: string keys mapped to JSON values.
JSONObjectType = Dict[str, JSONType]
Generated
-4315
View File
File diff suppressed because it is too large Load Diff
-44
View File
@@ -1,44 +0,0 @@
# Build backend: the package is built through poetry-core.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
# mypy is pointed at the llama_extract package only, targeting Python 3.9.
[tool.mypy]
files = ["llama_extract"]
python_version = "3.9"
# Distribution metadata.
[tool.poetry]
name = "llama-extract"
version = "0.1.1"
description = "Structured data extraction from files."
authors = ["Logan Markewich <logan@runllama.ai>", "Neeraj Pradhan <neeraj@llamaindex.ai>"]
license = "MIT"
readme = "README.md"
packages = [{include = "llama_extract"}]
# Runtime dependencies. llama-cloud is pinned to an exact version.
[tool.poetry.dependencies]
python = ">=3.9,<4.0"
llama-index-core = "^0.11.0"
llama-cloud = "0.1.13"
python-dotenv = "^1.0.1"
# Development-only dependencies (tests, notebooks, linting, type checking).
[tool.poetry.group.dev.dependencies]
pytest = "^8.0.0"
ipykernel = "^6.29.0"
pre-commit = "3.2.0"
autoevals = "^0.0.114"
deepdiff = "^8.1.1"
ipython = "^8.12.3"
jupyter = "^1.1.1"
pytest-asyncio = {version = "^0.25.2", python = ">=3.9,<4.0"}
mypy = "^1.14.1"
# pytest-asyncio: strict mode, so async tests must be marked explicitly.
[tool.pytest.ini_options]
asyncio_mode = "strict"
asyncio_default_fixture_loop_scope = "function"
[tool.ruff.format]
line-ending = "auto"
skip-magic-trailing-comma = false
# Only rule W292 is selected for linting.
[tool.ruff.lint]
select = ["W292"]
View File
Binary file not shown.
@@ -1,37 +0,0 @@
{
"receiptNumber": "27215058",
"invoiceNumber": "87B37C90152",
"datePaid": "2024-07-19",
"paymentMethod": {
"type": "visa",
"lastFourDigits": "7267"
},
"merchant": {
"name": "Noisebridge",
"address": {
"street": "272 Capp St",
"city": "San Francisco",
"state": "California",
"postalCode": "94110",
"country": "United States"
},
"phone": "1 6507017829",
"email": "treasurer+stripe@noisebridge.net"
},
"billTo": "noisebridge@seldo.com",
"items": [
{
"description": "$10 / month",
"quantity": 1,
"unitPrice": 10.0,
"amount": 10.0,
"period": {
"start": "2024-07-19",
"end": "2024-08-19"
}
}
],
"subtotal": 10.0,
"total": 10.0,
"amountPaid": 10.0
}
-135
View File
@@ -1,135 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"required": [
"receiptNumber",
"datePaid",
"total",
"items"
],
"properties": {
"receiptNumber": {
"type": "string"
},
"invoiceNumber": {
"type": "string"
},
"datePaid": {
"type": "string",
"format": "date"
},
"paymentMethod": {
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"visa",
"mastercard",
"amex",
"cash",
"other"
]
},
"lastFourDigits": {
"type": "string",
"pattern": "^[0-9]{4}$"
}
}
},
"merchant": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"address": {
"type": "object",
"properties": {
"street": {
"type": "string"
},
"city": {
"type": "string"
},
"state": {
"type": "string"
},
"postalCode": {
"type": "string"
},
"country": {
"type": "string"
}
}
},
"phone": {
"type": "string"
},
"email": {
"type": "string",
"format": "email"
}
}
},
"billTo": {
"type": "string",
"format": "email"
},
"items": {
"type": "array",
"items": {
"type": "object",
"required": [
"description",
"quantity",
"unitPrice",
"amount",
"period"
],
"properties": {
"description": {
"type": "string"
},
"quantity": {
"type": "integer",
"minimum": 1
},
"unitPrice": {
"type": "number",
"minimum": 0
},
"amount": {
"type": "number",
"minimum": 0
},
"period": {
"type": "object",
"properties": {
"start": {
"type": "string",
"format": "date"
},
"end": {
"type": "string",
"format": "date"
}
}
}
}
}
},
"subtotal": {
"type": "number",
"minimum": 0
},
"total": {
"type": "number",
"minimum": 0
},
"amountPaid": {
"type": "number",
"minimum": 0
}
}
}
-200
View File
@@ -1,200 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Resume Schema",
"type": "object",
"required": [
"basics",
"skills",
"experience"
],
"properties": {
"basics": {
"type": "object",
"required": [
"name",
"email"
],
"properties": {
"name": {
"type": "string"
},
"email": {
"type": "string",
"format": "email"
},
"phone": {
"type": "string"
},
"location": {
"type": "object",
"properties": {
"city": {
"type": "string"
},
"region": {
"type": "string"
},
"country": {
"type": "string"
}
}
},
"profiles": {
"type": "array",
"items": {
"type": "object",
"properties": {
"network": {
"type": "string"
},
"url": {
"type": "string",
"format": "uri"
}
}
}
},
"summary": {
"type": "string"
}
}
},
"skills": {
"type": "array",
"items": {
"type": "object",
"properties": {
"category": {
"type": "string"
},
"keywords": {
"type": "array",
"items": {
"type": "string"
}
},
"level": {
"type": "string",
"enum": [
"beginner",
"intermediate",
"advanced",
"expert"
]
}
}
}
},
"experience": {
"type": "array",
"items": {
"type": "object",
"required": [
"company",
"position",
"startDate"
],
"properties": {
"company": {
"type": "string"
},
"position": {
"type": "string"
},
"startDate": {
"type": "string",
"format": "date"
},
"endDate": {
"type": "string",
"format": "date"
},
"highlights": {
"type": "array",
"items": {
"type": "string"
}
},
"technologies": {
"type": "array",
"items": {
"type": "string"
}
}
}
}
},
"education": {
"type": "array",
"items": {
"type": "object",
"required": [
"institution",
"degree"
],
"properties": {
"institution": {
"type": "string"
},
"degree": {
"type": "string"
},
"field": {
"type": "string"
},
"graduationDate": {
"type": "string",
"format": "date"
},
"gpa": {
"type": "number"
}
}
}
},
"certifications": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"issuer": {
"type": "string"
},
"date": {
"type": "string",
"format": "date"
},
"validUntil": {
"type": "string",
"format": "date"
}
}
}
},
"publications": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"publisher": {
"type": "string"
},
"date": {
"type": "string",
"format": "date"
},
"url": {
"type": "string",
"format": "uri"
}
}
}
}
}
}
@@ -1,300 +0,0 @@
<!doctype html>
<html>
<head>
<style>
body {
font-family: "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
margin: 0;
padding: 0;
background: #fff;
color: #333;
line-height: 1.6;
}
.container {
display: flex;
max-width: 1200px;
margin: 0 auto;
box-shadow: 0 0 20px rgba(0, 0, 0, 0.1);
min-height: 100vh;
}
.sidebar {
background: #2c3e50;
color: white;
padding: 2rem;
width: 300px;
}
.main-content {
padding: 2rem;
flex: 1;
}
.profile-name {
font-size: 2.5rem;
margin: 0;
color: #2c3e50;
border-bottom: 3px solid #3498db;
padding-bottom: 0.5rem;
}
.profile-title {
font-size: 1.5rem;
color: #7f8c8d;
margin: 0.5rem 0 2rem 0;
}
.contact-info {
margin-bottom: 2rem;
}
.section-title {
font-size: 1.2rem;
text-transform: uppercase;
color: #3498db;
margin-bottom: 1rem;
letter-spacing: 1px;
}
.sidebar .section-title {
color: white;
border-bottom: 2px solid #3498db;
padding-bottom: 0.5rem;
}
.skill-category {
margin-bottom: 1rem;
}
.skill-list {
list-style: none;
padding: 0;
margin: 0;
}
.skill-list li {
margin-bottom: 0.5rem;
font-size: 0.9rem;
}
.experience-item {
margin-bottom: 2rem;
}
.company-name {
font-weight: bold;
color: #2c3e50;
font-size: 1.1rem;
}
.job-title {
color: #3498db;
font-weight: bold;
}
.date {
color: #7f8c8d;
font-size: 0.9rem;
}
.achievements {
list-style: disc;
padding-left: 1.2rem;
margin-top: 0.5rem;
}
.contact-info a {
color: white;
text-decoration: none;
}
.education-item {
margin-bottom: 1rem;
}
</style>
</head>
<body>
<div class="container">
<div class="sidebar">
<div class="contact-info">
<h2 class="section-title">Contact</h2>
<p>sarah.chen@email.com</p>
<p>(555) 123-4567</p>
<p>San Francisco, CA</p>
<p><a href="#">LinkedIn Profile</a></p>
</div>
<div class="skills-section">
<h2 class="section-title">Technical Skills</h2>
<div class="skill-category">
<h3>Architecture & Design</h3>
<ul class="skill-list">
<li>Microservices</li>
<li>Event-Driven Architecture</li>
<li>Domain-Driven Design</li>
<li>REST APIs</li>
</ul>
</div>
<div class="skill-category">
<h3>Cloud Platforms</h3>
<ul class="skill-list">
<li>AWS (Advanced)</li>
<li>Azure</li>
<li>Google Cloud Platform</li>
</ul>
</div>
<div class="skill-category">
<h3>Programming</h3>
<ul class="skill-list">
<li>Java</li>
<li>Python</li>
<li>Go</li>
<li>JavaScript/TypeScript</li>
</ul>
</div>
<div class="skill-category">
<h3>Certifications</h3>
<ul class="skill-list">
<li>AWS Solutions Architect - Professional</li>
<li>Google Cloud Architect</li>
<li>Certified Kubernetes Administrator</li>
</ul>
</div>
</div>
</div>
<div class="main-content">
<h1 class="profile-name">Sarah Chen</h1>
<div class="profile-title">Senior Software Architect</div>
<div class="section">
<h2 class="section-title">Professional Summary</h2>
<p>
Innovative Software Architect with over 12 years of experience
designing and implementing large-scale distributed systems. Proven
track record of leading technical teams and delivering robust
enterprise solutions. Expert in cloud architecture, microservices,
and emerging technologies with a focus on scalable, maintainable
systems.
</p>
</div>
<div class="section">
<h2 class="section-title">Professional Experience</h2>
<div class="experience-item">
<div class="company-name">TechCorp Solutions</div>
<div class="job-title">Senior Software Architect</div>
<div class="date">2020 - Present</div>
<ul class="achievements">
<li>
Led architectural design and implementation of a cloud-native
platform serving 2M+ users
</li>
<li>
Established architectural guidelines and best practices adopted
across 12 development teams
</li>
<li>
Reduced system latency by 40% through implementation of
event-driven architecture
</li>
<li>
Mentored 15+ senior developers in cloud-native development
practices
</li>
</ul>
</div>
<div class="experience-item">
<div class="company-name">DataFlow Systems</div>
<div class="job-title">Lead Software Engineer</div>
<div class="date">2016 - 2020</div>
<ul class="achievements">
<li>
Architected and led development of distributed data processing
platform handling 5TB daily
</li>
<li>
Designed microservices architecture reducing deployment time by
65%
</li>
<li>
Led migration of legacy monolith to cloud-native architecture
</li>
<li>
Managed team of 8 engineers across 3 international locations
</li>
</ul>
</div>
<div class="experience-item">
<div class="company-name">InnovateTech</div>
<div class="job-title">Senior Software Engineer</div>
<div class="date">2013 - 2016</div>
<ul class="achievements">
<li>
Developed high-performance trading platform processing 100K
transactions per second
</li>
<li>
Implemented real-time analytics engine reducing processing
latency by 75%
</li>
<li>
Led adoption of container orchestration reducing deployment
costs by 35%
</li>
</ul>
</div>
</div>
<div class="section">
<h2 class="section-title">Education</h2>
<div class="education-item">
<div class="company-name">Stanford University</div>
<div class="job-title">Master of Science in Computer Science</div>
<div class="date">2013</div>
<p>Focus: Distributed Systems and Machine Learning</p>
</div>
<div class="education-item">
<div class="company-name">University of California, Berkeley</div>
<div class="job-title">
Bachelor of Science in Computer Engineering
</div>
<div class="date">2011</div>
<p>Magna Cum Laude</p>
</div>
</div>
<div class="section">
<h2 class="section-title">Patents & Speaking</h2>
<ul class="achievements">
<li>
Co-inventor on three patents for distributed systems architecture
</li>
<li>
Published paper on "Scalable Microservices Architecture" at IEEE
Cloud Computing Conference 2022
</li>
<li>
Keynote Speaker, CloudCon 2023: "Future of Cloud-Native
Architecture"
</li>
<li>Regular presenter at local tech meetups and conferences</li>
</ul>
</div>
</div>
</div>
</body>
</html>
@@ -1,104 +0,0 @@
{
"basics": {
"name": "Sarah Chen",
"email": "sarah.chen@email.com",
"phone": "(555) 123-4567",
"location": {
"city": "San Francisco",
"region": "CA",
"country": "USA"
}
},
"skills": [
{
"category": "Architecture & Design",
"keywords": [
"Microservices",
"Event-Driven Architecture",
"Domain-Driven Design",
"REST APIs"
]
},
{
"category": "Cloud Platforms",
"keywords": [
"AWS",
"Azure",
"Google Cloud Platform"
]
},
{
"category": "Programming Languages",
"keywords": [
"Java",
"Python",
"Go",
"JavaScript",
"TypeScript"
]
}
],
"experience": [
{
"company": "TechCorp Solutions",
"position": "Senior Software Architect",
"startDate": "2020-01-01",
"endDate": "2024-01-10"
},
{
"company": "DataFlow Systems",
"position": "Lead Software Engineer",
"startDate": "2016-01-01",
"endDate": "2019-12-31",
"technologies": [
"Distributed Systems",
"Microservices",
"Cloud Migration"
]
},
{
"company": "InnovateTech",
"position": "Senior Software Engineer",
"startDate": "2013-01-01",
"endDate": "2015-12-31",
"technologies": [
"High-performance Computing",
"Real-time Analytics",
"Container Orchestration"
]
}
],
"education": [
{
"institution": "Stanford University",
"degree": "Master of Science",
"field": "Computer Science",
"graduationDate": "2013-01-01",
"specialization": "Distributed Systems and Machine Learning"
},
{
"institution": "University of California, Berkeley",
"degree": "Bachelor of Science",
"field": "Computer Engineering",
"graduationDate": "2011-01-01"
}
],
"certifications": [
{
"name": "AWS Solutions Architect - Professional"
},
{
"name": "Google Cloud Architect"
},
{
"name": "Certified Kubernetes Administrator"
}
],
"publications": [
{
"title": "Scalable Microservices Architecture",
"publisher": "IEEE Cloud Computing Conference",
"date": "2022-01-01"
}
]
}
Binary file not shown.
-48
View File
@@ -1,48 +0,0 @@
{
"companyInfo": {
"name": "CloudFlow Analytics",
"fundingStage": "Series A",
"foundedYear": null,
"industry": null,
"location": null
},
"financialMetrics": {
"mrr": {
"value": 580000,
"currency": "USD",
"growthRate": 27
},
"grossMargin": 88
},
"growthMetrics": {
"customers": {
"total": 1247,
"growth": 142,
"enterprisePercent": null
},
"nrr": 147
},
"marketMetrics": {
"tam": 50000000000,
"sam": null,
"marketShare": null,
"competitors": null
},
"differentiators": [
{
"claim": "Processing Speed",
"metric": "5x faster",
"comparisonTarget": "competitors"
},
{
"claim": "ML Accuracy",
"metric": "99.9%",
"comparisonTarget": null
},
{
"claim": "Market Potential",
"metric": "80%",
"comparisonTarget": "Fortune 500"
}
]
}
-151
View File
@@ -1,151 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"required": [
"companyInfo",
"financialMetrics",
"growthMetrics"
],
"properties": {
"companyInfo": {
"type": "object",
"required": [
"name",
"fundingStage"
],
"properties": {
"name": {
"type": "string"
},
"fundingStage": {
"type": "string",
"enum": [
"Pre-seed",
"Seed",
"Series A",
"Series B",
"Series C+"
]
},
"foundedYear": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
]
},
"industry": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
]
},
"location": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
]
}
}
},
"financialMetrics": {
"type": "object",
"required": [
"mrr",
"grossMargin"
],
"properties": {
"mrr": {
"type": "object",
"description": "Monthly Recurring Revenue",
"required": [
"value",
"currency",
"growthRate"
],
"properties": {
"value": {
"type": "number"
},
"currency": {
"type": "string"
},
"growthRate": {
"type": "number"
}
}
},
"grossMargin": {
"type": "number"
}
}
},
"growthMetrics": {
"type": "object",
"required": [
"customers",
"nrr"
],
"properties": {
"customers": {
"type": "object",
"required": [
"total",
"growth"
],
"properties": {
"total": {
"type": "integer"
},
"growth": {
"type": "number"
}
}
},
"nrr": {
"description": "Net Revenue Retention",
"type": "number"
}
}
},
"differentiators": {
"type": "array",
"items": {
"type": "object",
"required": [
"claim",
"metric"
],
"properties": {
"claim": {
"type": "string"
},
"metric": {
"type": "string"
},
"comparisonTarget": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
]
}
}
}
}
}
}
-149
View File
@@ -1,149 +0,0 @@
import os
import pytest
from pathlib import Path
from llama_extract import LlamaExtract, ExtractionAgent
from dotenv import load_dotenv
from time import perf_counter
from collections import namedtuple
import json
import uuid
from llama_cloud.core.api_error import ApiError
from llama_cloud.types import (
ExtractConfig,
ExtractMode,
LlamaParseParameters,
LlamaExtractSettings,
)
# Load variables from the ".env.dev" file one directory above this file's
# directory; override=True is passed to load_dotenv.
load_dotenv(Path(__file__).parent.parent / ".env.dev", override=True)
# "data" directory next to this file; get_test_cases scans its sub-directories.
TEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
# Get configuration from environment
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL")
LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID")
# One parametrized case: display name, schema file path, extraction config,
# input file path, and path of the expected-output JSON.
TestCase = namedtuple(
"TestCase", ["name", "schema_path", "config", "input_file", "expected_output"]
)
def get_test_cases():
    """Discover extraction test cases under TEST_DIR.

    A sub-directory counts as a data type only if it holds a ``schema.json``.
    Every other regular file in it that is not a ``*.test.json`` is an input,
    kept only when ``<input stem>.test.json`` exists beside it. Each kept
    input yields one case per extraction mode (FAST, then ACCURATE).

    Returns:
        List[TestCase]: List of test cases
    """
    cases = []
    for data_type in os.listdir(TEST_DIR):
        data_type_dir = os.path.join(TEST_DIR, data_type)
        if not os.path.isdir(data_type_dir):
            continue

        schema_path = os.path.join(data_type_dir, "schema.json")
        if not os.path.exists(schema_path):
            continue

        # Candidate inputs: regular files other than the schema and the
        # expected-output files, in sorted path order.
        input_files = sorted(
            os.path.join(data_type_dir, entry)
            for entry in os.listdir(data_type_dir)
            if os.path.isfile(os.path.join(data_type_dir, entry))
            and entry != "schema.json"
            and not entry.endswith(".test.json")
        )

        modes = [
            ExtractConfig(extraction_mode=ExtractMode.FAST),
            ExtractConfig(extraction_mode=ExtractMode.ACCURATE),
        ]

        for input_file in input_files:
            base_name = os.path.splitext(os.path.basename(input_file))[0]
            expected_output = os.path.join(data_type_dir, f"{base_name}.test.json")
            if not os.path.exists(expected_output):
                continue

            test_name = f"{data_type}/{os.path.basename(input_file)}"
            cases.extend(
                TestCase(
                    name=test_name,
                    schema_path=schema_path,
                    input_file=input_file,
                    config=mode,
                    expected_output=expected_output,
                )
                for mode in modes
            )
    return cases
@pytest.fixture(scope="session")
def extractor():
    """Session-scoped LlamaExtract client shared by every test in the run."""
    client_options = {
        "api_key": LLAMA_CLOUD_API_KEY,
        "base_url": LLAMA_CLOUD_BASE_URL,
        "project_id": LLAMA_CLOUD_PROJECT_ID,
        "verbose": True,
    }
    session_client = LlamaExtract(**client_options)
    yield session_client
    # Teardown: shut the client's thread pool down once the session ends.
    session_client._thread_pool.shutdown()
@pytest.fixture
def extraction_agent(test_case: TestCase, extractor: LlamaExtract):
    """Fixture to create and cleanup extraction agent for each test.

    The agent is named ``<test_case.name>_<8 hex chars>`` and created from
    the JSON schema at ``test_case.schema_path`` with ``test_case.config``.
    After the test, the agent is deleted again; a failed deletion is reported
    with a warning instead of failing the test.
    """
    # Create unique name with random UUID (important for CI to avoid conflicts)
    unique_id = uuid.uuid4().hex[:8]
    agent_name = f"{test_case.name}_{unique_id}"

    with open(test_case.schema_path, "r") as f:
        schema = json.load(f)

    # Clean up any existing agents with this name
    try:
        for existing in extractor.list_agents():
            if existing.name == agent_name:
                extractor.delete_agent(existing.id)
    except Exception as e:
        print(f"Warning: Failed to cleanup existing agent: {str(e)}")

    # Create new agent
    agent = extractor.create_agent(agent_name, schema, config=test_case.config)
    yield agent

    # Cleanup after test. This was previously missing, so every benchmark
    # case left its uniquely named agent behind.
    try:
        extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {str(e)}")
@pytest.mark.skipif(
    "CI" in os.environ,
    reason="CI environment is not suitable for benchmarking",
)
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda case: case.name)
@pytest.mark.asyncio(loop_scope="session")
async def test_extraction(
    test_case: TestCase, extraction_agent: ExtractionAgent
) -> None:
    """Time one extraction of ``test_case.input_file`` and print the result.

    The LlamaParse parameters set ``invalidate_cache`` and ``do_not_cache``.
    Nothing is asserted; the elapsed wall-clock time and the result are only
    printed.
    """
    start = perf_counter()
    # The settings object is built inside the timed region on purpose, so
    # the measurement covers the same work as before.
    outcome = await extraction_agent._queue_extraction_test(
        test_case.input_file,
        extract_settings=LlamaExtractSettings(
            llama_parse_params=LlamaParseParameters(
                invalidate_cache=True,
                do_not_cache=True,
            )
        ),
    )
    end = perf_counter()

    print(f"Time taken: {end - start} seconds")
    print(outcome)
-190
View File
@@ -1,190 +0,0 @@
import os
import pytest
from pathlib import Path
from pydantic import BaseModel
from dotenv import load_dotenv
from llama_cloud.core.api_error import ApiError
from llama_extract import LlamaExtract, ExtractionAgent
# Load environment variables
# (from the ".env.dev" file one directory above this file's directory;
# override=True is passed to load_dotenv)
load_dotenv(Path(__file__).parent.parent / ".env.dev", override=True)
# Get configuration from environment
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL")
LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID")
# Skip all tests if API key is not set
# (module-level pytestmark: the skip applies when the key is missing or empty)
pytestmark = pytest.mark.skipif(
not LLAMA_CLOUD_API_KEY, reason="LLAMA_CLOUD_API_KEY not set"
)
# Test data
# Pydantic model with two string fields; used as the agent schema by
# test_create_agent_with_pydantic_schema via the agent_schema marker.
class TestSchema(BaseModel):
title: str
summary: str
# Test data paths
# TEST_DIR is the "data" directory next to this file.
TEST_DIR = Path(__file__).parent / "data"
# PDF passed to extract/aextract by the TestExtractionAgent tests.
TEST_PDF = TEST_DIR / "slide" / "saas_slide.pdf"
@pytest.fixture
def llama_extract():
    """Function-scoped LlamaExtract client built from the environment settings."""
    client = LlamaExtract(
        api_key=LLAMA_CLOUD_API_KEY,
        base_url=LLAMA_CLOUD_BASE_URL,
        project_id=LLAMA_CLOUD_PROJECT_ID,
        verbose=True,
    )
    return client
@pytest.fixture
def test_agent_name():
    """Default base name used by the ``test_agent`` fixture."""
    default_name = "test-api-agent"
    return default_name
@pytest.fixture
def test_schema_dict():
    """Default JSON schema for test agents: an object with two string fields."""
    string_field = "string"
    properties = {
        "title": {"type": string_field},
        "summary": {"type": string_field},
    }
    return {"type": "object", "properties": properties}
@pytest.fixture
def test_agent(llama_extract, test_agent_name, test_schema_dict, request):
    """Creates a test agent and cleans it up after the test

    The agent name is ``<base>_<hash>``: ``base`` is the first argument of
    the first ``agent_name`` marker on the test (else ``test_agent_name``),
    and ``hash`` is the last 8 characters of ``hex(hash(nodeid))``. The schema
    is the first argument of the first ``agent_schema`` marker, unwrapped when
    it is a tuple (else ``test_schema_dict``).
    """
    test_id = request.node.nodeid
    test_hash = hex(hash(test_id))[-8:]

    # First matching marker wins; fall back to the fixture-provided default.
    base_name = test_agent_name
    for marker in request.node.iter_markers("agent_name"):
        base_name = marker.args[0]
        break
    name = f"{base_name}_{test_hash}"

    schema = test_schema_dict
    for marker in request.node.iter_markers("agent_schema"):
        first_arg = marker.args[0]
        # A schema may arrive wrapped in a tuple (see the marker usage on
        # test_create_agent_with_pydantic_schema); unwrap it in that case.
        schema = first_arg[0] if isinstance(first_arg, tuple) else first_arg
        break

    # Cleanup existing agent
    try:
        for existing in llama_extract.list_agents():
            if existing.name != name:
                continue
            llama_extract.delete_agent(existing.id)
    except Exception as e:
        print(f"Warning: Failed to cleanup existing agent: {e}")

    agent = llama_extract.create_agent(name=name, data_schema=schema)
    yield agent

    # Cleanup after test
    try:
        llama_extract.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {e}")
class TestLlamaExtract:
    """Tests for the LlamaExtract client: construction and agent lookup/listing."""

    def test_init_without_api_key(self, monkeypatch):
        """Constructing the client with no API key anywhere raises ValueError."""
        # monkeypatch restores the variable after the test even when the
        # assertion fails, and raising=False tolerates it being absent. The
        # previous manual delete/restore did neither.
        monkeypatch.delenv("LLAMA_CLOUD_API_KEY", raising=False)
        with pytest.raises(ValueError, match="The API key is required"):
            LlamaExtract(api_key=None, base_url=LLAMA_CLOUD_BASE_URL)

    @pytest.mark.agent_name("test-dict-schema-agent")
    def test_create_agent_with_dict_schema(self, test_agent):
        """An agent created from the default dict schema is an ExtractionAgent."""
        assert isinstance(test_agent, ExtractionAgent)

    @pytest.mark.agent_name("test-pydantic-schema-agent")
    @pytest.mark.agent_schema((TestSchema,))
    def test_create_agent_with_pydantic_schema(self, test_agent):
        """An agent created from the TestSchema model is an ExtractionAgent."""
        assert isinstance(test_agent, ExtractionAgent)

    def test_get_agent_by_name(self, llama_extract, test_agent):
        """Looking an agent up by name returns the same id, name and schema."""
        agent = llama_extract.get_agent(name=test_agent.name)
        assert isinstance(agent, ExtractionAgent)
        assert agent.name == test_agent.name
        assert agent.id == test_agent.id
        assert agent.data_schema == test_agent.data_schema

    def test_get_agent_by_id(self, llama_extract, test_agent):
        """Looking an agent up by id returns the same id, name and schema."""
        agent = llama_extract.get_agent(id=test_agent.id)
        assert isinstance(agent, ExtractionAgent)
        assert agent.id == test_agent.id
        assert agent.name == test_agent.name
        assert agent.data_schema == test_agent.data_schema

    def test_list_agents(self, llama_extract, test_agent):
        """The freshly created agent appears in the listing."""
        agents = llama_extract.list_agents()
        assert isinstance(agents, list)
        assert any(a.id == test_agent.id for a in agents)
class TestExtractionAgent:
    """Tests for ExtractionAgent: extraction, schema updates and run listing."""

    @staticmethod
    def _assert_title_and_summary(result):
        """Check one extraction result: SUCCESS status and a dict with both fields."""
        assert result.status == "SUCCESS"
        assert result.data is not None
        assert isinstance(result.data, dict)
        assert "title" in result.data
        assert "summary" in result.data

    @pytest.mark.asyncio
    async def test_extract_single_file(self, test_agent):
        """Async extraction of one PDF succeeds and yields title and summary."""
        outcome = await test_agent.aextract(TEST_PDF)
        self._assert_title_and_summary(outcome)

    def test_sync_extract_single_file(self, test_agent):
        """Sync extraction of one PDF succeeds and yields title and summary."""
        outcome = test_agent.extract(TEST_PDF)
        self._assert_title_and_summary(outcome)

    @pytest.mark.asyncio
    async def test_extract_multiple_files(self, test_agent):
        """Async extraction of a two-file list returns two successful results."""
        files = [TEST_PDF, TEST_PDF]  # Using same file twice for testing
        response = await test_agent.aextract(files)

        assert len(response) == 2
        for outcome in response:
            self._assert_title_and_summary(outcome)

    def test_save_agent_updates(
        self, test_agent: ExtractionAgent, llama_extract: LlamaExtract
    ):
        """A schema change saved on the agent is visible on a fresh lookup."""
        string_type = {"type": "string"}
        new_schema = {
            "type": "object",
            "properties": {
                "new_field": dict(string_type),
                "title": dict(string_type),
                "summary": dict(string_type),
            },
        }
        test_agent.data_schema = new_schema
        test_agent.save()

        # Verify the update by getting a fresh instance
        refreshed = llama_extract.get_agent(name=test_agent.name)
        assert "new_field" in refreshed.data_schema["properties"]

    def test_list_extraction_runs(self, test_agent: ExtractionAgent):
        """A new agent has no runs; after one extraction it has at least one."""
        assert len(test_agent.list_extraction_runs()) == 0
        test_agent.extract(TEST_PDF)
        assert len(test_agent.list_extraction_runs()) > 0
-142
View File
@@ -1,142 +0,0 @@
import os
import pytest
from pathlib import Path
from llama_extract import LlamaExtract, ExtractionAgent
from dotenv import load_dotenv
from collections import namedtuple
import json
import uuid
from llama_cloud.core.api_error import ApiError
from llama_cloud.types import ExtractConfig, ExtractMode, ExtractConfig
from deepdiff import DeepDiff
from tests.util import json_subset_match_score
# Load variables from the ".env.dev" file one directory above this file's
# directory; override=True is passed to load_dotenv.
load_dotenv(Path(__file__).parent.parent / ".env.dev", override=True)
# "data" directory next to this file; get_test_cases scans its sub-directories.
TEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
# Get configuration from environment
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL")
LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID")
# One parametrized case: display name, schema file path, extraction config,
# input file path, and path of the expected-output JSON.
TestCase = namedtuple(
"TestCase", ["name", "schema_path", "config", "input_file", "expected_output"]
)
def get_test_cases():
    """Discover extraction test cases under TEST_DIR.

    A sub-directory counts as a data type only if it holds a ``schema.json``.
    Every other regular file in it that is not a ``*.test.json`` is an input,
    kept only when ``<input stem>.test.json`` exists beside it. Each kept
    input yields one case per extraction mode (FAST, then ACCURATE).

    Returns:
        List[TestCase]: List of test cases
    """
    cases = []
    for data_type in os.listdir(TEST_DIR):
        data_type_dir = os.path.join(TEST_DIR, data_type)
        if not os.path.isdir(data_type_dir):
            continue

        schema_path = os.path.join(data_type_dir, "schema.json")
        if not os.path.exists(schema_path):
            continue

        # Candidate inputs: regular files other than the schema and the
        # expected-output files, in sorted path order.
        input_files = sorted(
            os.path.join(data_type_dir, entry)
            for entry in os.listdir(data_type_dir)
            if os.path.isfile(os.path.join(data_type_dir, entry))
            and entry != "schema.json"
            and not entry.endswith(".test.json")
        )

        modes = [
            ExtractConfig(extraction_mode=ExtractMode.FAST),
            ExtractConfig(extraction_mode=ExtractMode.ACCURATE),
        ]

        for input_file in input_files:
            base_name = os.path.splitext(os.path.basename(input_file))[0]
            expected_output = os.path.join(data_type_dir, f"{base_name}.test.json")
            if not os.path.exists(expected_output):
                continue

            test_name = f"{data_type}/{os.path.basename(input_file)}"
            cases.extend(
                TestCase(
                    name=test_name,
                    schema_path=schema_path,
                    input_file=input_file,
                    config=mode,
                    expected_output=expected_output,
                )
                for mode in modes
            )
    return cases
@pytest.fixture(scope="session")
def extractor():
    """Session-scoped LlamaExtract client shared by every test in the run."""
    client_options = {
        "api_key": LLAMA_CLOUD_API_KEY,
        "base_url": LLAMA_CLOUD_BASE_URL,
        "project_id": LLAMA_CLOUD_PROJECT_ID,
        "verbose": True,
    }
    session_client = LlamaExtract(**client_options)
    yield session_client
    # Teardown: shut the client's thread pool down once the session ends.
    session_client._thread_pool.shutdown()
@pytest.fixture
def extraction_agent(test_case: TestCase, extractor: LlamaExtract):
    """Provide a freshly created extraction agent and delete it afterwards.

    The agent is named ``<test_case.name>_<8 hex chars>`` and created from
    the JSON schema at ``test_case.schema_path`` with ``test_case.config``.
    Failures while removing agents are printed as warnings, not raised.
    """
    # Create unique name with random UUID (important for CI to avoid conflicts)
    unique_id = uuid.uuid4().hex[:8]
    agent_name = f"{test_case.name}_{unique_id}"

    with open(test_case.schema_path, "r") as schema_file:
        schema = json.load(schema_file)

    # Clean up any existing agents with this name
    try:
        for existing in extractor.list_agents():
            if existing.name != agent_name:
                continue
            extractor.delete_agent(existing.id)
    except Exception as e:
        print(f"Warning: Failed to cleanup existing agent: {str(e)}")

    # Create new agent
    agent = extractor.create_agent(agent_name, schema, config=test_case.config)
    yield agent

    # Cleanup after test
    try:
        extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {str(e)}")
@pytest.mark.skipif(
    os.environ.get("LLAMA_CLOUD_API_KEY", "") == "",
    reason="LLAMA_CLOUD_API_KEY not set",
)
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda case: case.name)
def test_extraction(test_case: TestCase, extraction_agent: ExtractionAgent) -> None:
    """Extract the case's input file and compare it loosely to the expected JSON.

    Passes when the subset-match score of expected vs. extracted data is
    above 0.3; on failure the DeepDiff of the two is the assertion message.
    """
    extracted = extraction_agent.extract(test_case.input_file).data
    with open(test_case.expected_output, "r") as expected_file:
        expected = json.load(expected_file)

    # TODO: fix the saas_slide test
    score = json_subset_match_score(expected, extracted)
    assert score > 0.3, DeepDiff(expected, extracted, ignore_order=True)
-37
View File
@@ -1,37 +0,0 @@
from typing import Any
from autoevals.string import Levenshtein
from autoevals.number import NumericDiff
def json_subset_match_score(expected: Any, actual: Any) -> float:
    """
    Adapted from autoevals.JsonDiff to only test on the subset of keys within the expected json.

    Scoring, applied recursively:
      * dict vs dict: mean score over the keys of ``expected`` only; keys
        that appear only in ``actual`` are ignored. An empty ``expected``
        dict scores 1 against any dict.
      * list vs list: element-wise scores over the zipped pairs, divided by
        the longer length, so missing or surplus elements count as 0.
      * str vs str: Levenshtein scorer; number vs number: NumericDiff scorer.
      * None vs None: 1. Any other combination (including a type mismatch): 0.

    Args:
        expected: The reference JSON value.
        actual: The JSON value under test.

    Returns:
        float: The match score computed by the rules above.
    """
    if isinstance(expected, dict) and isinstance(actual, dict):
        if not expected:
            # Nothing is expected, so any mapping satisfies the subset. This
            # also avoids dividing by zero when only ``actual`` has keys.
            return 1
        scores = [
            json_subset_match_score(value, actual.get(key))
            for key, value in expected.items()
        ]
        scores = [s for s in scores if s is not None]
        return sum(scores) / len(scores)
    elif isinstance(expected, list) and isinstance(actual, list):
        if len(expected) == 0 and len(actual) == 0:
            return 1
        scores = [json_subset_match_score(e1, e2) for (e1, e2) in zip(expected, actual)]
        scores = [s for s in scores if s is not None]
        return sum(scores) / max(len(expected), len(actual))
    elif isinstance(expected, str) and isinstance(actual, str):
        # Scorers are created only where needed rather than on every
        # recursive call.
        return Levenshtein().eval(expected, actual).score
    elif isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
        return NumericDiff().eval(expected, actual).score
    elif expected is None and actual is None:
        return 1
    else:
        # One side is None, or the two sides have incompatible types.
        return 0