mirror of
https://github.com/run-llama/template-workflow-extract-reconcile-invoice.git
synced 2026-06-30 22:17:53 -04:00
llama-cloud v2: classify-extract-sec + extract-reconcile-invoice (#257)
This commit is contained in:
+131
-24
@@ -1,37 +1,80 @@
|
||||
{
|
||||
"extract": {
|
||||
"extraction_agent_id": null,
|
||||
"json_schema": {
|
||||
"product_type": "extract_v2",
|
||||
"configuration_id": null,
|
||||
"data_schema": {
|
||||
"type": "object",
|
||||
"description": "Schema for extracting invoice data",
|
||||
"properties": {
|
||||
"invoice_number": {
|
||||
"anyOf": [{ "type": "string" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Invoice number or identifier"
|
||||
},
|
||||
"invoice_date": {
|
||||
"anyOf": [{ "type": "string" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Date of the invoice (YYYY-MM-DD format if possible)"
|
||||
},
|
||||
"vendor_name": {
|
||||
"anyOf": [{ "type": "string" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Name of the vendor or supplier"
|
||||
},
|
||||
"vendor_address": {
|
||||
"anyOf": [{ "type": "string" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Address of the vendor"
|
||||
},
|
||||
"purchase_order_number": {
|
||||
"anyOf": [{ "type": "string" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Purchase order (PO) number if present"
|
||||
},
|
||||
"payment_terms": {
|
||||
"anyOf": [{ "type": "string" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Payment terms (e.g., Net 30, Net 60, Due on receipt)"
|
||||
},
|
||||
@@ -41,22 +84,50 @@
|
||||
"items": {
|
||||
"properties": {
|
||||
"description": {
|
||||
"anyOf": [{ "type": "string" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Description of the line item"
|
||||
},
|
||||
"quantity": {
|
||||
"anyOf": [{ "type": "number" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Quantity of the item"
|
||||
},
|
||||
"unit_price": {
|
||||
"anyOf": [{ "type": "number" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Price per unit of the item"
|
||||
},
|
||||
"total": {
|
||||
"anyOf": [{ "type": "number" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Total price for this line item"
|
||||
}
|
||||
@@ -65,35 +136,71 @@
|
||||
},
|
||||
"type": "array"
|
||||
},
|
||||
{ "type": "null" }
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "List of line items on the invoice"
|
||||
},
|
||||
"subtotal": {
|
||||
"anyOf": [{ "type": "number" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Subtotal before tax and other charges"
|
||||
},
|
||||
"tax": {
|
||||
"anyOf": [{ "type": "number" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Tax amount"
|
||||
},
|
||||
"total": {
|
||||
"anyOf": [{ "type": "number" }, { "type": "null" }],
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Total amount due on the invoice"
|
||||
}
|
||||
}
|
||||
},
|
||||
"settings": {
|
||||
"extraction_mode": "PREMIUM",
|
||||
"system_prompt": null,
|
||||
"citation_bbox": true,
|
||||
"use_reasoning": false,
|
||||
"cite_sources": true,
|
||||
"confidence_scores": true
|
||||
}
|
||||
"tier": "agentic",
|
||||
"extraction_target": "per_doc",
|
||||
"system_prompt": null,
|
||||
"cite_sources": true,
|
||||
"confidence_scores": true
|
||||
},
|
||||
"classify": {
|
||||
"product_type": "classify_v2",
|
||||
"configuration_id": null,
|
||||
"rules": []
|
||||
},
|
||||
"parse": {
|
||||
"product_type": "parse_v2",
|
||||
"configuration_id": null,
|
||||
"tier": "agentic",
|
||||
"version": "latest"
|
||||
},
|
||||
"split": {
|
||||
"product_type": "split_v1",
|
||||
"configuration_id": null,
|
||||
"categories": []
|
||||
}
|
||||
}
|
||||
|
||||
+1
-1
@@ -5,7 +5,7 @@ description = "Extracts data"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"llama-cloud>=1.3.0,<2",
|
||||
"llama-cloud>=2.3.0,<3",
|
||||
"json-schema-to-pydantic>=0.4.8",
|
||||
"llama-index-workflows>=2.16.0,<3.0.0",
|
||||
"python-dotenv>=1.1.0",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
from llama_cloud import AsyncLlamaCloud, ConflictError
|
||||
from llama_cloud import AsyncLlamaCloud
|
||||
from llama_index.llms.openai import OpenAI
|
||||
|
||||
from extraction_review.config import CONTRACTS_INDEX_NAME
|
||||
@@ -34,18 +34,15 @@ async def get_contracts_pipeline_id() -> str:
|
||||
global _contracts_pipeline_id
|
||||
if _contracts_pipeline_id is None:
|
||||
client = get_llama_cloud_client()
|
||||
try:
|
||||
pipeline = await client.pipelines.upsert(
|
||||
name=CONTRACTS_INDEX_NAME,
|
||||
project_id=project_id,
|
||||
)
|
||||
except ConflictError:
|
||||
# Pipeline already exists — look it up by name
|
||||
pipelines = await client.pipelines.list(
|
||||
pipeline_name=CONTRACTS_INDEX_NAME,
|
||||
project_id=project_id,
|
||||
)
|
||||
pipeline = pipelines[0]
|
||||
# data_sink_id=None + transform_config are both required; omitting
|
||||
# either trips a misleading 409 "integrity constraint" error. Leaving
|
||||
# embedding_config unset lets the platform use managed OpenAI embeddings.
|
||||
pipeline = await client.pipelines.upsert(
|
||||
name=CONTRACTS_INDEX_NAME,
|
||||
project_id=project_id,
|
||||
data_sink_id=None,
|
||||
transform_config={"mode": "auto"},
|
||||
)
|
||||
_contracts_pipeline_id = pipeline.id
|
||||
return _contracts_pipeline_id
|
||||
|
||||
|
||||
@@ -2,24 +2,20 @@
|
||||
Configuration for the extraction review application.
|
||||
|
||||
Configuration is loaded from configs/config.json via ResourceConfig.
|
||||
The unified config contains both extraction settings and the JSON schema.
|
||||
|
||||
Extraction can run in two modes, controlled by the "extraction_agent_id" field
|
||||
in configs/config.json:
|
||||
|
||||
- Local (default): extraction_agent_id is null. Uses the json_schema and
|
||||
settings defined in config.json directly via extraction.run().
|
||||
|
||||
- Remote agent: extraction_agent_id is set to a LlamaCloud extraction agent
|
||||
ID. Uses extraction.jobs.extract(extraction_agent_id=...) which delegates
|
||||
schema and settings to the remote agent. The local json_schema and settings
|
||||
in config.json are ignored — both extraction and the metadata workflow fetch
|
||||
the schema directly from the remote agent.
|
||||
Each top-level key in config.json maps to an SDK product-configuration type:
|
||||
the discriminated union members returned by `client.configurations.retrieve`.
|
||||
Each template-side subclass adds an optional `configuration_id` so a key
|
||||
can either carry an inline snapshot OR point at a saved platform config.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Literal
|
||||
import os
|
||||
|
||||
from llama_cloud.types.beta.split_category import SplitCategory
|
||||
from llama_cloud.types.classify_v2_parameters import ClassifyV2Parameters, Rule
|
||||
from llama_cloud.types.extract_v2_parameters import ExtractV2Parameters
|
||||
from llama_cloud.types.parse_v2_parameters import ParseV2Parameters
|
||||
from llama_cloud.types.split_v1_parameters import SplitV1Parameters
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from .json_util import get_extraction_schema as get_extraction_schema
|
||||
@@ -31,35 +27,49 @@ logger = logging.getLogger(__name__)
|
||||
EXTRACTED_DATA_COLLECTION: str = "invoices"
|
||||
|
||||
# The name of the LlamaCloud index for storing contracts
|
||||
CONTRACTS_INDEX_NAME: str = "contracts"
|
||||
# Override with CONTRACTS_INDEX_NAME env var (useful when running against a
|
||||
# shared staging project where the default name may collide with other data).
|
||||
CONTRACTS_INDEX_NAME: str = os.getenv("CONTRACTS_INDEX_NAME", "contracts")
|
||||
|
||||
|
||||
class ExtractSettings(BaseModel):
|
||||
extraction_mode: Literal["FAST", "PREMIUM", "MULTIMODAL"]
|
||||
system_prompt: str | None = None
|
||||
citation_bbox: bool = False
|
||||
use_reasoning: bool = False
|
||||
cite_sources: bool = False
|
||||
confidence_scores: bool = False
|
||||
class ExtractConfig(ExtractV2Parameters):
|
||||
"""Extract product configuration.
|
||||
|
||||
Inherits the SDK `ExtractV2Parameters` shape. Set `configuration_id`
|
||||
to a saved LlamaCloud configuration id (cfg-...) to pull parameters
|
||||
from the platform instead of using the local values.
|
||||
"""
|
||||
|
||||
configuration_id: str | None = None
|
||||
|
||||
|
||||
class ExtractConfig(BaseModel):
|
||||
json_schema: dict[str, Any]
|
||||
settings: ExtractSettings
|
||||
# Set this to a LlamaCloud extraction agent ID to use a remote agent's
|
||||
# schema and settings instead of the local json_schema/settings above.
|
||||
# When set, extraction uses extraction.jobs.extract(extraction_agent_id=...)
|
||||
# and the local settings are ignored for extraction.
|
||||
extraction_agent_id: str | None = None
|
||||
class ClassifyConfig(ClassifyV2Parameters):
|
||||
"""Classify product configuration (extension slot, unused by the workflow)."""
|
||||
|
||||
rules: list[Rule] = []
|
||||
configuration_id: str | None = None
|
||||
|
||||
|
||||
class JsonSchema(BaseModel):
|
||||
type: str = "object"
|
||||
properties: dict[str, Any] = {}
|
||||
required: list[str] = []
|
||||
class ParseConfig(ParseV2Parameters):
|
||||
"""Parse product configuration (extension slot)."""
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return self.model_dump(exclude_none=True)
|
||||
configuration_id: str | None = None
|
||||
|
||||
|
||||
class SplitConfig(SplitV1Parameters):
|
||||
"""Split product configuration (extension slot)."""
|
||||
|
||||
categories: list[SplitCategory] = []
|
||||
configuration_id: str | None = None
|
||||
|
||||
|
||||
class Config(BaseModel):
|
||||
"""Root configuration model for configs/config.json."""
|
||||
|
||||
extract: ExtractConfig
|
||||
classify: ClassifyConfig
|
||||
parse: ParseConfig
|
||||
split: SplitConfig
|
||||
|
||||
|
||||
# Invoice extraction schema - extracted from invoice documents
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
from typing import Annotated, Any
|
||||
|
||||
import jsonref
|
||||
from llama_cloud.types.configuration_response import ExtractV2Parameters
|
||||
from workflows import Workflow, step
|
||||
from workflows.events import StartEvent, StopEvent
|
||||
from workflows.resource import ResourceConfig
|
||||
|
||||
from .clients import get_contracts_pipeline_id, get_llama_cloud_client
|
||||
from .config import EXTRACTED_DATA_COLLECTION, ExtractConfig, JsonSchema
|
||||
from .clients import get_contracts_pipeline_id, get_llama_cloud_client, project_id
|
||||
from .config import EXTRACTED_DATA_COLLECTION, ExtractConfig
|
||||
|
||||
|
||||
class MetadataResponse(StopEvent):
|
||||
@@ -22,15 +23,6 @@ class MetadataWorkflow(Workflow):
|
||||
async def get_metadata(
|
||||
self,
|
||||
_: StartEvent,
|
||||
extraction_schema: Annotated[
|
||||
JsonSchema,
|
||||
ResourceConfig(
|
||||
config_file="configs/config.json",
|
||||
path_selector="extract.json_schema",
|
||||
label="Extraction Schema",
|
||||
description="JSON Schema defining the fields to extract from documents",
|
||||
),
|
||||
],
|
||||
extract_config: Annotated[
|
||||
ExtractConfig,
|
||||
ResourceConfig(
|
||||
@@ -43,18 +35,24 @@ class MetadataWorkflow(Workflow):
|
||||
) -> MetadataResponse:
|
||||
"""Return the data schema and storage settings for the review interface.
|
||||
|
||||
When extraction_agent_id is set, fetches the schema from the remote
|
||||
agent so the UI always reflects what the agent will actually extract.
|
||||
Otherwise uses the local schema from config.json.
|
||||
When `configuration_id` is set, fetches the schema from the saved
|
||||
extract configuration so the UI always reflects what will actually be
|
||||
extracted. Otherwise uses the local schema from config.json.
|
||||
"""
|
||||
if extract_config.extraction_agent_id:
|
||||
if extract_config.configuration_id:
|
||||
client = get_llama_cloud_client()
|
||||
agent = await client.extraction.extraction_agents.get(
|
||||
extract_config.extraction_agent_id
|
||||
config_resp = await client.configurations.retrieve(
|
||||
extract_config.configuration_id,
|
||||
project_id=project_id,
|
||||
)
|
||||
schema_dict = agent.data_schema
|
||||
params = config_resp.parameters
|
||||
if not isinstance(params, ExtractV2Parameters):
|
||||
raise ValueError(
|
||||
f"Configuration {extract_config.configuration_id} is not extract_v2"
|
||||
)
|
||||
schema_dict = dict(params.data_schema)
|
||||
else:
|
||||
schema_dict = extraction_schema.to_dict()
|
||||
schema_dict = dict(extract_config.data_schema)
|
||||
|
||||
json_schema = jsonref.replace_refs(schema_dict, proxies=False)
|
||||
contracts_pipeline_id = await get_contracts_pipeline_id()
|
||||
|
||||
@@ -26,6 +26,19 @@ from .config import (
|
||||
InvoiceWithReconciliation,
|
||||
)
|
||||
|
||||
|
||||
def _field_metadata_dict(job: Any) -> dict[str, Any]:
|
||||
"""Pull document-level per-field metadata from a v2 extract job.
|
||||
|
||||
ExtractedData.create() only accepts dict/list/ExtractedFieldMetadata values,
|
||||
so None entries for unextracted fields must be dropped.
|
||||
"""
|
||||
if job.extract_metadata is None or job.extract_metadata.field_metadata is None:
|
||||
return {}
|
||||
doc_metadata = job.extract_metadata.field_metadata.document_metadata or {}
|
||||
return {k: v for k, v in doc_metadata.items() if v is not None}
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -95,10 +108,14 @@ class ProcessFileWorkflow(Workflow):
|
||||
logger.info(f"Running file {file_id}")
|
||||
|
||||
try:
|
||||
files_page = await llama_cloud_client.files.list(
|
||||
file_metadata = None
|
||||
async for f in llama_cloud_client.files.list(
|
||||
file_ids=[file_id], project_id=project_id
|
||||
)
|
||||
file_metadata = files_page.items[0]
|
||||
):
|
||||
file_metadata = f
|
||||
break
|
||||
if file_metadata is None:
|
||||
raise ValueError(f"File {file_id} not found")
|
||||
filename = file_metadata.name
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching file metadata {file_id}: {e}", exc_info=True)
|
||||
@@ -115,18 +132,19 @@ class ProcessFileWorkflow(Workflow):
|
||||
Status(level="info", message=f"Extracting data from file {filename}")
|
||||
)
|
||||
|
||||
if extract_config.extraction_agent_id:
|
||||
# Remote agent mode: delegate schema and settings to the agent
|
||||
extract_job = await llama_cloud_client.extraction.jobs.extract(
|
||||
extraction_agent_id=extract_config.extraction_agent_id,
|
||||
file_id=file_id,
|
||||
if extract_config.configuration_id:
|
||||
extract_job = await llama_cloud_client.extract.create(
|
||||
file_input=file_id,
|
||||
configuration_id=extract_config.configuration_id,
|
||||
project_id=project_id,
|
||||
)
|
||||
else:
|
||||
# Local mode: use schema and settings from config.json
|
||||
extract_job = await llama_cloud_client.extraction.run(
|
||||
config=extract_config.settings.model_dump(),
|
||||
data_schema=extract_config.json_schema,
|
||||
file_id=file_id,
|
||||
extract_job = await llama_cloud_client.extract.create(
|
||||
file_input=file_id,
|
||||
configuration=extract_config.model_dump(
|
||||
exclude={"configuration_id", "product_type"},
|
||||
exclude_none=True,
|
||||
),
|
||||
project_id=project_id,
|
||||
)
|
||||
|
||||
@@ -163,28 +181,27 @@ class ProcessFileWorkflow(Workflow):
|
||||
if state.extract_job_id is None:
|
||||
raise ValueError("Job ID cannot be null when waiting for its completion")
|
||||
|
||||
await llama_cloud_client.extraction.jobs.wait_for_completion(
|
||||
state.extract_job_id
|
||||
await llama_cloud_client.extract.wait_for_completion(
|
||||
state.extract_job_id,
|
||||
project_id=project_id,
|
||||
)
|
||||
job = await llama_cloud_client.extract.get(
|
||||
state.extract_job_id,
|
||||
expand=["extract_metadata"],
|
||||
project_id=project_id,
|
||||
)
|
||||
|
||||
extracted_result = await llama_cloud_client.extraction.jobs.get_result(
|
||||
state.extract_job_id
|
||||
)
|
||||
try:
|
||||
logger.info(
|
||||
f"Extracted data: {json.dumps(extracted_result.model_dump(), indent=2)}"
|
||||
f"Extracted data: {json.dumps(job.model_dump(mode='json'), indent=2, default=str)}"
|
||||
)
|
||||
|
||||
# Validate the extracted data as invoice
|
||||
if not extracted_result.data:
|
||||
if not job.extract_result:
|
||||
raise ValueError("No data extracted from invoice")
|
||||
|
||||
invoice_data = InvoiceExtractionSchema.model_validate(extracted_result.data)
|
||||
invoice_data = InvoiceExtractionSchema.model_validate(job.extract_result)
|
||||
logger.info(f"Extracted invoice data: {invoice_data}")
|
||||
# Extract only the field_metadata we need, not the entire ExtractRun object
|
||||
field_metadata = extracted_result.extraction_metadata.get(
|
||||
"field_metadata", {}
|
||||
)
|
||||
field_metadata = _field_metadata_dict(job)
|
||||
return ExtractedEvent(
|
||||
invoice_data=invoice_data, field_metadata=field_metadata
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user