feat: add invoices reconciliation template (#43)

This commit is contained in:
Adrian Lyjak
2025-11-06 22:43:29 -05:00
committed by GitHub
parent ffe3842030
commit 42822cea74
37 changed files with 1978 additions and 2 deletions
+3
View File
@@ -0,0 +1,3 @@
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
_commit: v0.2.1
_src_path: https://github.com/run-llama/template-workflow-data-extraction
+2
View File
@@ -0,0 +1,2 @@
# copy to .env and place any needed secrets here. LLAMA_CLOUD_API_KEY will be automatically set
# OPENAI_API_KEY=sk-xxx
+7
View File
@@ -0,0 +1,7 @@
.env
__pycache__
workflows.db
.venv
package-lock.json
node_modules
+48 -2
View File
@@ -1,2 +1,48 @@
# template-workflow-extract-reconcile-invoice
Llama Index Workflow Template
# Data Extraction and Ingestion
This is a starter for LlamaAgents. See the [LlamaAgents (llamactl) getting started guide](https://developers.llamaindex.ai/python/llamaagents/llamactl/getting-started/) for context on local development and deployment.
To run the application, install [`uv`](https://docs.astral.sh/uv/) and run `uvx llamactl serve`.
## Simple customizations
For some basic customizations, you can modify `src/extraction_review/config.py`
- **`USE_REMOTE_EXTRACTION_SCHEMA`**: Set to `False` to define your own Pydantic `ExtractionSchema` in this file. Set to `True` to reuse the schema from an existing LlamaCloud Extraction Agent.
- **`EXTRACTION_AGENT_NAME`**: Logical name for your Extraction Agent. When `USE_REMOTE_EXTRACTION_SCHEMA` is `False`, this name is used to upsert the agent with your local schema; when `True`, it is used to fetch an existing agent.
- **`EXTRACTED_DATA_COLLECTION`**: The Agent Data collection name used to store extractions (namespaced by agent name and environment).
- **`ExtractionSchema`**: When using a local schema, edit this Pydantic model to match the fields you want extracted. Prefer optional types where possible to allow for partial extractions.
The UI fetches the JSON Schema and collection name from the backend metadata workflow at runtime, and dynamically
generates an editing UI based on the schema.
## Complex customizations
For more complex customizations, you can edit the rest of the application. For example, you could
- Modify the existing file processing workflow to provide additional context for the extraction process
- Take further action based on the extracted data.
- Add additional workflows to submit data upon approval.
## Linting and type checking
Python and JavaScript packages contain helpful scripts to lint, format, and type check the code.
To check and fix python code:
```bash
uv run hatch run lint
uv run hatch run typecheck
uv run hatch run test
# run all at once
uv run hatch run all-fix
```
To check and fix javascript code, within the `ui` directory:
```bash
# `lint` runs the TypeScript type check (tsc --noEmit)
pnpm run lint
pnpm run format
pnpm run build
# run all at once
pnpm run all-fix
```
+54
View File
@@ -0,0 +1,54 @@
[project]
name = "extraction-review"
version = "0.1.0"
description = "Extracts data"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"llama-cloud-services>=0.6.69",
"llama-index-workflows>=2.2.0,<3.0.0",
"python-dotenv>=1.1.0",
"jsonref>=1.1.0",
"click>=8.2.1,<8.3.0",
"httpx>=0.28.1",
"llama-index-core>=0.14.0",
"llama-index-llms-openai>=0.3.0",
]
[dependency-groups]
dev = [
"ruff>=0.11.10",
"typescript>=0.0.12",
"ty>=0.0.1a16",
"pytest>=8.4.1",
"hatch>=1.14.1",
"llamactl>=0.3.0"
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.envs.default.scripts]
"format" = "ruff format ."
"format-check" = "ruff format --check ."
"lint" = "ruff check --fix ."
"lint-check" = ["ruff check ."]
typecheck = "ty check src"
test = "pytest"
"all-check" = ["format-check", "lint-check", "test"]
"all-fix" = ["format", "lint", "test"]
[tool.llamadeploy]
env_files = [".env"]
llama_cloud = true
required_env_vars = ["OPENAI_API_KEY"]
[tool.llamadeploy.workflows]
process-file = "extraction_review.process_file:workflow"
metadata = "extraction_review.metadata_workflow:workflow"
index-contract = "extraction_review.index_contract:workflow"
[tool.llamadeploy.ui]
directory = "ui"
View File
+92
View File
@@ -0,0 +1,92 @@
import functools
import logging
import os
import httpx
from llama_cloud.client import AsyncLlamaCloud
from llama_cloud.core.api_error import ApiError
from llama_cloud_services import ExtractionAgent, LlamaExtract, LlamaCloudIndex
from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData
from llama_index.llms.openai import OpenAI
from extraction_review.config import (
CONTRACTS_INDEX_NAME,
EXTRACTED_DATA_COLLECTION,
EXTRACT_CONFIG,
EXTRACTION_AGENT_NAME,
InvoiceExtractionSchema,
InvoiceWithReconciliation,
)
logger = logging.getLogger(__name__)
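# Deployed agents infer their name from the deployment name.
# Note: make sure that an agent deployment with this name actually exists,
# otherwise calls to get or set data will fail. No fallback is applied here, so
# the name is None when the variable is unset (e.g. local development); append an
# `or "<name>"` fallback below if you need a specific name while developing.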
agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME")
# required for all llama cloud calls
api_key = os.environ["LLAMA_CLOUD_API_KEY"]
# get this in case running against a different environment than production
base_url = os.getenv("LLAMA_CLOUD_BASE_URL")
project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID")
@functools.cache
def get_extract_agent() -> ExtractionAgent:
    """Return the extraction agent named ``EXTRACTION_AGENT_NAME``, creating it if absent.

    An existing agent has the local ``InvoiceExtractionSchema`` and
    ``EXTRACT_CONFIG`` assigned onto it before being returned. If the lookup
    fails with a 404 ``ApiError`` a new agent is created from those same values;
    any other ``ApiError`` is re-raised. The result is cached, so the lookup
    happens at most once per process.
    """
    llama_extract = LlamaExtract(
        api_key=api_key, base_url=base_url, project_id=project_id
    )
    try:
        agent = llama_extract.get_agent(EXTRACTION_AGENT_NAME)
        agent.data_schema = InvoiceExtractionSchema
        agent.config = EXTRACT_CONFIG
        return agent
    except ApiError as err:
        # Only "not found" means we should create the agent; everything else is a real failure
        if err.status_code != 404:
            raise
        return llama_extract.create_agent(
            name=EXTRACTION_AGENT_NAME,
            data_schema=InvoiceExtractionSchema,
            config=EXTRACT_CONFIG,
        )
@functools.cache
def get_data_client() -> AsyncAgentDataClient[ExtractedData[InvoiceWithReconciliation]]:
    """Return the cached agent-data client for reconciled invoice records.

    The client is bound to the ``EXTRACTED_DATA_COLLECTION`` collection, the
    deployment name read from the environment, and the shared LlamaCloud client.
    """
    record_type = ExtractedData[InvoiceWithReconciliation]
    return AsyncAgentDataClient(
        client=get_llama_cloud_client(),
        type=record_type,
        collection=EXTRACTED_DATA_COLLECTION,
        deployment_name=agent_name,
    )
@functools.cache
def get_llama_cloud_client():
    """Return the cached async LlamaCloud API client.

    The underlying HTTP client uses a 60 second timeout and sends a
    ``Project-Id`` header only when a project ID is configured.
    """
    project_headers = {"Project-Id": project_id} if project_id else None
    http_client = httpx.AsyncClient(timeout=60, headers=project_headers)
    return AsyncLlamaCloud(
        base_url=base_url,
        token=api_key,
        httpx_client=http_client,
    )
@functools.cache
def get_contracts_index() -> LlamaCloudIndex:
    """Return the cached ``LlamaCloudIndex`` handle named ``CONTRACTS_INDEX_NAME``.

    The handle is built with the module-level API key, base URL and project ID.
    """
    index_settings = {
        "name": CONTRACTS_INDEX_NAME,
        "project_id": project_id,
        "api_key": api_key,
        "base_url": base_url,
    }
    return LlamaCloudIndex(**index_settings)
@functools.cache
def get_llm() -> OpenAI:
    """Return the cached OpenAI LLM used for structured predictions."""
    llm = OpenAI(model="gpt-5-mini", temperature=0)
    return llm
+130
View File
@@ -0,0 +1,130 @@
"""
For simple configuration of the extraction review application, just customize this file.
If you need more control, feel free to edit the rest of the application
"""
from __future__ import annotations
import os
from llama_cloud import ExtractConfig
from llama_cloud_services.extract import ExtractMode
from pydantic import BaseModel, Field
# The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments.
# Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist.
EXTRACTION_AGENT_NAME: str = (
os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "invoice-reconciliation"
)
# The name of the collection to use for storing extracted data. This will be qualified by the agent name.
# When developing locally, this will use the _public collection (shared within the project), otherwise agent
# data is isolated to each agent
EXTRACTED_DATA_COLLECTION: str = "invoices"
# The name of the LlamaCloud index for storing contracts
CONTRACTS_INDEX_NAME: str = "contracts"
# Invoice extraction schema - extracted from invoice documents
class LineItem(BaseModel):
    # One itemized row of an invoice. Every field is optional so that a partially
    # readable row can still be captured.
    description: str | None = Field(None, description="Description of the line item")
    quantity: float | None = Field(None, description="Quantity of the item")
    unit_price: float | None = Field(None, description="Price per unit of the item")
    total: float | None = Field(None, description="Total price for this line item")
class InvoiceExtractionSchema(BaseModel):
    """Schema for extracting invoice data"""

    # --- identification ---
    invoice_number: str | None = Field(None, description="Invoice number or identifier")
    invoice_date: str | None = Field(
        None, description="Date of the invoice (YYYY-MM-DD format if possible)"
    )

    # --- vendor ---
    vendor_name: str | None = Field(None, description="Name of the vendor or supplier")
    vendor_address: str | None = Field(None, description="Address of the vendor")

    # --- commercial terms ---
    purchase_order_number: str | None = Field(
        None, description="Purchase order (PO) number if present"
    )
    payment_terms: str | None = Field(
        None, description="Payment terms (e.g., Net 30, Net 60, Due on receipt)"
    )

    # --- amounts ---
    line_items: list[LineItem] | None = Field(
        None, description="List of line items on the invoice"
    )
    subtotal: float | None = Field(
        None, description="Subtotal before tax and other charges"
    )
    tax: float | None = Field(None, description="Tax amount")
    total: float | None = Field(None, description="Total amount due on the invoice")


# Alias retained for backward compatibility with code that imports `ExtractionSchema`
ExtractionSchema = InvoiceExtractionSchema
# Reconciliation schema - extends invoice data with contract matching and discrepancy information
class Discrepancy(BaseModel):
    """Represents a single discrepancy between invoice and contract"""

    # `field` is the only required attribute; the rest default to None
    field: str = Field(description="Field name where discrepancy was found")
    invoice_value: str | None = Field(None, description="Value from the invoice")
    contract_value: str | None = Field(
        None, description="Expected value from the contract"
    )
    severity: str | None = Field(
        None, description="Severity of the discrepancy (e.g., 'high', 'medium', 'low')"
    )
    note: str | None = Field(None, description="Additional notes about the discrepancy")
class InvoiceWithReconciliation(InvoiceExtractionSchema):
    """Invoice data with reconciliation information"""

    # --- which contract was matched ---
    matched_contract_id: str | None = Field(
        None, description="ID of the matched contract file in LlamaCloud"
    )
    matched_contract_name: str | None = Field(
        None, description="Name of the matched contract file"
    )

    # --- how good the match is, and why ---
    match_confidence: str | None = Field(
        None,
        description="Confidence level of the match (e.g., 'high', 'medium', 'low', 'none')",
    )
    match_rationale: str | None = Field(
        None, description="Explanation of why this contract was matched"
    )

    # --- differences between the invoice and the matched contract ---
    discrepancies: list[Discrepancy] | None = Field(
        None, description="List of discrepancies found between invoice and contract"
    )
# Extraction settings applied to the extraction agent.
EXTRACT_CONFIG = ExtractConfig(
    system_prompt=None,
    extraction_mode=ExtractMode.PREMIUM,
    # Advanced options below are only compatible with Premium mode.
    confidence_scores=True,
    cite_sources=False,
    use_reasoning=False,
)
+161
View File
@@ -0,0 +1,161 @@
"""
Workflow for indexing contract documents into LlamaCloud Index for retrieval.
"""
import logging
import os
import tempfile
from pathlib import Path
from typing import Literal
import httpx
from llama_index.core import Document
from pydantic import BaseModel
from workflows import Context, Workflow, step
from workflows.events import Event, StartEvent, StopEvent
from .clients import get_contracts_index, get_llama_cloud_client
logger = logging.getLogger(__name__)
class ContractFileEvent(StartEvent):
    """Event to start contract indexing with a file ID"""

    # ID passed to the LlamaCloud files API when the contract is downloaded
    file_id: str
class DownloadContractEvent(Event):
    """Event to trigger contract download"""

    # Carries no payload; the file ID is read from workflow state
class ContractDownloadedEvent(Event):
    """Event indicating contract has been downloaded"""

    # Carries no payload; the local path and filename are read from workflow state
class UIToast(Event):
    """Event to show toast notifications in the UI"""

    # Severity of the notification
    level: Literal["info", "warning", "error"]
    # Text of the notification
    message: str
class ContractIndexState(BaseModel):
    """State for contract indexing workflow"""

    # Set by the first step from the start event
    file_id: str | None = None
    # Both set by the download step once the file is on local disk
    file_path: str | None = None
    filename: str | None = None
class IndexContractWorkflow(Workflow):
    """
    Workflow to download and index a contract document into LlamaCloud Index.

    Steps: record the file ID in state, download the file to the local temp
    directory, then insert its text into the contracts index.
    """

    @step()
    async def start_indexing(
        self, event: ContractFileEvent, ctx: Context[ContractIndexState]
    ) -> DownloadContractEvent:
        """Initialize the workflow with the file ID"""
        logger.info(f"Starting contract indexing for file {event.file_id}")
        async with ctx.store.edit_state() as state:
            state.file_id = event.file_id
        return DownloadContractEvent()

    @step()
    async def download_contract(
        self, event: DownloadContractEvent, ctx: Context[ContractIndexState]
    ) -> ContractDownloadedEvent:
        """Download the contract file from LlamaCloud storage.

        Raises:
            ValueError: if no file ID has been stored in state.
            httpx.HTTPStatusError: if the download URL returns an error status.
        """
        state = await ctx.store.get_state()
        if state.file_id is None:
            raise ValueError("File ID is not set")

        cloud = get_llama_cloud_client()
        file_metadata = await cloud.files.get_file(id=state.file_id)
        file_url = await cloud.files.read_file_content(state.file_id)

        filename = file_metadata.name
        # Use only the last path component for the local file so that a name
        # containing directory separators cannot write outside the temp directory.
        local_name = os.path.basename(filename) or state.file_id
        file_path = os.path.join(tempfile.gettempdir(), local_name)

        logger.info(f"Downloading contract {filename} from {file_url.url}")
        ctx.write_event_to_stream(
            UIToast(level="info", message=f"Downloading contract: {filename}")
        )

        # The context manager closes the HTTP client even if the download fails
        async with httpx.AsyncClient() as client:
            async with client.stream("GET", file_url.url) as response:
                # Fail before writing so an error page is never saved as the contract
                response.raise_for_status()
                with open(file_path, "wb") as f:
                    async for chunk in response.aiter_bytes():
                        f.write(chunk)

        logger.info(f"Downloaded contract to {file_path}")
        async with ctx.store.edit_state() as state:
            state.file_path = file_path
            state.filename = filename
        return ContractDownloadedEvent()

    @step()
    async def index_contract(
        self, event: ContractDownloadedEvent, ctx: Context[ContractIndexState]
    ) -> StopEvent:
        """Index the contract document into LlamaCloud Index.

        Returns a ``StopEvent`` whose result holds the ``file_id`` and ``filename``.

        Raises:
            ValueError: if the download step has not stored a path and filename.
        """
        state = await ctx.store.get_state()
        if state.file_path is None or state.filename is None:
            raise ValueError("File path or filename is not set")

        logger.info(f"Indexing contract {state.filename}")
        ctx.write_event_to_stream(
            UIToast(level="info", message=f"Indexing contract: {state.filename}")
        )

        # Create a document with metadata.
        # NOTE(review): the file is decoded as text with undecodable bytes dropped.
        # For binary formats such as PDF this will not produce readable text —
        # confirm whether the file should instead be parsed before insertion.
        file_content = Path(state.file_path).read_text(errors="ignore")
        document = Document(
            text=file_content,
            metadata={
                "filename": state.filename,
                "file_id": state.file_id,
                "document_type": "contract",
            },
        )

        # Get the contracts index and insert the document
        index = get_contracts_index()
        await index.ainsert(document)

        logger.info(f"Successfully indexed contract {state.filename}")
        ctx.write_event_to_stream(
            UIToast(
                level="info",
                message=f"Successfully indexed contract: {state.filename}",
            )
        )
        return StopEvent(result={"file_id": state.file_id, "filename": state.filename})
workflow = IndexContractWorkflow(timeout=None)
if __name__ == "__main__":
    import asyncio

    from dotenv import load_dotenv

    load_dotenv()
    logging.basicConfig(level=logging.INFO)

    async def main():
        # Example usage - upload a contract and index it.
        # The `with` block closes the local file handle once the upload finishes.
        with Path("sample_contract.pdf").open("rb") as contract_file:
            file = await get_llama_cloud_client().files.upload_file(
                upload_file=contract_file
            )
        result = await workflow.run(start_event=ContractFileEvent(file_id=file.id))
        print(f"Indexed contract: {result}")

    asyncio.run(main())
@@ -0,0 +1,30 @@
from typing import Any
from workflows import Workflow, step
from workflows.events import StartEvent, StopEvent
from extraction_review.schema import get_extraction_schema_json
from .config import EXTRACTED_DATA_COLLECTION
class MetadataResponse(StopEvent):
    # JSON schema returned by get_extraction_schema_json()
    json_schema: dict[str, Any]
    # Value of EXTRACTED_DATA_COLLECTION
    extracted_data_collection: str
class MetadataWorkflow(Workflow):
    """
    Single-step workflow that exposes configuration to the UI: the extraction
    JSON schema and the name of the extracted-data collection.
    """

    @step
    async def get_metadata(self, _: StartEvent) -> MetadataResponse:
        """Build the metadata response; the start event itself is not inspected."""
        return MetadataResponse(
            extracted_data_collection=EXTRACTED_DATA_COLLECTION,
            json_schema=await get_extraction_schema_json(),
        )
workflow = MetadataWorkflow(timeout=None)
+439
View File
@@ -0,0 +1,439 @@
import asyncio
import hashlib
import logging
import os
import tempfile
from pathlib import Path
from typing import Any, Literal
import httpx
from llama_cloud import ExtractRun
from llama_cloud_services.beta.agent_data import ExtractedData, InvalidExtractionData
from llama_cloud_services.extract import SourceText
from llama_index.core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from workflows import Context, Workflow, step
from workflows.events import Event, StartEvent, StopEvent
from .clients import (
get_contracts_index,
get_data_client,
get_extract_agent,
get_llama_cloud_client,
get_llm,
)
from .config import Discrepancy, InvoiceExtractionSchema, InvoiceWithReconciliation
logger = logging.getLogger(__name__)
class FileEvent(StartEvent):
    # Start event: ID passed to the LlamaCloud files API when the file is downloaded
    file_id: str
class DownloadFileEvent(Event):
    # Payload-free signal that triggers the download step
    ...
class FileDownloadedEvent(Event):
    # Payload-free signal that the file is on local disk; triggers extraction
    ...
class UIToast(Event):
    # Notification written to the workflow event stream
    # Severity of the notification
    level: Literal["info", "warning", "error"]
    # Text of the notification
    message: str
class ExtractedEvent(Event):
    """Event when invoice data is successfully extracted"""

    # Validated invoice fields
    invoice_data: InvoiceExtractionSchema
    # The "field_metadata" entry taken from the extraction run's metadata
    field_metadata: dict[str, Any]
class ExtractedInvalidEvent(Event):
    """Event when extraction validation fails"""

    # The invalid item carried by the InvalidExtractionData error
    data: ExtractedData[dict[str, Any]]
class ReconciledEvent(Event):
    """Event when invoice is reconciled with contracts"""

    # Invoice plus reconciliation fields, wrapped for storage
    data: ExtractedData[InvoiceWithReconciliation]
class ExtractionState(BaseModel):
    # Workflow state shared between steps.
    # Set by the first step from the start event
    file_id: str | None = None
    # Both set by the download step once the file is on local disk
    file_path: str | None = None
    filename: str | None = None
class ProcessFileWorkflow(Workflow):
    """
    Given a LlamaCloud file ID, this workflow downloads the file, extracts invoice
    data from it, reconciles the invoice against indexed contracts, and records
    the result in the agent data collection.
    """

    @step()
    async def run_file(
        self, event: FileEvent, ctx: Context[ExtractionState]
    ) -> DownloadFileEvent:
        """Store the incoming file ID in workflow state and trigger the download."""
        logger.info(f"Running file {event.file_id}")
        async with ctx.store.edit_state() as state:
            state.file_id = event.file_id
        return DownloadFileEvent()

    @step()
    async def download_file(
        self, event: DownloadFileEvent, ctx: Context[ExtractionState]
    ) -> FileDownloadedEvent:
        """Download the file reference from the cloud storage.

        Any failure is logged, reported to the UI as an error toast, and re-raised.

        Raises:
            ValueError: if no file ID has been stored in state.
        """
        state = await ctx.store.get_state()
        if state.file_id is None:
            raise ValueError("File ID is not set")

        try:
            cloud = get_llama_cloud_client()
            file_metadata = await cloud.files.get_file(id=state.file_id)
            file_url = await cloud.files.read_file_content(state.file_id)

            filename = file_metadata.name
            # Use only the last path component for the local file so that a name
            # containing directory separators cannot write outside the temp directory.
            local_name = os.path.basename(filename) or state.file_id
            file_path = os.path.join(tempfile.gettempdir(), local_name)

            logger.info(f"Downloading file {file_url.url} to {file_path}")
            # The context manager closes the HTTP client even if the download fails
            async with httpx.AsyncClient() as client:
                async with client.stream("GET", file_url.url) as response:
                    # Fail before writing so an error page is never saved as the file
                    response.raise_for_status()
                    with open(file_path, "wb") as f:
                        async for chunk in response.aiter_bytes():
                            f.write(chunk)
            logger.info(f"Downloaded file {file_url.url} to {file_path}")

            async with ctx.store.edit_state() as state:
                state.file_path = file_path
                state.filename = filename
            return FileDownloadedEvent()
        except Exception as e:
            logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True)
            ctx.write_event_to_stream(
                UIToast(
                    level="error",
                    message=f"Error downloading file {state.file_id}: {e}",
                )
            )
            raise

    @step()
    async def process_file(
        self, event: FileDownloadedEvent, ctx: Context[ExtractionState]
    ) -> ExtractedEvent | ExtractedInvalidEvent:
        """Runs the extraction against the file.

        Returns an ``ExtractedEvent`` with the validated invoice and its field
        metadata. Other failures are logged, reported as an error toast, and
        re-raised.

        Raises:
            ValueError: if the download step has not stored a path and filename.
        """
        state = await ctx.store.get_state()
        if state.file_path is None or state.filename is None:
            raise ValueError("File path or filename is not set")

        try:
            agent = get_extract_agent()
            source_text = SourceText(
                file=state.file_path,
                filename=state.filename,
            )
            logger.info(f"Extracting data from file {state.filename}")
            ctx.write_event_to_stream(
                UIToast(
                    level="info", message=f"Extracting data from file {state.filename}"
                )
            )
            extracted_result: ExtractRun = await agent.aextract(source_text)

            # Validate the extracted data
            if not extracted_result.data:
                raise ValueError("No data extracted from invoice")
            invoice_data = InvoiceExtractionSchema.model_validate(extracted_result.data)
            logger.info(f"Extracted invoice data: {invoice_data}")

            # Keep only the field_metadata, not the entire ExtractRun object.
            # Tolerate a run that carries no extraction metadata at all.
            field_metadata = (extracted_result.extraction_metadata or {}).get(
                "field_metadata", {}
            )
            return ExtractedEvent(
                invoice_data=invoice_data, field_metadata=field_metadata
            )
        except InvalidExtractionData as e:
            # NOTE(review): nothing in this try block obviously raises
            # InvalidExtractionData — confirm this branch is still reachable.
            logger.error(f"Error validating extracted data: {e}", exc_info=True)
            return ExtractedInvalidEvent(data=e.invalid_item)
        except Exception as e:
            logger.error(
                f"Error extracting data from file {state.filename}: {e}",
                exc_info=True,
            )
            ctx.write_event_to_stream(
                UIToast(
                    level="error",
                    message=f"Error extracting data from file {state.filename}: {e}",
                )
            )
            raise

    @step()
    async def reconcile_with_contract(
        self, event: ExtractedEvent, ctx: Context[ExtractionState]
    ) -> ReconciledEvent:
        """Reconcile the invoice with matching contracts using retrieval and LLM.

        A failure during retrieval or matching does not fail the workflow: the
        invoice is still emitted, with ``match_confidence="error"`` and the error
        text as the rationale.

        Raises:
            ValueError: if the download step has not stored a file path.
        """
        state = await ctx.store.get_state()
        if state.file_path is None:
            raise ValueError("File path is not set")
        invoice_data = event.invoice_data

        logger.info("Reconciling invoice with contracts")
        ctx.write_event_to_stream(
            UIToast(level="info", message="Matching invoice with contracts...")
        )

        try:
            # Build a query from invoice data for contract retrieval
            query_parts = []
            if invoice_data.vendor_name:
                query_parts.append(f"vendor: {invoice_data.vendor_name}")
            if invoice_data.purchase_order_number:
                query_parts.append(f"PO: {invoice_data.purchase_order_number}")
            if invoice_data.invoice_number:
                query_parts.append(f"invoice: {invoice_data.invoice_number}")
            query = " ".join(query_parts) if query_parts else "contract agreement"

            # Retrieve relevant contracts
            index = get_contracts_index()
            retriever = index.as_retriever(similarity_top_k=3)
            retrieved_nodes = await retriever.aretrieve(query)

            if not retrieved_nodes:
                logger.info("No contracts found in index")
                # No contracts available - create reconciliation data with no match
                reconciled_data = InvoiceWithReconciliation(
                    **invoice_data.model_dump(),
                    match_confidence="none",
                    match_rationale="No contracts found in the system",
                    discrepancies=[],
                )
            else:
                # Use LLM to match and reconcile
                reconciled_data = await self._match_and_reconcile(
                    invoice_data, retrieved_nodes
                )
            logger.info(f"Reconciliation complete: {reconciled_data.match_confidence}")
        except Exception as e:
            logger.error(f"Error during reconciliation: {e}", exc_info=True)
            # If reconciliation fails, still create data without reconciliation
            reconciled_data = InvoiceWithReconciliation(
                **invoice_data.model_dump(),
                match_confidence="error",
                match_rationale=f"Error during reconciliation: {str(e)}",
                discrepancies=[],
            )

        # Wrap the result for storage; built once for both the success and error paths.
        # The content hash lets the record step find earlier runs of the same file.
        file_hash = hashlib.sha256(Path(state.file_path).read_bytes()).hexdigest()
        extracted_data = ExtractedData.create(
            data=reconciled_data,
            file_id=state.file_id,
            file_name=state.filename,
            file_hash=file_hash,
            field_metadata=event.field_metadata,
        )
        return ReconciledEvent(data=extracted_data)

    async def _match_and_reconcile(
        self, invoice_data: InvoiceExtractionSchema, retrieved_nodes: list
    ) -> InvoiceWithReconciliation:
        """Use LLM to match invoice with contract and identify discrepancies.

        Args:
            invoice_data: the extracted invoice.
            retrieved_nodes: candidate contract nodes; each must expose ``text``
                and ``metadata``.

        Returns:
            The invoice extended with the match outcome. The contract ID and name
            are filled only when the LLM reports a match with a valid index.
        """

        # Define structured output schema for LLM
        class ContractMatchResult(BaseModel):
            """Result of matching invoice to contract"""

            is_match: bool = Field(
                description="Whether a plausible contract match was found"
            )
            matched_contract_index: int | None = Field(
                default=None,
                description="Index (0-based) of the matched contract in the provided list, or None if no match",
            )
            match_confidence: str = Field(
                description="Confidence level: 'high', 'medium', 'low', or 'none'"
            )
            match_rationale: str = Field(
                description="Explanation of why this contract was or was not matched"
            )
            contract_payment_terms: str | None = Field(
                default=None, description="Payment terms found in the matched contract"
            )
            discrepancies: list[Discrepancy] = Field(
                default_factory=list,
                description="List of discrepancies found between invoice and contract",
            )

        # Prepare contract context; each contract is truncated to its first 1000 characters
        contracts_text = "\n\n".join(
            [
                f"Contract {i} (File: {node.metadata.get('filename', 'Unknown')}):\n{node.text[:1000]}"
                for i, node in enumerate(retrieved_nodes)
            ]
        )

        # Create prompt for matching
        prompt_template = PromptTemplate(
            """You are analyzing an invoice to match it with the correct contract and identify any discrepancies.
Invoice Details:
- Vendor: {vendor_name}
- Invoice Number: {invoice_number}
- Invoice Date: {invoice_date}
- PO Number: {po_number}
- Payment Terms: {payment_terms}
- Total: {total}
Retrieved Contracts:
{contracts_text}
Task:
1. Determine if any of the retrieved contracts plausibly matches this invoice based on:
- Vendor name matching or similarity
- PO number or invoice number references
- Date ranges or validity periods
- Any other relevant identifiers
2. If a match is found, identify discrepancies between invoice and contract, focusing on:
- Payment terms differences (CRITICAL)
- Total amount mismatches if contract specifies amounts
- Vendor name discrepancies
- Any other obvious conflicts
3. Assess match confidence:
- 'high': Clear match with strong vendor/PO/identifier alignment
- 'medium': Probable match with some uncertainty
- 'low': Weak match, possibly relevant but uncertain
- 'none': No plausible match found
Provide your analysis in the specified format."""
        )

        # Use LLM with structured prediction
        llm = get_llm()
        result = await llm.astructured_predict(
            ContractMatchResult,
            prompt_template,
            **{
                "vendor_name": invoice_data.vendor_name or "N/A",
                "invoice_number": invoice_data.invoice_number or "N/A",
                "invoice_date": invoice_data.invoice_date or "N/A",
                "po_number": invoice_data.purchase_order_number or "N/A",
                "payment_terms": invoice_data.payment_terms or "N/A",
                # Compare against None so a legitimate total of 0 is not shown as "N/A"
                "total": invoice_data.total if invoice_data.total is not None else "N/A",
                "contracts_text": contracts_text,
            },
        )

        # Build reconciled invoice data
        matched_contract_id = None
        matched_contract_name = None
        contract_index = result.matched_contract_index
        if result.is_match and contract_index is not None:
            # The index comes from the LLM, so validate it: an out-of-range value
            # would raise, and a negative one would silently pick the wrong contract.
            if 0 <= contract_index < len(retrieved_nodes):
                matched_node = retrieved_nodes[contract_index]
                matched_contract_id = matched_node.metadata.get("file_id")
                matched_contract_name = matched_node.metadata.get("filename")
            else:
                logger.warning(
                    f"LLM returned contract index {contract_index} outside the "
                    f"{len(retrieved_nodes)} retrieved contracts; leaving match unset"
                )

        return InvoiceWithReconciliation(
            **invoice_data.model_dump(),
            matched_contract_id=matched_contract_id,
            matched_contract_name=matched_contract_name,
            match_confidence=result.match_confidence,
            match_rationale=result.match_rationale,
            discrepancies=result.discrepancies,
        )

    @step()
    async def record_extracted_data(
        self, event: ReconciledEvent | ExtractedInvalidEvent, ctx: Context[ExtractionState]
    ) -> StopEvent:
        """Records the extracted data to the agent data API.

        Existing items with the same file hash are deleted first, so reprocessing
        a file replaces its earlier record. Returns a ``StopEvent`` whose result
        is the ID of the created item. Failures are logged, reported as an error
        toast, and re-raised.
        """
        try:
            # remove past data when reprocessing the same file
            if event.data.file_hash:
                existing_data = await get_data_client().untyped_search(
                    filter={
                        "file_hash": {
                            "eq": event.data.file_hash,
                        },
                    },
                )
                if existing_data.items:
                    logger.info(
                        f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}"
                    )
                    await asyncio.gather(
                        *[
                            get_data_client().delete_item(item.id)
                            for item in existing_data.items
                        ]
                    )

            # finally, save the new data
            item_id = await get_data_client().create_item(event.data)

            # Announce success only after the item has actually been saved
            logger.info(f"Recorded extracted data for file {event.data.file_name}")
            ctx.write_event_to_stream(
                UIToast(
                    level="info",
                    message=f"Recorded extracted data for file {event.data.file_name}",
                )
            )
            return StopEvent(
                result=item_id.id,
            )
        except Exception as e:
            logger.error(
                f"Error recording extracted data for file {event.data.file_name}: {e}",
                exc_info=True,
            )
            ctx.write_event_to_stream(
                UIToast(
                    level="error",
                    message=f"Error recording extracted data for file {event.data.file_name}: {e}",
                )
            )
            raise
workflow = ProcessFileWorkflow(timeout=None)
if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()
    logging.basicConfig(level=logging.INFO)

    async def main():
        # Upload a local sample file, then run the full workflow against it.
        # The `with` block closes the local file handle once the upload finishes.
        with Path("test.pdf").open("rb") as invoice_file:
            file = await get_llama_cloud_client().files.upload_file(
                upload_file=invoice_file
            )
        await workflow.run(start_event=FileEvent(file_id=file.id))

    asyncio.run(main())
+32
View File
@@ -0,0 +1,32 @@
from typing import Any, Type
import jsonref
from pydantic import BaseModel, Field, create_model
from extraction_review.config import InvoiceWithReconciliation
async def get_extraction_schema_json() -> dict[str, Any]:
    """Return the JSON schema of ``InvoiceWithReconciliation`` with references replaced.

    The schema is passed through ``jsonref.replace_refs`` with ``proxies=False``.
    """
    raw_schema = InvoiceWithReconciliation.model_json_schema()
    return jsonref.replace_refs(raw_schema, proxies=False)
def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]:
    """
    Build a Pydantic model from the top-level properties of a JSON schema.

    Each property's ``type`` is mapped to a plain Python type (``Any`` when the
    type is missing or unrecognized). Properties listed under ``required`` become
    required fields; all others default to ``None``. The model is named after the
    schema's ``title``, or ``DynamicModel`` when there is none.
    """
    json_to_python = {
        "string": str,
        "integer": int,
        "number": float,
        "boolean": bool,
        "array": list,
        "object": dict,
    }
    required_props = schema.get("required", [])
    field_defs = {
        name: (
            json_to_python.get(spec.get("type"), Any),
            Field(
                ... if name in required_props else None,
                description=spec.get("description"),
            ),
        )
        for name, spec in schema.get("properties", {}).items()
    }
    return create_model(schema.get("title", "DynamicModel"), **field_defs)
+24
View File
@@ -0,0 +1,24 @@
We are building an invoice extraction and reconciliation workflow app.
Invoices are parsed into structured data, then compared against indexed contracts to reconcile the invoice with its matching contract. Update the invoice record with contract-derived information and any discrepancies.
Using the UI, the user should be able to:
- add and index new contracts
- add and reconcile new invoices
This should be based off of the base extraction review template, which has 2 pages, one that displays a table of all extracted items (one row per invoice), and one for the item details (the extracted data for one invoice, e.g. total and line items). The items and details view should show the invoices.
Contracts can remain largely invisible in the UI for now, but there should be a minimal way to add them. These should be placed into a LlamaCloud index (which parses PDFs to plain text for retrieval).
The stored schema should extend the extracted invoice schema with reconciliation fields, such as links to the matched contract, a match confidence/score, and a structured list of discrepancies.
Matching should retrieve candidate contracts and then use an LLM, with context for both the candidate contracts and the invoice data, to make the final selection and provide rationale. When no contract matches, record that outcome clearly.
When matching and reconciling, consider:
- Whether there is any plausible matching contract versus only irrelevant results (e.g., vendor name, contract dates/ranges, contract or PO numbers).
- Whether payment terms are matching (at minimum).
- Optionally, check other obvious alignments if cheaply available (e.g., totals, vendor identifiers).
Represent reconciliation results in the details view with a clear, structured list of discrepancies (e.g., field, invoice_value, contract_value, optional note/severity).
The vast majority of this change should be kept in the python codebase. Some minor changes may need to be added to the UI, however do not do anything complex, just a button or small widget.
+2
View File
@@ -0,0 +1,2 @@
def test_placeholder():
    """No-op placeholder test; it asserts nothing."""
    return None
+43
View File
@@ -0,0 +1,43 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/versions
# testing
/coverage
# next.js
/.next/
/out/
/dist/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# env files (can opt-in for committing if needed)
.env*
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts
+7
View File
@@ -0,0 +1,7 @@
# Data Extraction UI
This is a simple Vite + React template that builds on the `@llamaindex/ui` component library
for displaying tables of extracted data.
Ideally, run this with `llamactl` from the parent directory (see [README.md](../README.md)).
You can also run it standalone with `npm run dev`, but workflow integrations will not work.
+21
View File
@@ -0,0 +1,21 @@
{
"$schema": "https://ui.shadcn.com/schema.json",
"style": "new-york",
"rsc": true,
"tsx": true,
"tailwind": {
"config": "",
"css": "src/index.css",
"baseColor": "zinc",
"cssVariables": true,
"prefix": ""
},
"aliases": {
"components": "@/components",
"utils": "@/lib/utils",
"ui": "@/components/ui",
"lib": "@/lib",
"hooks": "@/hooks"
},
"iconLibrary": "lucide"
}
+12
View File
@@ -0,0 +1,12 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Review</title>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>
+45
View File
@@ -0,0 +1,45 @@
{
"name": "extraction-review-ui",
"version": "0.1.0",
"private": true,
"type": "module",
"scripts": {
"dev": "vite",
"build": "tsc && vite build",
"preview": "vite preview",
"lint": "tsc --noEmit",
"format": "prettier --write src",
"format-check": "prettier --check src",
"all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build",
"all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build"
},
"dependencies": {
"@babel/runtime": "^7.27.6",
"@lezer/highlight": "^1.2.1",
"@llamaindex/ui": "^2.1.2",
"@radix-ui/themes": "^3.2.1",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"llama-cloud-services": "^0.3.4",
"lucide-react": "^0.514.0",
"react": "^18.3.0",
"react-dom": "^18.3.0",
"react-router-dom": "^6.30.0",
"sonner": "^2.0.5",
"tw-animate-css": "^1.3.5"
},
"devDependencies": {
"@tailwindcss/postcss": "^4.1.10",
"@types/node": "^20",
"@types/react": "^19",
"@types/react-dom": "^19",
"@vitejs/plugin-react": "^4.3.4",
"postcss": "^8.5.5",
"prettier": "^3.6.2",
"tailwind-merge": "^3.3.1",
"tailwindcss": "^4.1.8",
"typescript": "^5",
"vite": "^6.0.5"
},
"packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912"
}
+7
View File
@@ -0,0 +1,7 @@
// PostCSS configuration: the only plugin is Tailwind's PostCSS integration,
// registered with its default (empty) options.
const plugins = {
  "@tailwindcss/postcss": {},
};

const config = { plugins };

export default config;
+70
View File
@@ -0,0 +1,70 @@
import React from "react";
import { Routes, Route } from "react-router-dom";
import { Theme } from "@radix-ui/themes";
import {
Breadcrumb,
BreadcrumbItem,
BreadcrumbList,
BreadcrumbSeparator,
} from "@llamaindex/ui";
import { Link } from "react-router-dom";
import { Toaster } from "@llamaindex/ui";
import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext";
import { MetadataProvider } from "@/lib/MetadataProvider";
// Import pages
import HomePage from "./pages/HomePage";
import ItemPage from "./pages/ItemPage";
/**
 * Application shell.
 *
 * Wraps everything in the theme, metadata and toolbar providers, then lays
 * out a two-row grid: the toolbar on top and the routed page below.
 */
export default function App() {
  // Route table: list page at "/", detail page at "/item/:itemId".
  const pageRoutes = (
    <Routes>
      <Route path="/" element={<HomePage />} />
      <Route path="/item/:itemId" element={<ItemPage />} />
    </Routes>
  );

  const layout = (
    <div className="grid grid-rows-[auto_1fr] h-screen">
      <Toolbar />
      <main className="overflow-auto">{pageRoutes}</main>
    </div>
  );

  return (
    <Theme>
      <MetadataProvider>
        <ToolbarProvider>
          {layout}
          <Toaster />
        </ToolbarProvider>
      </MetadataProvider>
    </Theme>
  );
}
/**
 * Sticky page header that renders the breadcrumb trail and the toolbar
 * buttons currently registered in the toolbar context.
 */
const Toolbar = () => {
  const toolbar = useToolbar();

  const crumbs = toolbar.breadcrumbs.map((crumb, position) => {
    let content: React.ReactNode;
    if (crumb.href && !crumb.isCurrentPage) {
      // Navigable crumb: has a target and is not the page being viewed.
      content = (
        <Link to={crumb.href} className="font-medium text-base">
          {crumb.label}
        </Link>
      );
    } else {
      // Plain-text crumb; only the first crumb gets the "text-base" size.
      content = (
        <span className={`font-medium ${position === 0 ? "text-base" : ""}`}>
          {crumb.label}
        </span>
      );
    }

    return (
      <React.Fragment key={position}>
        {position > 0 && <BreadcrumbSeparator />}
        <BreadcrumbItem>{content}</BreadcrumbItem>
      </React.Fragment>
    );
  });

  return (
    <header className="sticky top-0 z-50 flex h-16 shrink-0 items-center gap-2 border-b px-4 bg-white/95 backdrop-blur supports-[backdrop-filter]:bg-white/60">
      <Breadcrumb>
        <BreadcrumbList>{crumbs}</BreadcrumbList>
      </Breadcrumb>
      {toolbar.buttons}
    </header>
  );
};
+120
View File
@@ -0,0 +1,120 @@
@import "tailwindcss";
@import "tw-animate-css";
@custom-variant dark (&:is(.dark *));
@theme inline {
--radius-sm: calc(var(--radius) - 4px);
--radius-md: calc(var(--radius) - 2px);
--radius-lg: var(--radius);
--radius-xl: calc(var(--radius) + 4px);
--color-background: var(--background);
--color-foreground: var(--foreground);
--color-card: var(--card);
--color-card-foreground: var(--card-foreground);
--color-popover: var(--popover);
--color-popover-foreground: var(--popover-foreground);
--color-primary: var(--primary);
--color-primary-foreground: var(--primary-foreground);
--color-secondary: var(--secondary);
--color-secondary-foreground: var(--secondary-foreground);
--color-muted: var(--muted);
--color-muted-foreground: var(--muted-foreground);
--color-accent: var(--accent);
--color-accent-foreground: var(--accent-foreground);
--color-destructive: var(--destructive);
--color-border: var(--border);
--color-input: var(--input);
--color-ring: var(--ring);
--color-chart-1: var(--chart-1);
--color-chart-2: var(--chart-2);
--color-chart-3: var(--chart-3);
--color-chart-4: var(--chart-4);
--color-chart-5: var(--chart-5);
--color-sidebar: var(--sidebar);
--color-sidebar-foreground: var(--sidebar-foreground);
--color-sidebar-primary: var(--sidebar-primary);
--color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
--color-sidebar-accent: var(--sidebar-accent);
--color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
--color-sidebar-border: var(--sidebar-border);
--color-sidebar-ring: var(--sidebar-ring);
}
:root {
--radius: 0.625rem;
--card: oklch(1 0 0);
--card-foreground: oklch(0.141 0.005 285.823);
--popover: oklch(1 0 0);
--popover-foreground: oklch(0.141 0.005 285.823);
--primary: oklch(0.21 0.006 285.885);
--primary-foreground: oklch(0.985 0 0);
--secondary: oklch(0.967 0.001 286.375);
--secondary-foreground: oklch(0.21 0.006 285.885);
--muted: oklch(0.967 0.001 286.375);
--muted-foreground: oklch(0.552 0.016 285.938);
--accent: oklch(0.967 0.001 286.375);
--accent-foreground: oklch(0.21 0.006 285.885);
--destructive: oklch(0.577 0.245 27.325);
--border: oklch(0.92 0.004 286.32);
--input: oklch(0.92 0.004 286.32);
--ring: oklch(0.705 0.015 286.067);
--chart-1: oklch(0.646 0.222 41.116);
--chart-2: oklch(0.6 0.118 184.704);
--chart-3: oklch(0.398 0.07 227.392);
--chart-4: oklch(0.828 0.189 84.429);
--chart-5: oklch(0.769 0.188 70.08);
--sidebar: oklch(0.985 0 0);
--sidebar-foreground: oklch(0.141 0.005 285.823);
--sidebar-primary: oklch(0.21 0.006 285.885);
--sidebar-primary-foreground: oklch(0.985 0 0);
--sidebar-accent: oklch(0.967 0.001 286.375);
--sidebar-accent-foreground: oklch(0.21 0.006 285.885);
--sidebar-border: oklch(0.92 0.004 286.32);
--sidebar-ring: oklch(0.705 0.015 286.067);
--background: oklch(1 0 0);
--foreground: oklch(0.141 0.005 285.823);
}
.dark {
--background: oklch(0.141 0.005 285.823);
--foreground: oklch(0.985 0 0);
--card: oklch(0.21 0.006 285.885);
--card-foreground: oklch(0.985 0 0);
--popover: oklch(0.21 0.006 285.885);
--popover-foreground: oklch(0.985 0 0);
--primary: oklch(0.92 0.004 286.32);
--primary-foreground: oklch(0.21 0.006 285.885);
--secondary: oklch(0.274 0.006 286.033);
--secondary-foreground: oklch(0.985 0 0);
--muted: oklch(0.274 0.006 286.033);
--muted-foreground: oklch(0.705 0.015 286.067);
--accent: oklch(0.274 0.006 286.033);
--accent-foreground: oklch(0.985 0 0);
--destructive: oklch(0.704 0.191 22.216);
--border: oklch(1 0 0 / 10%);
--input: oklch(1 0 0 / 15%);
--ring: oklch(0.552 0.016 285.938);
--chart-1: oklch(0.488 0.243 264.376);
--chart-2: oklch(0.696 0.17 162.48);
--chart-3: oklch(0.769 0.188 70.08);
--chart-4: oklch(0.627 0.265 303.9);
--chart-5: oklch(0.645 0.246 16.439);
--sidebar: oklch(0.21 0.006 285.885);
--sidebar-foreground: oklch(0.985 0 0);
--sidebar-primary: oklch(0.488 0.243 264.376);
--sidebar-primary-foreground: oklch(0.985 0 0);
--sidebar-accent: oklch(0.274 0.006 286.033);
--sidebar-accent-foreground: oklch(0.985 0 0);
--sidebar-border: oklch(1 0 0 / 10%);
--sidebar-ring: oklch(0.552 0.016 285.938);
}
/* Base-layer defaults applied through Tailwind's @apply. */
@layer base {
/* Every element: border colour from the `border` token, outline from the `ring` token at 50% opacity. */
* {
@apply border-border outline-ring/50;
}
/* Page background and text colour come from the `background` / `foreground` tokens. */
body {
@apply bg-background text-foreground;
}
}
+71
View File
@@ -0,0 +1,71 @@
import { createContext, useContext, ReactNode, useMemo } from "react";
import { ApiProvider, ApiClients } from "@llamaindex/ui";
import { useMetadata, Metadata } from "./useMetadata";
import { createBaseWorkflowClient, createClients } from "./client";
import { Clock, XCircle } from "lucide-react";
/** Value exposed through MetadataContext once the metadata has been loaded. */
interface MetadataContextValue {
// Metadata returned by the useMetadata hook.
metadata: Metadata;
// API clients built from that metadata via createClients.
clients: ApiClients;
}
// Defaults to null so useMetadataContext can detect use outside of a provider.
const MetadataContext = createContext<MetadataContextValue | null>(null);
/**
 * Outer provider: installs an ApiProvider that only has a workflows client,
 * then delegates to InnerMetadataProvider to render the children.
 */
export function MetadataProvider({ children }: { children: ReactNode }) {
  // Created once per mount; the dependency list is intentionally empty.
  const bootstrapClients = useMemo<ApiClients>(
    () => ({ workflowsClient: createBaseWorkflowClient() }) as ApiClients,
    [],
  );

  return (
    <ApiProvider clients={bootstrapClients}>
      <InnerMetadataProvider>{children}</InnerMetadataProvider>
    </ApiProvider>
  );
}
/**
 * Loads the metadata via useMetadata and, once available, provides it
 * (together with the API clients derived from it) to the subtree.
 *
 * Renders a loading screen while `loading` is true and an error screen when
 * an error was reported or no metadata/clients are available.
 */
function InnerMetadataProvider({ children }: { children: ReactNode }) {
  const { metadata, loading, error } = useMetadata();

  // Rebuild the clients only when the metadata object changes.
  const clients = useMemo(
    () => (metadata ? createClients(metadata) : undefined),
    [metadata],
  );

  // Memoize the context value so consumers of useMetadataContext are not
  // re-rendered every time this component renders with unchanged inputs.
  // Hooks must run before the early returns below.
  const contextValue = useMemo<MetadataContextValue | null>(
    () => (metadata && clients ? { metadata, clients } : null),
    [metadata, clients],
  );

  if (loading) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
          <div className="text-sm text-gray-500">Loading configuration...</div>
        </div>
      </div>
    );
  }

  if (error || !metadata || !clients || !contextValue) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
          <div className="text-sm text-gray-500">
            Error loading configuration: {error || "Unknown error"}
          </div>
        </div>
      </div>
    );
  }

  return (
    <MetadataContext.Provider value={contextValue}>
      <ApiProvider clients={clients}>{children}</ApiProvider>
    </MetadataContext.Provider>
  );
}
/**
 * Returns the metadata and API clients from the nearest provider.
 *
 * @throws Error when no provider value is available (context is null).
 */
export function useMetadataContext() {
  const value = useContext(MetadataContext);
  if (value) {
    return value;
  }
  throw new Error("useMetadataContext must be used within MetadataProvider");
}
+41
View File
@@ -0,0 +1,41 @@
import React from "react";
import { APP_TITLE } from "./config";
/** One entry of the breadcrumb trail held in the toolbar context. */
export interface BreadcrumbItem {
// Text displayed for the crumb.
label: string;
// Optional navigation target for the crumb.
href?: string;
// Optional flag marking the crumb as the page currently shown.
isCurrentPage?: boolean;
}
/**
 * Context carrying the toolbar buttons and breadcrumb trail plus their setters.
 * The default value has empty lists and no-op setters, so reading it without a
 * provider does not throw.
 */
export const ToolbarCtx = React.createContext<{
buttons: React.ReactNode[];
// Takes an updater function that receives the previous buttons.
setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void;
breadcrumbs: BreadcrumbItem[];
// Replaces the whole breadcrumb trail.
setBreadcrumbs: (items: BreadcrumbItem[]) => void;
}>({
buttons: [],
setButtons: () => {},
breadcrumbs: [],
setBreadcrumbs: () => {},
});
/**
 * Owns the toolbar state (buttons and breadcrumbs) and exposes it, with the
 * matching setters, through ToolbarCtx. The trail starts with a single crumb
 * labelled APP_TITLE that points at "/".
 */
export const ToolbarProvider = (props: { children: React.ReactNode }) => {
  const initialBreadcrumbs: BreadcrumbItem[] = [
    { label: APP_TITLE, href: "/" },
  ];
  const [buttons, setButtons] = React.useState<React.ReactNode[]>([]);
  const [breadcrumbs, setBreadcrumbs] =
    React.useState<BreadcrumbItem[]>(initialBreadcrumbs);

  const contextValue = { buttons, setButtons, breadcrumbs, setBreadcrumbs };

  return (
    <ToolbarCtx.Provider value={contextValue}>
      {props.children}
    </ToolbarCtx.Provider>
  );
};
/** Convenience hook returning the current ToolbarCtx value. */
export const useToolbar = () => {
  return React.useContext(ToolbarCtx);
};
+51
View File
@@ -0,0 +1,51 @@
import { ExtractedData } from "llama-cloud-services/beta/agent";
import {
ApiClients,
createWorkflowsClient,
createWorkflowsConfig,
createCloudAgentClient,
cloudApiClient,
} from "@llamaindex/ui";
import { AGENT_NAME } from "./config";
import type { Metadata } from "./useMetadata";
// Build-time environment values. Each one is treated as optional: every use
// below is guarded so that a missing value simply omits the setting.
const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY;
const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL;
const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID;
// Configure the platform client (runs once, when this module is loaded)
cloudApiClient.setConfig({
// Only override the base URL when one is provided.
...(apiBaseUrl && { baseUrl: apiBaseUrl }),
headers: {
// Optionally authenticate with an API token when VITE_LLAMA_CLOUD_API_KEY is set
// (presumably intended for local development; TODO confirm).
// NOTE(review): Vite exposes VITE_-prefixed variables to client code, so this
// token would be readable in the browser bundle; confirm that is acceptable.
...(platformToken && { authorization: `Bearer ${platformToken}` }),
// This header is required for requests to correctly scope to the agent's project
// when authenticating with a user cookie
...(projectId && { "Project-Id": projectId }),
},
});
/**
 * Builds a workflows client whose base URL is `/deployments/<AGENT_NAME>/`.
 */
export function createBaseWorkflowClient(): ReturnType<
  typeof createWorkflowsClient
> {
  const deploymentBaseUrl = `/deployments/${AGENT_NAME}/`;
  const workflowsConfig = createWorkflowsConfig({ baseUrl: deploymentBaseUrl });
  return createWorkflowsClient(workflowsConfig);
}
/**
 * Assembles the full set of API clients for the application.
 *
 * @param metadata Supplies the collection name used by the agent data client.
 * @returns The workflows client, the shared cloud API client and an agent
 *   data client bound to `metadata.extracted_data_collection`.
 */
export function createClients(metadata: Metadata): ApiClients {
  const workflowsClient = createBaseWorkflowClient();

  // `window` is checked first so this also works where it is not defined.
  const currentUrl =
    typeof window === "undefined" ? undefined : window.location.href;

  const agentDataClient = createCloudAgentClient<ExtractedData<any>>({
    client: cloudApiClient,
    windowUrl: currentUrl,
    collection: metadata.extracted_data_collection,
  });

  return {
    workflowsClient,
    cloudApiClient,
    agentDataClient,
  } as ApiClients;
}
+2
View File
@@ -0,0 +1,2 @@
// Label of the root breadcrumb shown in the toolbar.
export const APP_TITLE = "Extraction Review";
// Deployment name taken from the build-time environment; used to build the
// `/deployments/<name>/` base URL of the workflows client.
export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME;
+39
View File
@@ -0,0 +1,39 @@
import type {
ExtractedData,
TypedAgentData,
} from "llama-cloud-services/beta/agent";
/**
 * Serializes `data` as pretty-printed JSON (2-space indent) and triggers a
 * browser download of it.
 *
 * @param data Value to serialize.
 * @param filename Name suggested for the downloaded file.
 */
export function downloadJSON<T>(
  data: T,
  filename: string = "extraction-results.json",
) {
  const payload = new Blob([JSON.stringify(data, null, 2)], {
    type: "application/json",
  });
  const objectUrl = URL.createObjectURL(payload);

  // A temporary anchor is attached to the document and clicked programmatically.
  const anchor = document.createElement("a");
  anchor.href = objectUrl;
  anchor.download = filename;
  document.body.appendChild(anchor);
  anchor.click();

  // Remove the temporary anchor and release the object URL.
  document.body.removeChild(anchor);
  URL.revokeObjectURL(objectUrl);
}
/**
 * Downloads a single extracted-data item as JSON.
 *
 * The file is named `<file_name>-<YYYY-MM-DD>.json`, where the date is the
 * date part of the item's `createdAt` ISO string and `file_name` falls back
 * to "item" when it is empty or missing.
 */
export function downloadExtractedDataItem<T>(
  item: TypedAgentData<ExtractedData<T>>,
) {
  const baseName = item.data.file_name || "item";
  const [datePart] = item.createdAt.toISOString().split("T");
  downloadJSON(item, `${baseName}-${datePart}.json`);
}
+41
View File
@@ -0,0 +1,41 @@
import { useWorkflowHandler, useWorkflowRun } from "@llamaindex/ui";
import { useEffect, useState } from "react";
/** Data carried by the "MetadataResponse" event of the "metadata" workflow. */
export interface Metadata {
// Schema describing the extracted data; untyped here (presumably a JSON Schema
// document; verify against the backend).
json_schema: any;
// Collection name handed to the agent data client.
extracted_data_collection: string;
}
/**
 * Shape of a metadata-loading result.
 * NOTE(review): useMetadata in this file does not reference this type, and the
 * `metadata` it returns can be undefined while this declares it as required;
 * confirm which is intended.
 */
export interface UseMetadataResult {
metadata: Metadata;
loading: boolean;
error: string | undefined;
}
/**
 * Starts the "metadata" workflow once on mount and exposes its result.
 *
 * @returns
 *   - `metadata`: data of the first event whose type ends with
 *     "MetadataResponse", or undefined until such an event is seen.
 *   - `loading`: true while the workflow is being started, and afterwards
 *     until either the metadata event or an error is available.
 *   - `error`: message of the failure to start the workflow, if any.
 */
export function useMetadata() {
  const run = useWorkflowRun();
  const [handlerId, setHandlerId] = useState<string | undefined>(undefined);
  const handler = useWorkflowHandler(handlerId ?? "");
  const [error, setError] = useState<string | undefined>(undefined);
  const [starting, setStarting] = useState(true);

  useEffect(() => {
    setStarting(true);
    run
      .runWorkflow("metadata", {})
      .then((handlerSummary) => {
        setHandlerId(handlerSummary.handler_id);
      })
      .catch((reason: unknown) => {
        // Rejections are not guaranteed to be Error instances; always end up
        // with a non-empty message so the failure is not lost.
        setError(reason instanceof Error ? reason.message : String(reason));
      })
      .finally(() => {
        setStarting(false);
      });
    // Intentionally run once on mount.
  }, []);

  const stopEvent = handler.events.find((event) =>
    event.type.endsWith("MetadataResponse"),
  );
  const metadata = stopEvent?.data as Metadata | undefined;

  // Starting the workflow only yields a handler id; the metadata itself
  // arrives later as an event. Keep reporting "loading" during that gap so
  // callers do not briefly treat "no metadata yet" as a failure.
  // NOTE(review): if the workflow can end without emitting a MetadataResponse
  // event, this stays in the loading state; confirm whether the handler
  // exposes a failure status that should be surfaced as `error` here.
  const awaitingEvent =
    handlerId !== undefined && metadata === undefined && error === undefined;
  const loading = starting || awaitingEvent;

  return { metadata, loading, error };
}
+6
View File
@@ -0,0 +1,6 @@
import { clsx, type ClassValue } from "clsx";
import { twMerge } from "tailwind-merge";
/**
 * Combines class-name inputs with `clsx` and passes the result through
 * `twMerge`.
 */
export function cn(...inputs: ClassValue[]) {
  const joined = clsx(inputs);
  return twMerge(joined);
}
+14
View File
@@ -0,0 +1,14 @@
import { StrictMode } from "react";
import { createRoot } from "react-dom/client";
import { HashRouter } from "react-router-dom";
import App from "./App";
import "@llamaindex/ui/styles.css";
import "./index.css";
// Entry point: mount the application into the #root element.
const rootElement = document.getElementById("root");
if (!rootElement) {
  // Fail with an explicit message instead of asserting the element exists.
  throw new Error('Root element "#root" not found; cannot mount the application.');
}

createRoot(rootElement).render(
  <StrictMode>
    <HashRouter>
      <App />
    </HashRouter>
  </StrictMode>,
);
+23
View File
@@ -0,0 +1,23 @@
/* NOTE(review): HomePage.tsx also references `styles.page`, which is not defined in this module. */
/* Padding around the page content. */
.main {
padding: 1rem;
}
/* Horizontal row with a gap between items (a flex row, despite the class name). */
.grid {
display: flex;
flex-direction: row;
gap: 1rem;
margin-bottom: 1rem;
/* Nested selector: applies `flex: 1` to every direct child. */
& > * {
flex: 1;
}
}
/* Row whose content is pushed to the right edge. */
.commandBar {
display: flex;
justify-content: flex-end;
margin-bottom: 1rem;
}
/* Spacing below the progress bar. */
.progressBar {
margin-bottom: 1rem;
}
+98
View File
@@ -0,0 +1,98 @@
import {
ItemCount,
WorkflowTrigger,
WorkflowProgressBar,
ExtractedDataItemGrid,
useWorkflowHandlerList,
} from "@llamaindex/ui";
import type { TypedAgentData } from "llama-cloud-services/beta/agent";
import styles from "./HomePage.module.css";
import { useNavigate } from "react-router-dom";
import { useEffect, useState } from "react";
/**
 * Home page: renders the task list, keyed by the task-completion counter so
 * that a change of the key remounts the list.
 */
export default function HomePage() {
  const completion = taskCompletedState();
  return <TaskList key={completion.taskKey} />;
}
/**
 * Tracks the "process-file" workflow handlers and returns a key that is
 * incremented whenever the number of running handlers drops, i.e. when a task
 * has finished. The key can be used to force a re-mount of the task list.
 *
 * NOTE(review): this function calls React hooks, so by convention it should be
 * named `use...`; the name is kept because HomePage calls it by this name.
 *
 * @returns `runningTaskCount`, the last recorded number of running handlers,
 *   and `taskKey`, the completion counter.
 */
function taskCompletedState() {
  const { handlers } = useWorkflowHandlerList("process-file");
  const runningNow = handlers.filter(
    (handler) => handler.status === "running",
  ).length;
  const [runningTaskCount, setRunningTaskCount] = useState(runningNow);
  const [taskKey, setTaskKey] = useState(0);

  useEffect(() => {
    if (runningNow < runningTaskCount) {
      // Forcefully reload the task list after a task is completed. The
      // functional updater derives the next key from the latest state rather
      // than from a value captured by this closure.
      setTaskKey((previousKey) => previousKey + 1);
    }
    setRunningTaskCount(runningNow);
    // Only re-evaluate when the number of running tasks changes.
  }, [runningNow]);

  return { runningTaskCount, taskKey };
}
/**
 * List view: item counters, upload triggers for invoices ("process-file") and
 * contracts ("index-contract"), a progress bar for "process-file", and the
 * grid of extracted items. Clicking a row opens that item's detail page.
 */
function TaskList() {
  const navigate = useNavigate();

  const openItem = (item: TypedAgentData) => {
    navigate(`/item/${item.id}`);
  };

  return (
    <div className={styles.page}>
      <main className={styles.main}>
        <div className={styles.grid}>
          <ItemCount title="Total Items" />
          <ItemCount title="Reviewed" filter={{ status: { eq: "approved" } }} />
          <ItemCount
            title="Needs Review"
            filter={{ status: { eq: "pending_review" } }}
          />
        </div>

        <div className={styles.commandBar}>
          {/* Both triggers send only the first selected file's id. */}
          <WorkflowTrigger
            workflowName="process-file"
            customWorkflowInput={(files) => ({ file_id: files[0].fileId })}
            title="Upload Invoice"
          />
          <WorkflowTrigger
            workflowName="index-contract"
            customWorkflowInput={(files) => ({ file_id: files[0].fileId })}
            title="Upload Contract"
          />
        </div>

        <WorkflowProgressBar
          className={styles.progressBar}
          workflowName="process-file"
        />

        <ExtractedDataItemGrid
          onRowClick={openItem}
          builtInColumns={{
            fileName: true,
            status: true,
            createdAt: true,
            itemsToReview: true,
            actions: true,
          }}
        />
      </main>
    </div>
  );
}
+152
View File
@@ -0,0 +1,152 @@
import { useEffect, useState } from "react";
import {
AcceptReject,
ExtractedDataDisplay,
FilePreview,
useItemData,
type Highlight,
Button,
} from "@llamaindex/ui";
import { Clock, XCircle, Download } from "lucide-react";
import { useParams } from "react-router-dom";
import { useToolbar } from "@/lib/ToolbarContext";
import { useNavigate } from "react-router-dom";
import { modifyJsonSchema } from "@llamaindex/ui/lib";
import { APP_TITLE } from "@/lib/config";
import { downloadExtractedDataItem } from "@/lib/export";
import { useMetadataContext } from "@/lib/MetadataProvider";
/**
 * Detail page for a single extracted item.
 *
 * Shows a preview of the source file on the left and the editable extracted
 * data on the right. While mounted it also contributes a breadcrumb (the
 * item's file name) and toolbar actions (JSON export, accept/reject) through
 * the toolbar context, and removes them again on unmount.
 */
export default function ItemPage() {
  const { itemId } = useParams<{ itemId: string }>();
  const { setButtons, setBreadcrumbs } = useToolbar();
  const [highlight, setHighlight] = useState<Highlight | undefined>(undefined);
  const { metadata } = useMetadataContext();

  // Use the hook to fetch item data
  const itemHookData = useItemData<any>({
    // order/remove fields as needed here
    jsonSchema: modifyJsonSchema(metadata.json_schema, {}),
    itemId: itemId as string,
    isMock: false,
  });
  // Destructured up front so the effects below reference names that are
  // already declared at that point in the function.
  const {
    item: itemData,
    updateData,
    loading: isLoading,
    error,
  } = itemHookData;
  const navigate = useNavigate();

  // Update breadcrumb when item data loads
  useEffect(() => {
    const fileName = itemHookData.item?.data?.file_name;
    if (fileName) {
      setBreadcrumbs([
        { label: APP_TITLE, href: "/" },
        {
          label: fileName,
          isCurrentPage: true,
        },
      ]);
    }

    return () => {
      // Reset to default breadcrumb when leaving the page
      setBreadcrumbs([{ label: APP_TITLE, href: "/" }]);
    };
  }, [itemHookData.item?.data?.file_name, setBreadcrumbs]);

  // Register the toolbar actions for this page.
  useEffect(() => {
    setButtons(() => [
      // The buttons are rendered as an array of nodes, so the element needs a
      // stable key to satisfy React's list-key requirement.
      <div key="item-page-actions" className="ml-auto flex items-center gap-2">
        <Button
          variant="outline"
          size="sm"
          onClick={() => {
            if (itemData) {
              downloadExtractedDataItem(itemData);
            }
          }}
          disabled={!itemData}
        >
          <Download className="h-4 w-4 mr-2" />
          Export JSON
        </Button>
        <AcceptReject<any>
          itemData={itemHookData}
          onComplete={() => navigate("/")}
        />
      </div>,
    ]);

    return () => {
      // Remove the page-specific buttons when leaving the page.
      setButtons(() => []);
    };
  }, [itemHookData.data, setButtons]);

  if (isLoading) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
          <div className="text-sm text-gray-500">Loading item...</div>
        </div>
      </div>
    );
  }

  if (error || !itemData) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
          <div className="text-sm text-gray-500">
            Error loading item: {error || "Item not found"}
          </div>
        </div>
      </div>
    );
  }

  return (
    <div className="flex h-full bg-gray-50">
      {/* Left Side - File Preview */}
      <div className="w-1/2 border-r border-gray-200 bg-white">
        {itemData.data.file_id && (
          <FilePreview
            fileId={itemData.data.file_id}
            onBoundingBoxClick={(box, pageNumber) => {
              console.log("Bounding box clicked:", box, "on page:", pageNumber);
            }}
            highlight={highlight}
          />
        )}
      </div>

      {/* Right Side - Review Panel */}
      <div className="flex-1 bg-white h-full overflow-y-auto">
        <div className="p-4 space-y-4">
          {/* Extracted Data */}
          <ExtractedDataDisplay<any>
            extractedData={itemData.data}
            title="Extracted Data"
            onChange={(updatedData) => {
              updateData(updatedData);
            }}
            onClickField={(args) => {
              // TODO: set multiple highlights
              // Only the page comes from the field's first citation (default
              // page 1); the rectangle itself is a fixed placeholder.
              setHighlight({
                page: args.metadata?.citation?.[0]?.page ?? 1,
                x: 100,
                y: 100,
                width: 0,
                height: 0,
              });
            }}
            jsonSchema={itemHookData.jsonSchema}
          />
        </div>
      </div>
    </div>
  );
}
+15
View File
@@ -0,0 +1,15 @@
/// <reference types="vite/client" />
/**
 * Build-time environment variables available on `import.meta.env`.
 */
interface ImportMetaEnv {
  // Optional API token and base URL for the cloud API client.
  readonly VITE_LLAMA_CLOUD_API_KEY?: string;
  readonly VITE_LLAMA_CLOUD_BASE_URL?: string;
  // injected from llama_deploy
  // NOTE(review): vite.config.ts defines VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH,
  // not this name; kept for backward compatibility. Confirm whether anything
  // still provides it.
  readonly VITE_LLAMA_DEPLOY_BASE_PATH: string;
  // Name actually defined by vite.config.ts (from LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH).
  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH?: string;
  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string;
  // Only defined when LLAMA_DEPLOY_PROJECT_ID is set at build time, so it may be absent.
  readonly VITE_LLAMA_DEPLOY_PROJECT_ID?: string;
}
// Types `import.meta.env` with the variables declared in ImportMetaEnv.
interface ImportMeta {
readonly env: ImportMetaEnv;
}
+31
View File
@@ -0,0 +1,31 @@
{
"compilerOptions": {
"target": "ES2020",
"useDefineForClassFields": true,
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "react-jsx",
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
/* Path mapping */
"baseUrl": ".",
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["src", "vite.config.ts", "src/vite-env.d.ts"],
"exclude": ["node_modules"]
}
+43
View File
@@ -0,0 +1,43 @@
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";
import path from "path";
// https://vitejs.dev/config/
export default defineConfig(() => {
  // Values supplied through the process environment at build/dev time.
  const env = process.env;
  const deploymentName = env.LLAMA_DEPLOY_DEPLOYMENT_NAME;
  const basePath = env.LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH;
  const projectId = env.LLAMA_DEPLOY_PROJECT_ID;
  const baseUrl = env.LLAMA_CLOUD_BASE_URL;
  // Dev-server port: PORT when set, otherwise 3000.
  const port = env.PORT ? Number(env.PORT) : 3000;

  // Compile-time replacements for `import.meta.env.*`. The deployment name
  // and base path are always defined; the project id and cloud base URL are
  // only added when a value is present.
  const define: Record<string, string> = {
    // Primary define uses NAME
    "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME":
      JSON.stringify(deploymentName),
    "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH":
      JSON.stringify(basePath),
  };
  if (projectId) {
    define["import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID"] =
      JSON.stringify(projectId);
  }
  if (baseUrl) {
    define["import.meta.env.VITE_LLAMA_CLOUD_BASE_URL"] =
      JSON.stringify(baseUrl);
  }

  return {
    plugins: [react()],
    resolve: {
      // "@/..." imports resolve to the src directory.
      alias: {
        "@": path.resolve(__dirname, "./src"),
      },
    },
    server: {
      port,
      host: true,
    },
    build: {
      outDir: "dist",
      sourcemap: true,
    },
    base: basePath,
    define,
  };
});
+2
View File
@@ -0,0 +1,2 @@
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
{{ _copier_answers|to_nice_yaml -}}