feat: add invoice reconciliation template (#43)

2026-06-30 22:17:53 -04:00 · 2025-11-06 22:43:29 -05:00
parent ffe3842030
commit 42822cea74
37 changed files with 1978 additions and 2 deletions
@@ -0,0 +1,3 @@
+# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
+_commit: v0.2.1
+_src_path: https://github.com/run-llama/template-workflow-data-extraction
@@ -0,0 +1,2 @@
+# copy to .env and place any needed secrets here. LLAMA_CLOUD_API_KEY will be automatically set
+# OPENAI_API_KEY=sk-xxx
@@ -0,0 +1,7 @@
+.env
+__pycache__
+workflows.db
+
+.venv
+package-lock.json
+node_modules
@@ -1,2 +1,48 @@
-# template-workflow-extract-reconcile-invoice
-Llama Index Workflow Template
+# Data Extraction and Ingestion
+
+This is a starter for LlamaAgents. See the [LlamaAgents (llamactl) getting started guide](https://developers.llamaindex.ai/python/llamaagents/llamactl/getting-started/) for context on local development and deployment.
+
+To run the application, install [`uv`](https://docs.astral.sh/uv/) and run `uvx llamactl serve`.
+
+## Simple customizations
+
+For some basic customizations, you can modify `src/extraction_review/config.py`
+
+- **`USE_REMOTE_EXTRACTION_SCHEMA`**: Set to `False` to define your own Pydantic `ExtractionSchema` in this file. Set to `True` to reuse the schema from an existing LlamaCloud Extraction Agent.
+- **`EXTRACTION_AGENT_NAME`**: Logical name for your Extraction Agent. When `USE_REMOTE_EXTRACTION_SCHEMA` is `False`, this name is used to upsert the agent with your local schema; when `True`, it is used to fetch an existing agent.
+- **`EXTRACTED_DATA_COLLECTION`**: The Agent Data collection name used to store extractions (namespaced by agent name and environment).
+- **`ExtractionSchema`**: When using a local schema, edit this Pydantic model to match the fields you want extracted. Prefer optional types where possible to allow for partial extractions.
+
+The UI fetches the JSON Schema and collection name from the backend metadata workflow at runtime, and dynamically
+generates an editing UI based on the schema.
+
+## Complex customizations
+
+For more complex customizations, you can edit the rest of the application. For example, you could
+- Modify the existing file processing workflow to provide additional context for the extraction process
+- Take further action based on the extracted data.
+- Add additional workflows to submit data upon approval.
+
+## Linting and type checking
+
+The Python and JavaScript packages contain helpful scripts to lint, format, and type check the code.
+
+To check and fix python code:
+
+```bash
+uv run hatch run lint
+uv run hatch run typecheck
+uv run hatch run test
+# run all at once
+uv run hatch run all-fix
+```
+
+To check and fix javascript code, within the `ui` directory:
+
+```bash
+pnpm run lint
+pnpm run typecheck
+pnpm run test
+# run all at once
+pnpm run all-fix
+```
@@ -0,0 +1,54 @@
+[project]
+name = "extraction-review"
+version = "0.1.0"
+description = "Extracts data"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "llama-cloud-services>=0.6.69",
+    "llama-index-workflows>=2.2.0,<3.0.0",
+    "python-dotenv>=1.1.0",
+    "jsonref>=1.1.0",
+    "click>=8.2.1,<8.3.0",
+    "httpx>=0.28.1",
+    "llama-index-core>=0.14.0",
+    "llama-index-llms-openai>=0.3.0",
+]
+
+[dependency-groups]
+dev = [
+    "ruff>=0.11.10",
+    "typescript>=0.0.12",
+    "ty>=0.0.1a16",
+    "pytest>=8.4.1",
+    "hatch>=1.14.1",
+    "llamactl>=0.3.0"
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.envs.default.scripts]
+"format" = "ruff format ."
+"format-check" = "ruff format --check ."
+"lint" = "ruff check --fix ."
+"lint-check" = ["ruff check ."]
+typecheck = "ty check src"
+test = "pytest"
+"all-check" = ["format-check", "lint-check", "test"]
+"all-fix" = ["format", "lint", "test"]
+
+[tool.llamadeploy]
+env_files = [".env"]
+llama_cloud = true
+required_env_vars = ["OPENAI_API_KEY"]
+
+[tool.llamadeploy.workflows]
+process-file = "extraction_review.process_file:workflow"
+metadata = "extraction_review.metadata_workflow:workflow"
+index-contract = "extraction_review.index_contract:workflow"
+
+[tool.llamadeploy.ui]
+directory = "ui"
+
@@ -0,0 +1,92 @@
+import functools
+import logging
+import os
+
+import httpx
+from llama_cloud.client import AsyncLlamaCloud
+from llama_cloud.core.api_error import ApiError
+from llama_cloud_services import ExtractionAgent, LlamaExtract, LlamaCloudIndex
+from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData
+from llama_index.llms.openai import OpenAI
+
+from extraction_review.config import (
+    CONTRACTS_INDEX_NAME,
+    EXTRACTED_DATA_COLLECTION,
+    EXTRACT_CONFIG,
+    EXTRACTION_AGENT_NAME,
+    InvoiceExtractionSchema,
+    InvoiceWithReconciliation,
+)
+
+logger = logging.getLogger(__name__)
+
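+# Deployed agents infer their name from the deployment name.
+# Note: make sure that an agent deployment with this name actually exists,
+# otherwise calls to get or set data will fail. No fallback name is configured
+# here, so `agent_name` is None when LLAMA_DEPLOY_DEPLOYMENT_NAME is unset (e.g.
+# during local development); append `or "<name>"` below if you need one.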
+agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME")
+# required for all llama cloud calls
+api_key = os.environ["LLAMA_CLOUD_API_KEY"]
+# get this in case running against a different environment than production
+base_url = os.getenv("LLAMA_CLOUD_BASE_URL")
+project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID")
+
+
+@functools.lru_cache(maxsize=None)
+def get_extract_agent() -> ExtractionAgent:
+    extract_api = LlamaExtract(
+        api_key=api_key, base_url=base_url, project_id=project_id
+    )
+
+    try:
+        existing = extract_api.get_agent(EXTRACTION_AGENT_NAME)
+        existing.data_schema = InvoiceExtractionSchema
+        existing.config = EXTRACT_CONFIG
+        return existing
+    except ApiError as e:
+        if e.status_code == 404:
+            return extract_api.create_agent(
+                name=EXTRACTION_AGENT_NAME,
+                data_schema=InvoiceExtractionSchema,
+                config=EXTRACT_CONFIG,
+            )
+        else:
+            raise
+
+
+@functools.lru_cache(maxsize=None)
+def get_data_client() -> AsyncAgentDataClient[ExtractedData[InvoiceWithReconciliation]]:
+    return AsyncAgentDataClient(
+        deployment_name=agent_name,
+        collection=EXTRACTED_DATA_COLLECTION,
+        type=ExtractedData[InvoiceWithReconciliation],
+        client=get_llama_cloud_client(),
+    )
+
+
+@functools.lru_cache(maxsize=None)
+def get_llama_cloud_client():
+    return AsyncLlamaCloud(
+        base_url=base_url,
+        token=api_key,
+        httpx_client=httpx.AsyncClient(
+            timeout=60, headers={"Project-Id": project_id} if project_id else None
+        ),
+    )
+
+
+@functools.lru_cache(maxsize=None)
+def get_contracts_index() -> LlamaCloudIndex:
+    """Get or create the contracts index for storing and retrieving contract documents"""
+    return LlamaCloudIndex(
+        name=CONTRACTS_INDEX_NAME,
+        project_id=project_id,
+        api_key=api_key,
+        base_url=base_url,
+    )
+
+
+@functools.lru_cache(maxsize=None)
+def get_llm() -> OpenAI:
+    """Get OpenAI LLM for structured predictions"""
+    return OpenAI(model="gpt-5-mini", temperature=0)
@@ -0,0 +1,130 @@
+"""
+For simple configuration of the extraction review application, just customize this file.
+
+If you need more control, feel free to edit the rest of the application
+"""
+
+from __future__ import annotations
+
+import os
+
+from llama_cloud import ExtractConfig
+from llama_cloud_services.extract import ExtractMode
+from pydantic import BaseModel, Field
+
+# The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments.
+# Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist.
+EXTRACTION_AGENT_NAME: str = (
+    os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "invoice-reconciliation"
+)
+# The name of the collection to use for storing extracted data. This will be qualified by the agent name.
+# When developing locally, this will use the _public collection (shared within the project), otherwise agent
+# data is isolated to each agent
+EXTRACTED_DATA_COLLECTION: str = "invoices"
+
+# The name of the LlamaCloud index for storing contracts
+CONTRACTS_INDEX_NAME: str = "contracts"
+
+
+# Invoice extraction schema - extracted from invoice documents
+class LineItem(BaseModel):
+    # One entry of InvoiceExtractionSchema.line_items. Every field defaults to
+    # None so a partially extracted row still validates.
+    description: str | None = Field(
+        default=None, description="Description of the line item"
+    )
+    quantity: float | None = Field(default=None, description="Quantity of the item")
+    unit_price: float | None = Field(
+        default=None, description="Price per unit of the item"
+    )
+    total: float | None = Field(
+        default=None, description="Total price for this line item"
+    )
+
+
+class InvoiceExtractionSchema(BaseModel):
+    """Schema for extracting invoice data"""
+
+    # Handed to the extraction agent as its data schema and used to validate the
+    # extraction result. All fields are optional to allow partial extractions.
+
+    # --- Invoice header ---
+    invoice_number: str | None = Field(
+        default=None, description="Invoice number or identifier"
+    )
+    # Kept as a string; the description only requests YYYY-MM-DD "if possible".
+    invoice_date: str | None = Field(
+        default=None, description="Date of the invoice (YYYY-MM-DD format if possible)"
+    )
+    vendor_name: str | None = Field(
+        default=None, description="Name of the vendor or supplier"
+    )
+    vendor_address: str | None = Field(
+        default=None, description="Address of the vendor"
+    )
+    purchase_order_number: str | None = Field(
+        default=None, description="Purchase order (PO) number if present"
+    )
+    payment_terms: str | None = Field(
+        default=None,
+        description="Payment terms (e.g., Net 30, Net 60, Due on receipt)",
+    )
+    # --- Line items and amounts ---
+    line_items: list[LineItem] | None = Field(
+        default=None, description="List of line items on the invoice"
+    )
+    subtotal: float | None = Field(
+        default=None, description="Subtotal before tax and other charges"
+    )
+    tax: float | None = Field(default=None, description="Tax amount")
+    total: float | None = Field(
+        default=None, description="Total amount due on the invoice"
+    )
+
+
+# For backward compatibility
+ExtractionSchema = InvoiceExtractionSchema
+
+
+# Reconciliation schema - extends invoice data with contract matching and discrepancy information
+class Discrepancy(BaseModel):
+    """Represents a single discrepancy between invoice and contract"""
+
+    # `field` is the only required attribute; the rest default to None.
+    field: str = Field(description="Field name where discrepancy was found")
+    # Both sides of the comparison are stored as strings.
+    invoice_value: str | None = Field(
+        default=None, description="Value from the invoice"
+    )
+    contract_value: str | None = Field(
+        default=None, description="Expected value from the contract"
+    )
+    # Free-form string: the listed levels are examples, not an enforced enum.
+    severity: str | None = Field(
+        default=None,
+        description="Severity of the discrepancy (e.g., 'high', 'medium', 'low')",
+    )
+    note: str | None = Field(
+        default=None, description="Additional notes about the discrepancy"
+    )
+
+
+class InvoiceWithReconciliation(InvoiceExtractionSchema):
+    """Invoice data with reconciliation information"""
+
+    # Adds the reconciliation outcome to the inherited invoice fields. This is
+    # the record type stored by the agent-data client and the model whose JSON
+    # schema is served by the metadata workflow.
+    matched_contract_id: str | None = Field(
+        default=None, description="ID of the matched contract file in LlamaCloud"
+    )
+    matched_contract_name: str | None = Field(
+        default=None, description="Name of the matched contract file"
+    )
+    # Free-form string; the process-file workflow also stores "error" here when
+    # reconciliation raises.
+    match_confidence: str | None = Field(
+        default=None,
+        description="Confidence level of the match (e.g., 'high', 'medium', 'low', 'none')",
+    )
+    match_rationale: str | None = Field(
+        default=None, description="Explanation of why this contract was matched"
+    )
+    discrepancies: list[Discrepancy] | None = Field(
+        default=None,
+        description="List of discrepancies found between invoice and contract",
+    )
+
+
+# Extraction settings assigned to the extraction agent in clients.get_extract_agent
+# (both when an existing agent is reused and when a new one is created).
+EXTRACT_CONFIG = ExtractConfig(
+    extraction_mode=ExtractMode.PREMIUM,
+    system_prompt=None,
+    # advanced. Only compatible with Premium mode.
+    use_reasoning=False,
+    cite_sources=False,
+    confidence_scores=True,
+)
@@ -0,0 +1,161 @@
+"""
+Workflow for indexing contract documents into LlamaCloud Index for retrieval.
+"""
+
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Literal
+
+import httpx
+from llama_index.core import Document
+from pydantic import BaseModel
+from workflows import Context, Workflow, step
+from workflows.events import Event, StartEvent, StopEvent
+
+from .clients import get_contracts_index, get_llama_cloud_client
+
+logger = logging.getLogger(__name__)
+
+
+class ContractFileEvent(StartEvent):
+    """Event to start contract indexing with a file ID"""
+
+    # ID of the file to index; start_indexing copies it into workflow state.
+    file_id: str
+
+
+class DownloadContractEvent(Event):
+    """Event to trigger contract download"""
+
+    # No payload: download_contract reads the file ID from workflow state.
+    pass
+
+
+class ContractDownloadedEvent(Event):
+    """Event indicating contract has been downloaded"""
+
+    # No payload: index_contract reads the local path and filename from
+    # workflow state.
+    pass
+
+
+class UIToast(Event):
+    """Event to show toast notifications in the UI"""
+
+    # Steps publish these with ctx.write_event_to_stream rather than returning them.
+    level: Literal["info", "warning", "error"]
+    message: str
+
+
+class ContractIndexState(BaseModel):
+    """State for contract indexing workflow"""
+
+    # Set by start_indexing from the start event.
+    file_id: str | None = None
+    # Local temp-file path written by download_contract.
+    file_path: str | None = None
+    # Name reported by the file metadata; set by download_contract.
+    filename: str | None = None
+
+
+class IndexContractWorkflow(Workflow):
+    """
+    Workflow to download and index a contract document into LlamaCloud Index.
+    """
+
+    @step()
+    async def start_indexing(
+        self, event: ContractFileEvent, ctx: Context[ContractIndexState]
+    ) -> DownloadContractEvent:
+        """Initialize the workflow with the file ID"""
+        logger.info(f"Starting contract indexing for file {event.file_id}")
+        async with ctx.store.edit_state() as state:
+            state.file_id = event.file_id
+        return DownloadContractEvent()
+
+    @step()
+    async def download_contract(
+        self, event: DownloadContractEvent, ctx: Context[ContractIndexState]
+    ) -> ContractDownloadedEvent:
+        """Download the contract file from LlamaCloud storage"""
+        state = await ctx.store.get_state()
+        if state.file_id is None:
+            raise ValueError("File ID is not set")
+
+        file_metadata = await get_llama_cloud_client().files.get_file(id=state.file_id)
+        file_url = await get_llama_cloud_client().files.read_file_content(state.file_id)
+
+        temp_dir = tempfile.gettempdir()
+        filename = file_metadata.name
+        file_path = os.path.join(temp_dir, filename)
+
+        logger.info(f"Downloading contract {filename} from {file_url.url}")
+        ctx.write_event_to_stream(
+            UIToast(level="info", message=f"Downloading contract: {filename}")
+        )
+
+        client = httpx.AsyncClient()
+        async with client.stream("GET", file_url.url) as response:
+            with open(file_path, "wb") as f:
+                async for chunk in response.aiter_bytes():
+                    f.write(chunk)
+
+        logger.info(f"Downloaded contract to {file_path}")
+        async with ctx.store.edit_state() as state:
+            state.file_path = file_path
+            state.filename = filename
+
+        return ContractDownloadedEvent()
+
+    @step()
+    async def index_contract(
+        self, event: ContractDownloadedEvent, ctx: Context[ContractIndexState]
+    ) -> StopEvent:
+        """Index the contract document into LlamaCloud Index"""
+        state = await ctx.store.get_state()
+        if state.file_path is None or state.filename is None:
+            raise ValueError("File path or filename is not set")
+
+        logger.info(f"Indexing contract {state.filename}")
+        ctx.write_event_to_stream(
+            UIToast(level="info", message=f"Indexing contract: {state.filename}")
+        )
+
+        # Create a document with metadata
+        file_content = Path(state.file_path).read_text(errors="ignore")
+        document = Document(
+            text=file_content,
+            metadata={
+                "filename": state.filename,
+                "file_id": state.file_id,
+                "document_type": "contract",
+            },
+        )
+
+        # Get the contracts index and insert the document
+        index = get_contracts_index()
+        await index.ainsert(document)
+
+        logger.info(f"Successfully indexed contract {state.filename}")
+        ctx.write_event_to_stream(
+            UIToast(
+                level="info",
+                message=f"Successfully indexed contract: {state.filename}",
+            )
+        )
+
+        return StopEvent(result={"file_id": state.file_id, "filename": state.filename})
+
+
+workflow = IndexContractWorkflow(timeout=None)
+
+if __name__ == "__main__":
+    import asyncio
+    from dotenv import load_dotenv
+
+    load_dotenv()
+    logging.basicConfig(level=logging.INFO)
+
+    async def main():
+        # Example usage - upload a contract and index it
+        file = await get_llama_cloud_client().files.upload_file(
+            upload_file=Path("sample_contract.pdf").open("rb")
+        )
+        result = await workflow.run(start_event=ContractFileEvent(file_id=file.id))
+        print(f"Indexed contract: {result}")
+
+    asyncio.run(main())
@@ -0,0 +1,30 @@
+from typing import Any
+
+from workflows import Workflow, step
+from workflows.events import StartEvent, StopEvent
+
+from extraction_review.schema import get_extraction_schema_json
+
+from .config import EXTRACTED_DATA_COLLECTION
+
+
+class MetadataResponse(StopEvent):
+    # Result of MetadataWorkflow.
+    # JSON schema produced by get_extraction_schema_json.
+    json_schema: dict[str, Any]
+    # Value of config.EXTRACTED_DATA_COLLECTION.
+    extracted_data_collection: str
+
+
+class MetadataWorkflow(Workflow):
+    """
+    Simple single step workflow to expose configuration to the UI, such as the JSON schema and collection name.
+    """
+
+    @step
+    async def get_metadata(self, _: StartEvent) -> MetadataResponse:
+        json_schema = await get_extraction_schema_json()
+        return MetadataResponse(
+            json_schema=json_schema,
+            extracted_data_collection=EXTRACTED_DATA_COLLECTION,
+        )
+
+
+workflow = MetadataWorkflow(timeout=None)
@@ -0,0 +1,439 @@
+import asyncio
+import hashlib
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Any, Literal
+
+import httpx
+from llama_cloud import ExtractRun
+from llama_cloud_services.beta.agent_data import ExtractedData, InvalidExtractionData
+from llama_cloud_services.extract import SourceText
+from llama_index.core.prompts import PromptTemplate
+from pydantic import BaseModel, Field
+from workflows import Context, Workflow, step
+from workflows.events import Event, StartEvent, StopEvent
+
+from .clients import (
+    get_contracts_index,
+    get_data_client,
+    get_extract_agent,
+    get_llama_cloud_client,
+    get_llm,
+)
+from .config import Discrepancy, InvoiceExtractionSchema, InvoiceWithReconciliation
+
+logger = logging.getLogger(__name__)
+
+
+class FileEvent(StartEvent):
+    # Start event of ProcessFileWorkflow; run_file copies file_id into workflow state.
+    file_id: str
+
+
+class DownloadFileEvent(Event):
+    # Returned by run_file to trigger download_file. Carries no payload because
+    # the file ID is read from workflow state.
+    pass
+
+
+class FileDownloadedEvent(Event):
+    # Returned by download_file to trigger process_file. Carries no payload
+    # because the local path and filename are read from workflow state.
+    pass
+
+
+class UIToast(Event):
+    # Notification published with ctx.write_event_to_stream by the workflow steps.
+    level: Literal["info", "warning", "error"]
+    message: str
+
+
+class ExtractedEvent(Event):
+    """Event when invoice data is successfully extracted"""
+
+    # Validated extraction result.
+    invoice_data: InvoiceExtractionSchema
+    # The "field_metadata" entry of the extraction run's metadata; passed on
+    # unchanged to ExtractedData.create by reconcile_with_contract.
+    field_metadata: dict[str, Any]
+
+
+class ExtractedInvalidEvent(Event):
+    """Event when extraction validation fails"""
+
+    # Built from InvalidExtractionData.invalid_item in process_file and handled
+    # by record_extracted_data alongside ReconciledEvent.
+    data: ExtractedData[dict[str, Any]]
+
+
+class ReconciledEvent(Event):
+    """Event when invoice is reconciled with contracts"""
+
+    # Record that record_extracted_data stores through the agent-data client.
+    data: ExtractedData[InvoiceWithReconciliation]
+
+
+class ExtractionState(BaseModel):
+    # Workflow state shared by the ProcessFileWorkflow steps.
+    # Set by run_file from the start event.
+    file_id: str | None = None
+    # Local temp-file path written by download_file.
+    file_path: str | None = None
+    # Name reported by the file metadata; set by download_file.
+    filename: str | None = None
+
+
+class ProcessFileWorkflow(Workflow):
+    """
+    Given a file path, this workflow will process a single file through the custom extraction logic.
+    """
+
+    @step()
+    async def run_file(self, event: FileEvent, ctx: Context) -> DownloadFileEvent:
+        logger.info(f"Running file {event.file_id}")
+        async with ctx.store.edit_state() as state:
+            state.file_id = event.file_id
+        return DownloadFileEvent()
+
+    @step()
+    async def download_file(
+        self, event: DownloadFileEvent, ctx: Context[ExtractionState]
+    ) -> FileDownloadedEvent:
+        """Download the file reference from the cloud storage"""
+        state = await ctx.store.get_state()
+        if state.file_id is None:
+            raise ValueError("File ID is not set")
+        try:
+            file_metadata = await get_llama_cloud_client().files.get_file(
+                id=state.file_id
+            )
+            file_url = await get_llama_cloud_client().files.read_file_content(
+                state.file_id
+            )
+
+            temp_dir = tempfile.gettempdir()
+            filename = file_metadata.name
+            file_path = os.path.join(temp_dir, filename)
+            client = httpx.AsyncClient()
+            # Report progress to the UI
+            logger.info(f"Downloading file {file_url.url} to {file_path}")
+
+            async with client.stream("GET", file_url.url) as response:
+                with open(file_path, "wb") as f:
+                    async for chunk in response.aiter_bytes():
+                        f.write(chunk)
+            logger.info(f"Downloaded file {file_url.url} to {file_path}")
+            async with ctx.store.edit_state() as state:
+                state.file_path = file_path
+                state.filename = filename
+            return FileDownloadedEvent()
+
+        except Exception as e:
+            logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True)
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="error",
+                    message=f"Error downloading file {state.file_id}: {e}",
+                )
+            )
+            raise e
+
+    @step()
+    async def process_file(
+        self, event: FileDownloadedEvent, ctx: Context[ExtractionState]
+    ) -> ExtractedEvent | ExtractedInvalidEvent:
+        """Runs the extraction against the file"""
+        state = await ctx.store.get_state()
+        if state.file_path is None or state.filename is None:
+            raise ValueError("File path or filename is not set")
+        try:
+            agent = get_extract_agent()
+            source_text = SourceText(
+                file=state.file_path,
+                filename=state.filename,
+            )
+            logger.info(f"Extracting data from file {state.filename}")
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="info", message=f"Extracting data from file {state.filename}"
+                )
+            )
+            extracted_result: ExtractRun = await agent.aextract(source_text)
+
+            # Validate the extracted data
+            if not extracted_result.data:
+                raise ValueError("No data extracted from invoice")
+
+            invoice_data = InvoiceExtractionSchema.model_validate(extracted_result.data)
+            logger.info(f"Extracted invoice data: {invoice_data}")
+            # Extract only the field_metadata we need, not the entire ExtractRun object
+            field_metadata = extracted_result.extraction_metadata.get(
+                "field_metadata", {}
+            )
+            return ExtractedEvent(
+                invoice_data=invoice_data, field_metadata=field_metadata
+            )
+        except InvalidExtractionData as e:
+            logger.error(f"Error validating extracted data: {e}", exc_info=True)
+            return ExtractedInvalidEvent(data=e.invalid_item)
+        except Exception as e:
+            logger.error(
+                f"Error extracting data from file {state.filename}: {e}",
+                exc_info=True,
+            )
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="error",
+                    message=f"Error extracting data from file {state.filename}: {e}",
+                )
+            )
+            raise e
+
+    @step()
+    async def reconcile_with_contract(
+        self, event: ExtractedEvent, ctx: Context[ExtractionState]
+    ) -> ReconciledEvent:
+        """Reconcile the invoice with matching contracts using retrieval and LLM"""
+        state = await ctx.store.get_state()
+        invoice_data = event.invoice_data
+
+        logger.info("Reconciling invoice with contracts")
+        ctx.write_event_to_stream(
+            UIToast(level="info", message="Matching invoice with contracts...")
+        )
+
+        try:
+            # Build a query from invoice data for contract retrieval
+            query_parts = []
+            if invoice_data.vendor_name:
+                query_parts.append(f"vendor: {invoice_data.vendor_name}")
+            if invoice_data.purchase_order_number:
+                query_parts.append(f"PO: {invoice_data.purchase_order_number}")
+            if invoice_data.invoice_number:
+                query_parts.append(f"invoice: {invoice_data.invoice_number}")
+
+            query = " ".join(query_parts) if query_parts else "contract agreement"
+
+            # Retrieve relevant contracts
+            index = get_contracts_index()
+            retriever = index.as_retriever(similarity_top_k=3)
+            retrieved_nodes = await retriever.aretrieve(query)
+
+            if not retrieved_nodes:
+                logger.info("No contracts found in index")
+                # No contracts available - create reconciliation data with no match
+                reconciled_data = InvoiceWithReconciliation(
+                    **invoice_data.model_dump(),
+                    match_confidence="none",
+                    match_rationale="No contracts found in the system",
+                    discrepancies=[],
+                )
+            else:
+                # Use LLM to match and reconcile
+                reconciled_data = await self._match_and_reconcile(
+                    invoice_data, retrieved_nodes
+                )
+
+            # Create ExtractedData with reconciliation information
+            file_content = Path(state.file_path).read_bytes()
+            file_hash = hashlib.sha256(file_content).hexdigest()
+
+            # Get field metadata from extraction event
+            field_metadata = event.field_metadata
+
+            extracted_data = ExtractedData.create(
+                data=reconciled_data,
+                file_id=state.file_id,
+                file_name=state.filename,
+                file_hash=file_hash,
+                field_metadata=field_metadata,
+            )
+
+            logger.info(f"Reconciliation complete: {reconciled_data.match_confidence}")
+            return ReconciledEvent(data=extracted_data)
+
+        except Exception as e:
+            logger.error(f"Error during reconciliation: {e}", exc_info=True)
+            # If reconciliation fails, still create data without reconciliation
+            reconciled_data = InvoiceWithReconciliation(
+                **invoice_data.model_dump(),
+                match_confidence="error",
+                match_rationale=f"Error during reconciliation: {str(e)}",
+                discrepancies=[],
+            )
+
+            file_content = Path(state.file_path).read_bytes()
+            file_hash = hashlib.sha256(file_content).hexdigest()
+            field_metadata = event.field_metadata
+
+            extracted_data = ExtractedData.create(
+                data=reconciled_data,
+                file_id=state.file_id,
+                file_name=state.filename,
+                file_hash=file_hash,
+                field_metadata=field_metadata,
+            )
+
+            return ReconciledEvent(data=extracted_data)
+
+    async def _match_and_reconcile(
+        self, invoice_data: InvoiceExtractionSchema, retrieved_nodes: list
+    ) -> InvoiceWithReconciliation:
+        """Use LLM to match invoice with contract and identify discrepancies"""
+
+        # Define structured output schema for LLM
+        class ContractMatchResult(BaseModel):
+            """Result of matching invoice to contract"""
+
+            is_match: bool = Field(
+                description="Whether a plausible contract match was found"
+            )
+            matched_contract_index: int | None = Field(
+                default=None,
+                description="Index (0-based) of the matched contract in the provided list, or None if no match",
+            )
+            match_confidence: str = Field(
+                description="Confidence level: 'high', 'medium', 'low', or 'none'"
+            )
+            match_rationale: str = Field(
+                description="Explanation of why this contract was or was not matched"
+            )
+            contract_payment_terms: str | None = Field(
+                default=None, description="Payment terms found in the matched contract"
+            )
+            discrepancies: list[Discrepancy] = Field(
+                default_factory=list,
+                description="List of discrepancies found between invoice and contract",
+            )
+
+        # Prepare contract context
+        contracts_text = "\n\n".join(
+            [
+                f"Contract {i} (File: {node.metadata.get('filename', 'Unknown')}):\n{node.text[:1000]}"
+                for i, node in enumerate(retrieved_nodes)
+            ]
+        )
+
+        # Create prompt for matching
+        prompt_template = PromptTemplate(
+            """You are analyzing an invoice to match it with the correct contract and identify any discrepancies.
+
+Invoice Details:
+- Vendor: {vendor_name}
+- Invoice Number: {invoice_number}
+- Invoice Date: {invoice_date}
+- PO Number: {po_number}
+- Payment Terms: {payment_terms}
+- Total: {total}
+
+Retrieved Contracts:
+{contracts_text}
+
+Task:
+1. Determine if any of the retrieved contracts plausibly matches this invoice based on:
+   - Vendor name matching or similarity
+   - PO number or invoice number references
+   - Date ranges or validity periods
+   - Any other relevant identifiers
+
+2. If a match is found, identify discrepancies between invoice and contract, focusing on:
+   - Payment terms differences (CRITICAL)
+   - Total amount mismatches if contract specifies amounts
+   - Vendor name discrepancies
+   - Any other obvious conflicts
+
+3. Assess match confidence:
+   - 'high': Clear match with strong vendor/PO/identifier alignment
+   - 'medium': Probable match with some uncertainty
+   - 'low': Weak match, possibly relevant but uncertain
+   - 'none': No plausible match found
+
+Provide your analysis in the specified format."""
+        )
+
+        # Use LLM with structured prediction
+        llm = get_llm()
+        result = await llm.astructured_predict(
+            ContractMatchResult,
+            prompt_template,
+            **{
+                "vendor_name": invoice_data.vendor_name or "N/A",
+                "invoice_number": invoice_data.invoice_number or "N/A",
+                "invoice_date": invoice_data.invoice_date or "N/A",
+                "po_number": invoice_data.purchase_order_number or "N/A",
+                "payment_terms": invoice_data.payment_terms or "N/A",
+                "total": invoice_data.total or "N/A",
+                "contracts_text": contracts_text,
+            },
+        )
+
+        # Build reconciled invoice data
+        matched_contract_id = None
+        matched_contract_name = None
+
+        if result.is_match and result.matched_contract_index is not None:
+            matched_node = retrieved_nodes[result.matched_contract_index]
+            matched_contract_id = matched_node.metadata.get("file_id")
+            matched_contract_name = matched_node.metadata.get("filename")
+
+        return InvoiceWithReconciliation(
+            **invoice_data.model_dump(),
+            matched_contract_id=matched_contract_id,
+            matched_contract_name=matched_contract_name,
+            match_confidence=result.match_confidence,
+            match_rationale=result.match_rationale,
+            discrepancies=result.discrepancies,
+        )
+
+    @step()
+    async def record_extracted_data(
+        self, event: ReconciledEvent | ExtractedInvalidEvent, ctx: Context
+    ) -> StopEvent:
+        """Records the extracted data to the agent data API"""
+        try:
+            logger.info(f"Recorded extracted data for file {event.data.file_name}")
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="info",
+                    message=f"Recorded extracted data for file {event.data.file_name}",
+                )
+            )
+            # remove past data when reprocessing the same file
+            if event.data.file_hash:
+                existing_data = await get_data_client().untyped_search(
+                    filter={
+                        "file_hash": {
+                            "eq": event.data.file_hash,
+                        },
+                    },
+                )
+                if existing_data.items:
+                    logger.info(
+                        f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}"
+                    )
+                    await asyncio.gather(
+                        *[
+                            get_data_client().delete_item(item.id)
+                            for item in existing_data.items
+                        ]
+                    )
+            # finally, save the new data
+            item_id = await get_data_client().create_item(event.data)
+            return StopEvent(
+                result=item_id.id,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error recording extracted data for file {event.data.file_name}: {e}",
+                exc_info=True,
+            )
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="error",
+                    message=f"Error recording extracted data for file {event.data.file_name}: {e}",
+                )
+            )
+            raise e
+
+
+workflow = ProcessFileWorkflow(timeout=None)
+
+if __name__ == "__main__":
+    from dotenv import load_dotenv
+
+    load_dotenv()
+    logging.basicConfig(level=logging.INFO)
+
+    async def main():
+        file = await get_llama_cloud_client().files.upload_file(
+            upload_file=Path("test.pdf").open("rb")
+        )
+        await workflow.run(start_event=FileEvent(file_id=file.id))
+
+    asyncio.run(main())
@@ -0,0 +1,32 @@
+from typing import Any, Type
+
+import jsonref
+from pydantic import BaseModel, Field, create_model
+
+from extraction_review.config import InvoiceWithReconciliation
+
+
+async def get_extraction_schema_json() -> dict[str, Any]:
+    json_schema = InvoiceWithReconciliation.model_json_schema()
+    json_schema = jsonref.replace_refs(json_schema, proxies=False)
+    return json_schema
+
+
+def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]:
+    """
+    Converts a JSON schema back to a Pydantic model.
+    """
+    typemap = {
+        "string": str,
+        "integer": int,
+        "number": float,
+        "boolean": bool,
+        "array": list,
+        "object": dict,
+    }
+    fields = {}
+    for prop, meta in schema.get("properties", {}).items():
+        py_type = typemap.get(meta.get("type"), Any)
+        default = ... if prop in schema.get("required", []) else None
+        fields[prop] = (py_type, Field(default, description=meta.get("description")))
+    return create_model(schema.get("title", "DynamicModel"), **fields)
@@ -0,0 +1,24 @@
+We are building an invoice extraction and reconciliation workflow app.
+
+Invoices are parsed into structured data, then compared against indexed contracts to reconcile the invoice with its matching contract. Update the invoice record with contract-derived information and any discrepancies.
+
+Using the UI, the user should be able to:
+- add and index new contracts
+- add and reconcile new invoices
+
+This should be based off of the base extraction review template, which has 2 pages, one that displays a table of all extracted items (one row per invoice), and one for the item details (the extracted data for one invoice, e.g. total and line items). The items and details view should show the invoices.
+
+Contracts can remain largely invisible in the UI for now, but there should be a minimal way to add them. These should be placed into a LlamaCloud index (which parses PDFs to plain text for retrieval).
+
+The stored schema should extend the extracted invoice schema with reconciliation fields, such as links to the matched contract, a match confidence/score, and a structured list of discrepancies.
+
+Matching should retrieve candidate contracts and then use an LLM, with context for both the candidate contracts and the invoice data, to make the final selection and provide rationale. When no contract matches, record that outcome clearly.
+
+When matching and reconciling, consider:
+- Whether there is any plausible matching contract versus only irrelevant results (e.g., vendor name, contract dates/ranges, contract or PO numbers).
+- Whether payment terms are matching (at minimum).
+- Optionally, check other obvious alignments if cheaply available (e.g., totals, vendor identifiers).
+
+Represent reconciliation results in the details view with a clear, structured list of discrepancies (e.g., field, invoice_value, contract_value, optional note/severity).
+
+The vast majority of this change should be kept in the python codebase. Some minor changes may need to be added to the UI, however do not do anything complex, just a button or small widget.
@@ -0,0 +1,2 @@
+def test_placeholder():
+    pass
@@ -0,0 +1,43 @@
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.*
+.yarn/*
+!.yarn/patches
+!.yarn/plugins
+!.yarn/releases
+!.yarn/versions
+
+# testing
+/coverage
+
+# next.js
+/.next/
+/out/
+/dist/
+
+# production
+/build
+
+# misc
+.DS_Store
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# env files (can opt-in for committing if needed)
+.env*
+
+# vercel
+.vercel
+
+# typescript
+*.tsbuildinfo
+next-env.d.ts
+
@@ -0,0 +1,7 @@
+# Data Extraction UI
+
+This is a simple Vite + React template that builds on the @llamaindex/agent-app ui component library
+for displaying tables of extracted data.
+
+Ideally, run this with `llamactl` from the parent directory (see [README.md](../README.md)).
+You can also run it standalone with `npm run dev`, but workflow integrations will not work.
@@ -0,0 +1,21 @@
+{
+  "$schema": "https://ui.shadcn.com/schema.json",
+  "style": "new-york",
+  "rsc": true,
+  "tsx": true,
+  "tailwind": {
+    "config": "",
+    "css": "src/index.css",
+    "baseColor": "zinc",
+    "cssVariables": true,
+    "prefix": ""
+  },
+  "aliases": {
+    "components": "@/components",
+    "utils": "@/lib/utils",
+    "ui": "@/components/ui",
+    "lib": "@/lib",
+    "hooks": "@/hooks"
+  },
+  "iconLibrary": "lucide"
+}
@@ -0,0 +1,12 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Review</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
@@ -0,0 +1,45 @@
+{
+  "name": "extraction-review-ui",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "tsc && vite build",
+    "preview": "vite preview",
+    "lint": "tsc --noEmit",
+    "format": "prettier --write src",
+    "format-check": "prettier --check src",
+    "all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build",
+    "all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build"
+  },
+  "dependencies": {
+    "@babel/runtime": "^7.27.6",
+    "@lezer/highlight": "^1.2.1",
+    "@llamaindex/ui": "^2.1.2",
+    "@radix-ui/themes": "^3.2.1",
+    "class-variance-authority": "^0.7.1",
+    "clsx": "^2.1.1",
+    "llama-cloud-services": "^0.3.4",
+    "lucide-react": "^0.514.0",
+    "react": "^18.3.0",
+    "react-dom": "^18.3.0",
+    "react-router-dom": "^6.30.0",
+    "sonner": "^2.0.5",
+    "tw-animate-css": "^1.3.5"
+  },
+  "devDependencies": {
+    "@tailwindcss/postcss": "^4.1.10",
+    "@types/node": "^20",
+    "@types/react": "^19",
+    "@types/react-dom": "^19",
+    "@vitejs/plugin-react": "^4.3.4",
+    "postcss": "^8.5.5",
+    "prettier": "^3.6.2",
+    "tailwind-merge": "^3.3.1",
+    "tailwindcss": "^4.1.8",
+    "typescript": "^5",
+    "vite": "^6.0.5"
+  },
+  "packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912"
+}
@@ -0,0 +1,7 @@
+const config = {
+  plugins: {
+    "@tailwindcss/postcss": {},
+  },
+};
+
+export default config; 
@@ -0,0 +1,70 @@
+import React from "react";
+import { Routes, Route } from "react-router-dom";
+import { Theme } from "@radix-ui/themes";
+import {
+  Breadcrumb,
+  BreadcrumbItem,
+  BreadcrumbList,
+  BreadcrumbSeparator,
+} from "@llamaindex/ui";
+import { Link } from "react-router-dom";
+import { Toaster } from "@llamaindex/ui";
+import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext";
+import { MetadataProvider } from "@/lib/MetadataProvider";
+
+// Import pages
+import HomePage from "./pages/HomePage";
+import ItemPage from "./pages/ItemPage";
+
+/**
+ * Application root.
+ *
+ * Wraps the routed pages in the Radix theme, the metadata provider and the
+ * toolbar provider, and renders the shared toolbar above the page content.
+ * Routes: "/" renders HomePage and "/item/:itemId" renders ItemPage.
+ */
+export default function App() {
+  return (
+    <Theme>
+      <MetadataProvider>
+        <ToolbarProvider>
+          <div className="grid grid-rows-[auto_1fr] h-screen">
+            <Toolbar />
+            <main className="overflow-auto">
+              <Routes>
+                <Route path="/" element={<HomePage />} />
+                <Route path="/item/:itemId" element={<ItemPage />} />
+              </Routes>
+            </main>
+          </div>
+          <Toaster />
+        </ToolbarProvider>
+      </MetadataProvider>
+    </Theme>
+  );
+}
+
+/**
+ * Sticky page header.
+ *
+ * Renders the breadcrumb trail and the button nodes currently held in the
+ * toolbar context. A breadcrumb is rendered as a link only when it has an
+ * `href` and is not flagged as the current page; otherwise it is plain text.
+ */
+const Toolbar = () => {
+  const { buttons, breadcrumbs } = useToolbar();
+
+  return (
+    <header className="sticky top-0 z-50 flex h-16 shrink-0 items-center gap-2 border-b px-4 bg-white/95 backdrop-blur supports-[backdrop-filter]:bg-white/60">
+      <Breadcrumb>
+        <BreadcrumbList>
+          {breadcrumbs.map((item, index) => (
+            <React.Fragment key={index}>
+              {index > 0 && <BreadcrumbSeparator />}
+              <BreadcrumbItem>
+                {item.href && !item.isCurrentPage ? (
+                  <Link to={item.href} className="font-medium text-base">
+                    {item.label}
+                  </Link>
+                ) : (
+                  <span
+                    className={`font-medium ${index === 0 ? "text-base" : ""}`}
+                  >
+                    {item.label}
+                  </span>
+                )}
+              </BreadcrumbItem>
+            </React.Fragment>
+          ))}
+        </BreadcrumbList>
+      </Breadcrumb>
+      {buttons}
+    </header>
+  );
+};
@@ -0,0 +1,120 @@
+@import "tailwindcss";
+@import "tw-animate-css";
+
+@custom-variant dark (&:is(.dark *));
+
+@theme inline {
+  --radius-sm: calc(var(--radius) - 4px);
+  --radius-md: calc(var(--radius) - 2px);
+  --radius-lg: var(--radius);
+  --radius-xl: calc(var(--radius) + 4px);
+  --color-background: var(--background);
+  --color-foreground: var(--foreground);
+  --color-card: var(--card);
+  --color-card-foreground: var(--card-foreground);
+  --color-popover: var(--popover);
+  --color-popover-foreground: var(--popover-foreground);
+  --color-primary: var(--primary);
+  --color-primary-foreground: var(--primary-foreground);
+  --color-secondary: var(--secondary);
+  --color-secondary-foreground: var(--secondary-foreground);
+  --color-muted: var(--muted);
+  --color-muted-foreground: var(--muted-foreground);
+  --color-accent: var(--accent);
+  --color-accent-foreground: var(--accent-foreground);
+  --color-destructive: var(--destructive);
+  --color-border: var(--border);
+  --color-input: var(--input);
+  --color-ring: var(--ring);
+  --color-chart-1: var(--chart-1);
+  --color-chart-2: var(--chart-2);
+  --color-chart-3: var(--chart-3);
+  --color-chart-4: var(--chart-4);
+  --color-chart-5: var(--chart-5);
+  --color-sidebar: var(--sidebar);
+  --color-sidebar-foreground: var(--sidebar-foreground);
+  --color-sidebar-primary: var(--sidebar-primary);
+  --color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
+  --color-sidebar-accent: var(--sidebar-accent);
+  --color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
+  --color-sidebar-border: var(--sidebar-border);
+  --color-sidebar-ring: var(--sidebar-ring);
+}
+
+:root {
+  --radius: 0.625rem;
+  --card: oklch(1 0 0);
+  --card-foreground: oklch(0.141 0.005 285.823);
+  --popover: oklch(1 0 0);
+  --popover-foreground: oklch(0.141 0.005 285.823);
+  --primary: oklch(0.21 0.006 285.885);
+  --primary-foreground: oklch(0.985 0 0);
+  --secondary: oklch(0.967 0.001 286.375);
+  --secondary-foreground: oklch(0.21 0.006 285.885);
+  --muted: oklch(0.967 0.001 286.375);
+  --muted-foreground: oklch(0.552 0.016 285.938);
+  --accent: oklch(0.967 0.001 286.375);
+  --accent-foreground: oklch(0.21 0.006 285.885);
+  --destructive: oklch(0.577 0.245 27.325);
+  --border: oklch(0.92 0.004 286.32);
+  --input: oklch(0.92 0.004 286.32);
+  --ring: oklch(0.705 0.015 286.067);
+  --chart-1: oklch(0.646 0.222 41.116);
+  --chart-2: oklch(0.6 0.118 184.704);
+  --chart-3: oklch(0.398 0.07 227.392);
+  --chart-4: oklch(0.828 0.189 84.429);
+  --chart-5: oklch(0.769 0.188 70.08);
+  --sidebar: oklch(0.985 0 0);
+  --sidebar-foreground: oklch(0.141 0.005 285.823);
+  --sidebar-primary: oklch(0.21 0.006 285.885);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.967 0.001 286.375);
+  --sidebar-accent-foreground: oklch(0.21 0.006 285.885);
+  --sidebar-border: oklch(0.92 0.004 286.32);
+  --sidebar-ring: oklch(0.705 0.015 286.067);
+  --background: oklch(1 0 0);
+  --foreground: oklch(0.141 0.005 285.823);
+}
+
+.dark {
+  --background: oklch(0.141 0.005 285.823);
+  --foreground: oklch(0.985 0 0);
+  --card: oklch(0.21 0.006 285.885);
+  --card-foreground: oklch(0.985 0 0);
+  --popover: oklch(0.21 0.006 285.885);
+  --popover-foreground: oklch(0.985 0 0);
+  --primary: oklch(0.92 0.004 286.32);
+  --primary-foreground: oklch(0.21 0.006 285.885);
+  --secondary: oklch(0.274 0.006 286.033);
+  --secondary-foreground: oklch(0.985 0 0);
+  --muted: oklch(0.274 0.006 286.033);
+  --muted-foreground: oklch(0.705 0.015 286.067);
+  --accent: oklch(0.274 0.006 286.033);
+  --accent-foreground: oklch(0.985 0 0);
+  --destructive: oklch(0.704 0.191 22.216);
+  --border: oklch(1 0 0 / 10%);
+  --input: oklch(1 0 0 / 15%);
+  --ring: oklch(0.552 0.016 285.938);
+  --chart-1: oklch(0.488 0.243 264.376);
+  --chart-2: oklch(0.696 0.17 162.48);
+  --chart-3: oklch(0.769 0.188 70.08);
+  --chart-4: oklch(0.627 0.265 303.9);
+  --chart-5: oklch(0.645 0.246 16.439);
+  --sidebar: oklch(0.21 0.006 285.885);
+  --sidebar-foreground: oklch(0.985 0 0);
+  --sidebar-primary: oklch(0.488 0.243 264.376);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.274 0.006 286.033);
+  --sidebar-accent-foreground: oklch(0.985 0 0);
+  --sidebar-border: oklch(1 0 0 / 10%);
+  --sidebar-ring: oklch(0.552 0.016 285.938);
+}
+
+@layer base {
+  * {
+    @apply border-border outline-ring/50;
+  }
+  body {
+    @apply bg-background text-foreground;
+  }
+}
@@ -0,0 +1,71 @@
+import { createContext, useContext, ReactNode, useMemo } from "react";
+import { ApiProvider, ApiClients } from "@llamaindex/ui";
+import { useMetadata, Metadata } from "./useMetadata";
+import { createBaseWorkflowClient, createClients } from "./client";
+import { Clock, XCircle } from "lucide-react";
+
+interface MetadataContextValue {
+  metadata: Metadata;
+  clients: ApiClients;
+}
+
+const MetadataContext = createContext<MetadataContextValue | null>(null);
+
+/**
+ * Outer provider for backend metadata.
+ *
+ * Supplies an ApiProvider that only contains a workflows client (created once
+ * via useMemo with an empty dependency list) and renders
+ * InnerMetadataProvider around `children`.
+ */
+export function MetadataProvider({ children }: { children: ReactNode }) {
+  const baseClients: ApiClients = useMemo(() => {
+    return {
+      workflowsClient: createBaseWorkflowClient(),
+    } as ApiClients;
+  }, []);
+  return (
+    <ApiProvider clients={baseClients}>
+      <InnerMetadataProvider>{children}</InnerMetadataProvider>
+    </ApiProvider>
+  );
+}
+
+/**
+ * Loads metadata through useMetadata and, once available, exposes it together
+ * with the clients built from it.
+ *
+ * Renders a spinner while `loading` is true, an error panel when there is an
+ * error or no metadata/clients, and otherwise provides MetadataContext plus a
+ * nested ApiProvider (with the metadata-derived clients) to `children`.
+ */
+function InnerMetadataProvider({ children }: { children: ReactNode }) {
+  const { metadata, loading, error } = useMetadata();
+  // Clients are rebuilt only when the metadata value changes.
+  const clients = useMemo(
+    () => (metadata ? createClients(metadata) : undefined),
+    [metadata],
+  );
+
+  if (loading) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
+          <div className="text-sm text-gray-500">Loading configuration...</div>
+        </div>
+      </div>
+    );
+  }
+
+  // Missing metadata without an explicit error is reported as "Unknown error".
+  if (error || !metadata || !clients) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
+          <div className="text-sm text-gray-500">
+            Error loading configuration: {error || "Unknown error"}
+          </div>
+        </div>
+      </div>
+    );
+  }
+
+  return (
+    <MetadataContext.Provider value={{ metadata, clients }}>
+      <ApiProvider clients={clients}>{children}</ApiProvider>
+    </MetadataContext.Provider>
+  );
+}
+
+/**
+ * Returns the value of MetadataContext.
+ * Throws when no context value is available.
+ */
+export function useMetadataContext() {
+  const value = useContext(MetadataContext);
+  if (value) {
+    return value;
+  }
+  throw new Error("useMetadataContext must be used within MetadataProvider");
+}
@@ -0,0 +1,41 @@
+import React from "react";
+import { APP_TITLE } from "./config";
+
+export interface BreadcrumbItem {
+  label: string;
+  href?: string;
+  isCurrentPage?: boolean;
+}
+
+// Context holding the toolbar's button nodes and breadcrumb trail, together
+// with their setters. The default value has empty lists and no-op setters, so
+// consumers outside a provider read empty state and their updates are ignored.
+export const ToolbarCtx = React.createContext<{
+  buttons: React.ReactNode[];
+  setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void;
+  breadcrumbs: BreadcrumbItem[];
+  setBreadcrumbs: (items: BreadcrumbItem[]) => void;
+}>({
+  buttons: [],
+  setButtons: () => {},
+  breadcrumbs: [],
+  setBreadcrumbs: () => {},
+});
+
+/**
+ * Holds the toolbar state and provides it through ToolbarCtx.
+ *
+ * Buttons start out empty; breadcrumbs start with a single entry labelled
+ * APP_TITLE that links to "/".
+ */
+export const ToolbarProvider = ({
+  children,
+}: {
+  children: React.ReactNode;
+}) => {
+  const [buttons, setButtons] = React.useState<React.ReactNode[]>([]);
+  const [breadcrumbs, setBreadcrumbs] = React.useState<BreadcrumbItem[]>([
+    { label: APP_TITLE, href: "/" },
+  ]);
+
+  return (
+    <ToolbarCtx.Provider
+      value={{ buttons, setButtons, breadcrumbs, setBreadcrumbs }}
+    >
+      {children}
+    </ToolbarCtx.Provider>
+  );
+};
+
+/** Returns the current ToolbarCtx value. */
+export const useToolbar = () => {
+  return React.useContext(ToolbarCtx);
+};
@@ -0,0 +1,51 @@
+import { ExtractedData } from "llama-cloud-services/beta/agent";
+import {
+  ApiClients,
+  createWorkflowsClient,
+  createWorkflowsConfig,
+  createCloudAgentClient,
+  cloudApiClient,
+} from "@llamaindex/ui";
+import { AGENT_NAME } from "./config";
+import type { Metadata } from "./useMetadata";
+
+const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY;
+const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL;
+const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID;
+
+// Configure the platform client. Each setting is applied only when the
+// corresponding environment value read above is set.
+cloudApiClient.setConfig({
+  ...(apiBaseUrl && { baseUrl: apiBaseUrl }),
+  headers: {
+    // Optionally authenticate with a backend API token scoped to a project
+    // (VITE_LLAMA_CLOUD_API_KEY), e.g. for local development.
+    ...(platformToken && { authorization: `Bearer ${platformToken}` }),
+    // This header is required for requests to correctly scope to the agent's project
+    // when authenticating with a user cookie
+    ...(projectId && { "Project-Id": projectId }),
+  },
+});
+
+/**
+ * Builds a workflows client whose base URL is the deployment path
+ * `/deployments/<AGENT_NAME>/`.
+ */
+export function createBaseWorkflowClient(): ReturnType<
+  typeof createWorkflowsClient
+> {
+  const deploymentConfig = createWorkflowsConfig({
+    baseUrl: `/deployments/${AGENT_NAME}/`,
+  });
+  return createWorkflowsClient(deploymentConfig);
+}
+
+/**
+ * Builds the full set of API clients for the app: a workflows client, the
+ * shared cloud API client, and an agent data client bound to the collection
+ * named in `metadata.extracted_data_collection`.
+ */
+export function createClients(metadata: Metadata): ApiClients {
+  const workflowsClient = createBaseWorkflowClient();
+  // `window` is checked so this also works where no window object exists.
+  const currentUrl =
+    typeof window !== "undefined" ? window.location.href : undefined;
+  const agentDataClient = createCloudAgentClient<ExtractedData<any>>({
+    client: cloudApiClient,
+    windowUrl: currentUrl,
+    collection: metadata.extracted_data_collection,
+  });
+
+  const clients = { workflowsClient, cloudApiClient, agentDataClient };
+  return clients as ApiClients;
+}
@@ -0,0 +1,2 @@
+export const APP_TITLE = "Extraction Review";
+export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME;
@@ -0,0 +1,39 @@
+import type {
+  ExtractedData,
+  TypedAgentData,
+} from "llama-cloud-services/beta/agent";
+
+/**
+ * Serializes `data` as pretty-printed JSON (2-space indent) and triggers a
+ * browser download named `filename`.
+ *
+ * A temporary anchor pointing at an object URL is appended to the document,
+ * clicked and removed again; the object URL is revoked afterwards.
+ */
+export function downloadJSON<T>(
+  data: T,
+  filename: string = "extraction-results.json",
+) {
+  const payload = new Blob([JSON.stringify(data, null, 2)], {
+    type: "application/json",
+  });
+  const objectUrl = URL.createObjectURL(payload);
+
+  const anchor = document.createElement("a");
+  anchor.href = objectUrl;
+  anchor.download = filename;
+  document.body.appendChild(anchor);
+  anchor.click();
+
+  // Cleanup
+  document.body.removeChild(anchor);
+  URL.revokeObjectURL(objectUrl);
+}
+
+/**
+ * Downloads one extracted data item as JSON.
+ *
+ * The file is named `<file_name>-<YYYY-MM-DD>.json`, using the item's
+ * `file_name` (or "item" when it is empty) and the date part of `createdAt`.
+ */
+export function downloadExtractedDataItem<T>(
+  item: TypedAgentData<ExtractedData<T>>,
+) {
+  const baseName = item.data.file_name || "item";
+  const [day] = item.createdAt.toISOString().split("T");
+
+  downloadJSON(item, `${baseName}-${day}.json`);
+}
@@ -0,0 +1,41 @@
+import { useWorkflowHandler, useWorkflowRun } from "@llamaindex/ui";
+import { useEffect, useState } from "react";
+
+export interface Metadata {
+  json_schema: any;
+  extracted_data_collection: string;
+}
+
+export interface UseMetadataResult {
+  metadata: Metadata;
+  loading: boolean;
+  error: string | undefined;
+}
+
+/**
+ * Starts the "metadata" workflow once on mount and returns the payload of the
+ * first handler event whose type ends with "MetadataResponse".
+ *
+ * Returns `{ metadata, loading, error }`; `metadata` is undefined until such
+ * an event is present in the handler's events.
+ *
+ * NOTE(review): `loading` is cleared as soon as the run request settles, which
+ * happens before the response event can be found below, so callers may see
+ * `loading === false` with `metadata === undefined` and no error — confirm
+ * this intermediate state is handled as intended by consumers.
+ * NOTE(review): the returned `metadata` can be undefined, whereas the
+ * UseMetadataResult interface declares it as always present — confirm.
+ */
+export function useMetadata() {
+  const run = useWorkflowRun();
+  const [handlerId, setHandlerId] = useState<string | undefined>(undefined);
+  // An empty id is passed until the run has been started.
+  const handler = useWorkflowHandler(handlerId ?? "");
+  const [error, setError] = useState<string | undefined>(undefined);
+  const [loading, setLoading] = useState(true);
+
+  // Empty dependency list: the workflow is started on mount only.
+  useEffect(() => {
+    setLoading(true);
+    run
+      .runWorkflow("metadata", {})
+      .then((handlerSummary) => {
+        setHandlerId(handlerSummary.handler_id);
+      })
+      .catch((error) => {
+        setError(error.message);
+      })
+      .finally(() => {
+        setLoading(false);
+      });
+  }, []);
+  const stopEvent = handler.events.find((event) =>
+    event.type.endsWith("MetadataResponse"),
+  );
+  const metadata = stopEvent?.data as Metadata | undefined;
+  return { metadata, loading, error };
+}
@@ -0,0 +1,6 @@
+import { clsx, type ClassValue } from "clsx";
+import { twMerge } from "tailwind-merge";
+
+/**
+ * Combines class name inputs with `clsx` and passes the result through
+ * `twMerge`.
+ */
+export function cn(...inputs: ClassValue[]) {
+  return twMerge(clsx(inputs));
+}
@@ -0,0 +1,14 @@
+import { StrictMode } from "react";
+import { createRoot } from "react-dom/client";
+import { HashRouter } from "react-router-dom";
+import App from "./App";
+import "@llamaindex/ui/styles.css";
+import "./index.css";
+
+createRoot(document.getElementById("root")!).render(
+  <StrictMode>
+    <HashRouter>
+      <App />
+    </HashRouter>
+  </StrictMode>,
+);
@@ -0,0 +1,23 @@
+.main {
+  padding: 1rem;
+}
+
+.grid {
+  display: flex;
+  flex-direction: row;
+  gap: 1rem;
+  margin-bottom: 1rem;
+  & > * {
+    flex: 1;
+  }
+}
+
+.commandBar {
+  display: flex;
+  justify-content: flex-end;
+  margin-bottom: 1rem;
+}
+
+.progressBar {
+  margin-bottom: 1rem;
+}
@@ -0,0 +1,98 @@
+import {
+  ItemCount,
+  WorkflowTrigger,
+  WorkflowProgressBar,
+  ExtractedDataItemGrid,
+  useWorkflowHandlerList,
+} from "@llamaindex/ui";
+import type { TypedAgentData } from "llama-cloud-services/beta/agent";
+import styles from "./HomePage.module.css";
+import { useNavigate } from "react-router-dom";
+import { useEffect, useState } from "react";
+
+/**
+ * Home page. Renders TaskList keyed by `taskKey`, so the list is remounted
+ * whenever that key changes.
+ */
+export default function HomePage() {
+  const { taskKey } = taskCompletedState();
+  return <TaskList key={taskKey} />;
+}
+
+/**
+ * Tracks the number of running "process-file" handlers and exposes a key that
+ * is incremented whenever that number drops, i.e. when a task has completed.
+ * The key can be used to force a re-render of the task list.
+ */
+function taskCompletedState() {
+  const { handlers } = useWorkflowHandlerList("process-file");
+  const activeCount = handlers.filter(
+    ({ status }) => status === "running",
+  ).length;
+  const [runningTaskCount, setRunningTaskCount] = useState(activeCount);
+  const [taskKey, setTaskKey] = useState(0);
+  useEffect(() => {
+    const taskFinished = activeCount < runningTaskCount;
+    if (taskFinished) {
+      // forcefully reload task list after a task is completed
+      setTaskKey(taskKey + 1);
+    }
+    setRunningTaskCount(activeCount);
+  }, [activeCount]);
+  return { runningTaskCount, taskKey };
+}
+
+/**
+ * Main list view: item counters, upload triggers for the "process-file"
+ * (invoice) and "index-contract" (contract) workflows, a progress bar for
+ * "process-file", and the grid of extracted items. Clicking a grid row
+ * navigates to `/item/<id>`.
+ *
+ * NOTE(review): `styles.page` is referenced here, but the CSS module shown
+ * for this page only defines main, grid, commandBar and progressBar — confirm.
+ */
+function TaskList() {
+  const navigate = useNavigate();
+  const goToItem = (item: TypedAgentData) => {
+    navigate(`/item/${item.id}`);
+  };
+  return (
+    <div className={styles.page}>
+      <main className={styles.main}>
+        <div className={styles.grid}>
+          <ItemCount title="Total Items" />
+          <ItemCount
+            title="Reviewed"
+            filter={{
+              status: { eq: "approved" },
+            }}
+          />
+          <ItemCount
+            title="Needs Review"
+            filter={{
+              status: { eq: "pending_review" },
+            }}
+          />
+        </div>
+        <div className={styles.commandBar}>
+          <WorkflowTrigger
+            workflowName="process-file"
+            customWorkflowInput={(files) => {
+              return {
+                file_id: files[0].fileId,
+              };
+            }}
+            title="Upload Invoice"
+          />
+          <WorkflowTrigger
+            workflowName="index-contract"
+            customWorkflowInput={(files) => {
+              return {
+                file_id: files[0].fileId,
+              };
+            }}
+            title="Upload Contract"
+          />
+        </div>
+        <WorkflowProgressBar
+          className={styles.progressBar}
+          workflowName="process-file"
+        />
+        <ExtractedDataItemGrid
+          onRowClick={goToItem}
+          builtInColumns={{
+            fileName: true,
+            status: true,
+            createdAt: true,
+            itemsToReview: true,
+            actions: true,
+          }}
+        />
+      </main>
+    </div>
+  );
+}
@@ -0,0 +1,152 @@
+import { useEffect, useState } from "react";
+import {
+  AcceptReject,
+  ExtractedDataDisplay,
+  FilePreview,
+  useItemData,
+  type Highlight,
+  Button,
+} from "@llamaindex/ui";
+import { Clock, XCircle, Download } from "lucide-react";
+import { useParams } from "react-router-dom";
+import { useToolbar } from "@/lib/ToolbarContext";
+import { useNavigate } from "react-router-dom";
+import { modifyJsonSchema } from "@llamaindex/ui/lib";
+import { APP_TITLE } from "@/lib/config";
+import { downloadExtractedDataItem } from "@/lib/export";
+import { useMetadataContext } from "@/lib/MetadataProvider";
+
+/**
+ * Detail page for a single extracted item.
+ *
+ * Loads the item identified by the `itemId` route parameter, shows the source
+ * file preview on the left and the editable extracted data on the right, and
+ * registers an "Export JSON" button plus accept/reject controls in the
+ * toolbar. While mounted it also sets the breadcrumb to the item's file name;
+ * both toolbar buttons and breadcrumb are reset on unmount.
+ */
+export default function ItemPage() {
+  const { itemId } = useParams<{ itemId: string }>();
+  const { setButtons, setBreadcrumbs } = useToolbar();
+  const [highlight, setHighlight] = useState<Highlight | undefined>(undefined);
+  const { metadata } = useMetadataContext();
+  // Use the hook to fetch item data
+  const itemHookData = useItemData<any>({
+    // order/remove fields as needed here
+    jsonSchema: modifyJsonSchema(metadata.json_schema, {}),
+    itemId: itemId as string,
+    isMock: false,
+  });
+
+  const navigate = useNavigate();
+
+  // Update breadcrumb when item data loads
+  useEffect(() => {
+    const fileName = itemHookData.item?.data?.file_name;
+    if (fileName) {
+      setBreadcrumbs([
+        { label: APP_TITLE, href: "/" },
+        {
+          label: fileName,
+          isCurrentPage: true,
+        },
+      ]);
+    }
+
+    return () => {
+      // Reset to default breadcrumb when leaving the page
+      setBreadcrumbs([{ label: APP_TITLE, href: "/" }]);
+    };
+  }, [itemHookData.item?.data?.file_name, setBreadcrumbs]);
+
+  // Register the toolbar actions for this page.
+  useEffect(() => {
+    setButtons(() => [
+      // The toolbar renders this array directly, so the element needs a
+      // stable key to avoid React's missing-key warning.
+      <div key="item-page-actions" className="ml-auto flex items-center gap-2">
+        <Button
+          variant="outline"
+          size="sm"
+          onClick={() => {
+            if (itemData) {
+              downloadExtractedDataItem(itemData);
+            }
+          }}
+          disabled={!itemData}
+        >
+          <Download className="h-4 w-4 mr-2" />
+          Export JSON
+        </Button>
+        <AcceptReject<any>
+          itemData={itemHookData}
+          onComplete={() => navigate("/")}
+        />
+      </div>,
+    ]);
+    return () => {
+      setButtons(() => []);
+    };
+  }, [itemHookData.data, setButtons]);
+
+  // Declared after the effects above; the effect callbacks only run after
+  // render, by which time these bindings are initialised.
+  const {
+    item: itemData,
+    updateData,
+    loading: isLoading,
+    error,
+  } = itemHookData;
+
+  if (isLoading) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
+          <div className="text-sm text-gray-500">Loading item...</div>
+        </div>
+      </div>
+    );
+  }
+
+  if (error || !itemData) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
+          <div className="text-sm text-gray-500">
+            Error loading item: {error || "Item not found"}
+          </div>
+        </div>
+      </div>
+    );
+  }
+
+  return (
+    <div className="flex h-full bg-gray-50">
+      {/* Left Side - File Preview */}
+      <div className="w-1/2 border-r border-gray-200 bg-white">
+        {itemData.data.file_id && (
+          <FilePreview
+            fileId={itemData.data.file_id}
+            onBoundingBoxClick={(box, pageNumber) => {
+              console.log("Bounding box clicked:", box, "on page:", pageNumber);
+            }}
+            highlight={highlight}
+          />
+        )}
+      </div>
+
+      {/* Right Side - Review Panel */}
+      <div className="flex-1 bg-white h-full overflow-y-auto">
+        <div className="p-4 space-y-4">
+          {/* Extracted Data */}
+          <ExtractedDataDisplay<any>
+            extractedData={itemData.data}
+            title="Extracted Data"
+            onChange={(updatedData) => {
+              updateData(updatedData);
+            }}
+            onClickField={(args) => {
+              // TODO: set multiple highlights
+              // Only the page of the first citation is used (page 1 when
+              // there is none); the box coordinates are fixed placeholders.
+              setHighlight({
+                page: args.metadata?.citation?.[0]?.page ?? 1,
+                x: 100,
+                y: 100,
+                width: 0,
+                height: 0,
+              });
+            }}
+            jsonSchema={itemHookData.jsonSchema}
+          />
+        </div>
+      </div>
+    </div>
+  );
+}
@@ -0,0 +1,15 @@
+/// <reference types="vite/client" />
+
+// Typed view of the `import.meta.env` values used by the UI.
+// NOTE(review): vite.config.ts defines
+// `import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH`, which is not
+// declared here, and only defines VITE_LLAMA_DEPLOY_PROJECT_ID when a project
+// id is set, although it is declared as always present — confirm.
+interface ImportMetaEnv {
+  readonly VITE_LLAMA_CLOUD_API_KEY?: string;
+  readonly VITE_LLAMA_CLOUD_BASE_URL?: string;
+
+  // injected from llama_deploy
+  readonly VITE_LLAMA_DEPLOY_BASE_PATH: string;
+  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string;
+  readonly VITE_LLAMA_DEPLOY_PROJECT_ID: string;
+}
+
+interface ImportMeta {
+  readonly env: ImportMetaEnv;
+}
@@ -0,0 +1,31 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "useDefineForClassFields": true,
+    "lib": ["ES2020", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "noEmit": true,
+    "jsx": "react-jsx",
+    
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    
+    /* Path mapping */
+    "baseUrl": ".",
+    "paths": {
+      "@/*": ["./src/*"]
+    }
+  },
+  "include": ["src", "vite.config.ts", "src/vite-env.d.ts"],
+  "exclude": ["node_modules"]
+}
@@ -0,0 +1,43 @@
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react";
+import path from "path";
+
+// https://vitejs.dev/config/
+// Builds the Vite config from process environment variables: the dev server
+// port (PORT, default 3000), the base path, and the deployment values that
+// are exposed to client code through `define`.
+export default defineConfig(({}) => {
+  const deploymentName = process.env.LLAMA_DEPLOY_DEPLOYMENT_NAME;
+  const basePath = process.env.LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH;
+  const projectId = process.env.LLAMA_DEPLOY_PROJECT_ID;
+  const port = process.env.PORT ? Number(process.env.PORT) : 3000;
+  const baseUrl = process.env.LLAMA_CLOUD_BASE_URL;
+  return {
+    plugins: [react()],
+    resolve: {
+      alias: {
+        // Lets source files import from "@/..." relative to ./src.
+        "@": path.resolve(__dirname, "./src"),
+      },
+    },
+    server: {
+      port: port,
+      host: true,
+    },
+    build: {
+      outDir: "dist",
+      sourcemap: true,
+    },
+    base: basePath,
+    define: {
+      // The deployment name and base path are always defined; the project id
+      // and cloud base URL are only defined when their variable is set.
+      "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME": JSON.stringify(
+        deploymentName
+      ),
+      "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH": JSON.stringify(basePath),
+      ...(projectId && {
+        "import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID":
+          JSON.stringify(projectId),
+      }),
+      ...(baseUrl && {
+        "import.meta.env.VITE_LLAMA_CLOUD_BASE_URL": JSON.stringify(baseUrl),
+      }),
+    },
+  };
+});
@@ -0,0 +1,2 @@
+# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
+{{ _copier_answers|to_nice_yaml -}}