Add back classify-extract-sec

2026-06-30 21:57:55 -04:00 · 2025-11-04 17:29:54 -05:00
parent 9433b0dab5
commit 726d6c0e4c
35 changed files with 2057 additions and 2 deletions
@@ -0,0 +1,3 @@
+# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
+_commit: v0.2.1
+_src_path: https://github.com/run-llama/template-workflow-data-extraction
@@ -0,0 +1,2 @@
+# copy to .env and place any needed secrets here. LLAMA_CLOUD_API_KEY will be automatically set
+# OPENAI_API_KEY=sk-xxx
@@ -0,0 +1,7 @@
+.env
+__pycache__
+workflows.db
+
+.venv
+package-lock.json
+node_modules
@@ -1,2 +1,67 @@
-# template-workflow-classify-extract-sec
-Llama Index Workflow Template
+# SEC Filing Data Extraction and Analysis
+
+A LlamaAgents application for extracting structured information from SEC filings using LlamaClassify and LlamaExtract. This application automatically classifies SEC documents (10-K, 10-Q, 8-K, or other) and extracts relevant financial and business information tailored to each filing type.
+
+## Features
+
+- **Intelligent Classification**: Uses LlamaClassify to automatically identify SEC filing types (10-K, 10-Q, 8-K, other)
+- **Dynamic Schema Selection**: Applies specialized extraction schemas based on document type
+- **Comprehensive Data Extraction**: Extracts filing-specific information:
+  - **10-K**: Annual reports with financial metrics, risk factors, business descriptions, executive information
+  - **10-Q**: Quarterly reports with period-over-period comparisons and updates
+  - **8-K**: Current reports with material event information and impact analysis
+  - **Other**: Catch-all for S-1, DEF 14A, 13F, and other filing types
+- **Agent Data Storage**: Stores extracted data in LlamaCloud Agent Data for easy querying and analysis
+- **UI Integration**: Web interface for reviewing and managing extracted data
+
+## Configuration
+
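+All main configuration lives in `src/extraction_review/config.py`: the extraction agent name, the Agent Data collection name, the per-filing-type extraction schemas, and the extraction settings (`EXTRACT_CONFIG`).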
+
+## How It Works
+
+The application uses a multi-step workflow powered by LlamaIndex:
+
+1. **File Upload**: User uploads an SEC filing document through the UI
+2. **Download**: File is downloaded from LlamaCloud storage
+3. **Classification**: LlamaClassify analyzes the first 5 pages to determine filing type (10-K, 10-Q, 8-K, or other)
+4. **Schema Selection**: Appropriate extraction schema is selected based on classification
+5. **Extraction**: LlamaExtract processes the document using the selected schema
+6. **Storage**: Extracted data is stored in Agent Data with deduplication by file hash
+7. **Review**: UI displays extracted data for review and editing
+
+### Workflows
+
+The application includes two main workflows:
+
+- **`process-file`** (`src/extraction_review/process_file.py`): Main workflow for processing SEC filings
+  - Steps: download → classify → extract → store
+  - Uses typed context to pass state between steps
+  - Streams progress updates to UI via `UIToast` events
+
+- **`metadata`** (`src/extraction_review/metadata_workflow.py`): Exposes configuration metadata to UI
+  - Returns JSON schema and collection name for dynamic UI generation
+
+## Linting and Type Checking
+
+The Python and JavaScript packages include scripts to lint, format, type check, and test the code.
+
+To check and fix python code:
+
+```bash
+uv run hatch run lint
+uv run hatch run typecheck
+uv run hatch run test
+# run format, lint, and test in one go (typecheck is not included)
+uv run hatch run all-fix
+```
+
+To check and fix javascript code, within the `ui` directory:
+
+```bash
+pnpm run lint
+pnpm run typecheck
+pnpm run test
+# run all at once
+pnpm run all-fix
+```
@@ -0,0 +1,51 @@
+[project]
+name = "extraction-review"
+version = "0.1.0"
+description = "Extracts data"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "llama-cloud-services>=0.6.69",
+    "llama-index-workflows>=2.2.0,<3.0.0",
+    "python-dotenv>=1.1.0",
+    "jsonref>=1.1.0",
+    "click>=8.2.1,<8.3.0",
+    "httpx>=0.28.1",
+    "llama-index-core>=0.14.0",
+]
+
+[dependency-groups]
+dev = [
+    "ruff>=0.11.10",
+    "typescript>=0.0.12",
+    "ty>=0.0.1a16",
+    "pytest>=8.4.1",
+    "hatch>=1.14.1",
+    "llamactl>=0.3.0"
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.envs.default.scripts]
+"format" = "ruff format ."
+"format-check" = "ruff format --check ."
+"lint" = "ruff check --fix ."
+"lint-check" = ["ruff check ."]
+typecheck = "ty check src"
+test = "pytest"
+"all-check" = ["format-check", "lint-check", "test"]
+"all-fix" = ["format", "lint", "test"]
+
+[tool.llamadeploy]
+env_files = [".env"]
+llama_cloud = true
+
+[tool.llamadeploy.workflows]
+process-file = "extraction_review.process_file:workflow"
+metadata = "extraction_review.metadata_workflow:workflow"
+
+[tool.llamadeploy.ui]
+directory = "ui"
+
@@ -0,0 +1,88 @@
+import functools
+import os
+from typing import Any
+import httpx
+
+from llama_cloud_services import ExtractionAgent, LlamaExtract
+from llama_cloud.core.api_error import ApiError
+from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData
+from llama_cloud_services.beta.classifier.client import ClassifyClient
+from llama_cloud.client import AsyncLlamaCloud
+import logging
+
+from extraction_review.config import (
+    EXTRACT_CONFIG,
+    EXTRACTED_DATA_COLLECTION,
+    EXTRACTION_AGENT_NAME,
+    USE_REMOTE_EXTRACTION_SCHEMA,
+    ExtractionSchema,
+)
+
+logger = logging.getLogger(__name__)
+
+# deployed agents may infer their name from the deployment name
+# Note: Make sure that an agent deployment with this name actually exists
+# otherwise calls to get or set data will fail. You may need to adjust the `or `
+# name for development
+agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME")
+# required for all llama cloud calls
+api_key = os.environ["LLAMA_CLOUD_API_KEY"]
+# get this in case running against a different environment than production
+base_url = os.getenv("LLAMA_CLOUD_BASE_URL")
+project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID")
+
+
+@functools.lru_cache(maxsize=None)
+def get_extract_agent() -> ExtractionAgent:
+    extract_api = LlamaExtract(
+        api_key=api_key, base_url=base_url, project_id=project_id
+    )
+
+    try:
+        existing = extract_api.get_agent(EXTRACTION_AGENT_NAME)
+        if not USE_REMOTE_EXTRACTION_SCHEMA:
+            existing.data_schema = ExtractionSchema
+            existing.config = EXTRACT_CONFIG
+        return existing
+    except ApiError as e:
+        if e.status_code == 404:
+            if USE_REMOTE_EXTRACTION_SCHEMA:
+                logger.warning(
+                    "Extraction agent does not exist, creating a new one from the local schema"
+                )
+            return extract_api.create_agent(
+                name=EXTRACTION_AGENT_NAME,
+                data_schema=ExtractionSchema,
+                config=EXTRACT_CONFIG,
+            )
+        else:
+            raise
+
+
+@functools.lru_cache(maxsize=None)
+def get_data_client() -> AsyncAgentDataClient:
+    return AsyncAgentDataClient(
+        deployment_name=agent_name,
+        collection=EXTRACTED_DATA_COLLECTION,
+        type=ExtractedData[Any],
+        client=get_llama_cloud_client(),
+    )
+
+
+@functools.lru_cache(maxsize=None)
+def get_llama_cloud_client():
+    return AsyncLlamaCloud(
+        base_url=base_url,
+        token=api_key,
+        httpx_client=httpx.AsyncClient(
+            timeout=60, headers={"Project-Id": project_id} if project_id else None
+        ),
+    )
+
+
+@functools.lru_cache(maxsize=None)
+def get_classifier_client():
+    return ClassifyClient(
+        client=get_llama_cloud_client(),
+        project_id=project_id,
+    )
@@ -0,0 +1,362 @@
+"""
+For simple configuration of the extraction review application, just customize this file.
+
+If you need more control, feel free to edit the rest of the application
+"""
+
+from __future__ import annotations
+import os
+from typing import Type
+
+from llama_cloud import ExtractConfig
+from llama_cloud_services.extract import ExtractMode
+from pydantic import BaseModel, Field
+
+# If you change this to true, the schema and extraction configuration will be fetched from the remote extraction agent
+# rather than using the ExtractionSchema and configuration defined below.
+USE_REMOTE_EXTRACTION_SCHEMA: bool = False
+# The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments.
+# Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist.
+EXTRACTION_AGENT_NAME: str = (
+    os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "extraction-review"
+)
+# The name of the collection to use for storing extracted data. This will be qualified by the agent name.
+# When developing locally, this will use the _public collection (shared within the project), otherwise agent
+# data is isolated to each agent
+EXTRACTED_DATA_COLLECTION: str = "sec-filing-extraction"
+
+
+# SEC Filing Classification Types
+SEC_FILING_TYPES = ["10-K", "10-Q", "8-K", "other"]
+
+
+# Base class for common fields across all SEC filings.
+# Filing10K, Filing10Q, Filing8K and FilingOther all inherit from it.
+# Only company_name is required; every other field defaults to None.
+# Dates are plain strings: the YYYY-MM-DD format is requested through the field
+# descriptions only, nothing in this model validates it.
+class BaseSECFiling(BaseModel):
+    """Common fields present in all SEC filings"""
+
+    company_name: str = Field(
+        description="The full legal name of the company filing the document"
+    )
+    ticker_symbol: str | None = Field(
+        default=None,
+        description="The stock ticker symbol of the company. May not be present for all filings.",
+    )
+    cik: str | None = Field(
+        default=None,
+        description="Central Index Key - the unique identifier assigned by the SEC to the company",
+    )
+    filing_date: str | None = Field(
+        default=None,
+        description="The date the document was filed with the SEC (format: YYYY-MM-DD)",
+    )
+    fiscal_year_end: str | None = Field(
+        default=None,
+        description="The fiscal year end date for the company (format: YYYY-MM-DD)",
+    )
+    sic_code: str | None = Field(
+        default=None,
+        description="Standard Industrial Classification code for the company's industry",
+    )
+
+
+# Financial metrics that appear in multiple filing types.
+# Nested inside Filing10K.financial_metrics, Filing10Q.financial_metrics and
+# FilingOther.financial_data. Every field is optional, and amounts are kept as
+# free-text strings (currency and magnitude included) rather than numbers.
+class FinancialMetrics(BaseModel):
+    """Key financial metrics extracted from statements"""
+
+    total_revenue: str | None = Field(
+        default=None,
+        description="Total revenue/sales for the period. Include currency and amount (e.g., '$1.2B USD')",
+    )
+    net_income: str | None = Field(
+        default=None,
+        description="Net income/profit for the period. Include currency and amount",
+    )
+    total_assets: str | None = Field(
+        default=None,
+        description="Total assets as of the balance sheet date. Include currency and amount",
+    )
+    total_liabilities: str | None = Field(
+        default=None,
+        description="Total liabilities as of the balance sheet date. Include currency and amount",
+    )
+    stockholders_equity: str | None = Field(
+        default=None,
+        description="Total stockholders' equity. Include currency and amount",
+    )
+    cash_and_equivalents: str | None = Field(
+        default=None,
+        description="Cash and cash equivalents. Include currency and amount",
+    )
+    earnings_per_share: str | None = Field(
+        default=None, description="Earnings per share (EPS) for the period"
+    )
+
+
+# Single risk factor entry. In this module it is referenced only by
+# Filing10K.risk_factors; Filing10Q summarises risk changes as free text instead.
+# Both fields are required.
+class RiskFactor(BaseModel):
+    """Individual risk factor identified in the filing"""
+
+    category: str = Field(
+        description="Category of risk (e.g., 'Market Risk', 'Operational Risk', 'Legal Risk')"
+    )
+    description: str = Field(description="Brief description of the specific risk")
+
+
+# 10-K: Annual Report
+# Registered under the "10-K" key of FILING_SCHEMAS and used as the base of the
+# default ExtractionSchema. Adds annual-report fields on top of BaseSECFiling;
+# all of them are optional.
+class Filing10K(BaseSECFiling):
+    """
+    Form 10-K is an annual report required by the SEC that provides a comprehensive
+    summary of a company's financial performance.
+    """
+
+    # Plain str with a default: the value "10-K" is suggested, not enforced.
+    document_type: str = Field(default="10-K", description="Should always be '10-K'")
+    fiscal_year: int | None = Field(
+        default=None,
+        description="The fiscal year covered by this annual report (e.g., 2023)",
+    )
+
+    # Business overview
+    business_description: str | None = Field(
+        default=None,
+        description="A 2-3 sentence summary of the company's business and operations",
+    )
+
+    # Financial data
+    financial_metrics: FinancialMetrics | None = Field(
+        default=None, description="Key financial metrics from the annual statements"
+    )
+
+    # Risk factors
+    risk_factors: list[RiskFactor] | None = Field(
+        default=None,
+        description="List of material risk factors disclosed in the filing. Extract 3-5 most significant risks.",
+    )
+
+    # Management discussion
+    management_discussion_summary: str | None = Field(
+        default=None,
+        description="2-3 sentence summary of Management's Discussion and Analysis (MD&A) section",
+    )
+
+    # Legal proceedings
+    legal_proceedings: list[str] | None = Field(
+        default=None,
+        description="List of significant legal proceedings or litigation mentioned",
+    )
+
+    # Executive officers
+    executive_officers: list[str] | None = Field(
+        default=None,
+        description="Names and titles of key executive officers (CEO, CFO, etc.)",
+    )
+
+    # Auditor information
+    auditor_name: str | None = Field(
+        default=None,
+        description="Name of the independent registered public accounting firm",
+    )
+
+    # Key insights
+    key_highlights: list[str] | None = Field(
+        default=None,
+        description="3-5 key highlights or notable items from the annual report",
+    )
+
+
+# 10-Q: Quarterly Report
+# Registered under the "10-Q" key of FILING_SCHEMAS. Adds quarter-specific fields
+# on top of BaseSECFiling; all of them are optional.
+class Filing10Q(BaseSECFiling):
+    """
+    Form 10-Q is a quarterly report that provides a continuing view of a company's
+    financial position during the year.
+    """
+
+    # Plain str with a default: the value "10-Q" is suggested, not enforced.
+    document_type: str = Field(default="10-Q", description="Should always be '10-Q'")
+    fiscal_quarter: str | None = Field(
+        default=None,
+        description="The fiscal quarter covered (e.g., 'Q1 2024', 'Q2 2023')",
+    )
+    fiscal_year: int | None = Field(
+        default=None, description="The fiscal year for this quarter (e.g., 2024)"
+    )
+    period_end_date: str | None = Field(
+        default=None,
+        description="The end date of the quarterly period (format: YYYY-MM-DD)",
+    )
+
+    # Financial data
+    financial_metrics: FinancialMetrics | None = Field(
+        default=None, description="Key financial metrics from the quarterly statements"
+    )
+
+    # Comparison to prior periods (free text, e.g. 'up 15%', not numeric)
+    year_over_year_revenue_change: str | None = Field(
+        default=None,
+        description="Year-over-year revenue change percentage or description (e.g., 'up 15%')",
+    )
+    quarter_over_quarter_revenue_change: str | None = Field(
+        default=None,
+        description="Quarter-over-quarter revenue change percentage or description",
+    )
+
+    # Management discussion
+    management_discussion_summary: str | None = Field(
+        default=None,
+        description="2-3 sentence summary of Management's Discussion and Analysis for the quarter",
+    )
+
+    # Risk factors (a single free-text summary, unlike Filing10K's list of RiskFactor)
+    material_changes_to_risks: str | None = Field(
+        default=None,
+        description="Summary of any material changes to risk factors since the last 10-K",
+    )
+
+    # Legal updates
+    legal_proceedings_updates: list[str] | None = Field(
+        default=None,
+        description="Updates to legal proceedings or new litigation since last filing",
+    )
+
+    # Key insights
+    key_highlights: list[str] | None = Field(
+        default=None,
+        description="3-5 key highlights or notable items from the quarterly report",
+    )
+
+
+# 8-K: Current Report
+# Registered under the "8-K" key of FILING_SCHEMAS. Besides the inherited
+# company_name, event_summary is the only required field here.
+class Filing8K(BaseSECFiling):
+    """
+    Form 8-K is a current report used to notify investors of significant events
+    that shareholders should know about.
+    """
+
+    # Plain str with a default: the value "8-K" is suggested, not enforced.
+    document_type: str = Field(default="8-K", description="Should always be '8-K'")
+
+    # Event information
+    event_date: str | None = Field(
+        default=None,
+        description="The date of the event being reported (format: YYYY-MM-DD)",
+    )
+    event_type: str | None = Field(
+        default=None,
+        description="Type of event (e.g., 'Merger/Acquisition', 'Leadership Change', 'Earnings Release', 'Material Agreement')",
+    )
+    item_numbers: list[str] | None = Field(
+        default=None,
+        description="Item numbers from the 8-K form (e.g., ['1.01', '5.02']) indicating which sections are included",
+    )
+
+    # Event description (event_summary has no default, so it is required)
+    event_summary: str = Field(
+        description="2-4 sentence summary describing the material event being reported"
+    )
+    event_details: str | None = Field(
+        default=None,
+        description="More detailed description of the event and its implications",
+    )
+
+    # Financial impact
+    estimated_financial_impact: str | None = Field(
+        default=None,
+        description="Estimated financial impact of the event, if disclosed",
+    )
+
+    # Related parties
+    related_parties: list[str] | None = Field(
+        default=None,
+        description="Names of other companies, individuals, or entities involved in the event",
+    )
+
+    # Exhibits filed
+    material_exhibits: list[str] | None = Field(
+        default=None,
+        description="Description of significant exhibits filed with the 8-K (e.g., 'Press Release', 'Material Agreement')",
+    )
+
+    # Forward-looking statements
+    contains_forward_looking_statements: bool | None = Field(
+        default=None,
+        description="Whether the filing contains forward-looking statements",
+    )
+
+    # Key takeaways
+    investment_implications: str | None = Field(
+        default=None,
+        description="1-2 sentence assessment of potential implications for investors",
+    )
+
+
+# Other filings catch-all
+# Registered under the "other" key of FILING_SCHEMAS, which is also the fallback
+# the workflow uses when classification fails. Besides the inherited
+# company_name, document_type and summary are required here.
+class FilingOther(BaseSECFiling):
+    """
+    Catch-all schema for other SEC filing types (e.g., S-1, DEF 14A, 13F, etc.)
+    """
+
+    # No default, unlike the 10-K / 10-Q / 8-K schemas: the filing type must be extracted.
+    document_type: str = Field(
+        description="The type of SEC filing (e.g., 'S-1', 'DEF 14A', '13F', 'SC 13D')"
+    )
+
+    filing_purpose: str | None = Field(
+        default=None,
+        description="The purpose of this filing type (e.g., 'IPO Registration', 'Proxy Statement', 'Insider Holdings')",
+    )
+
+    summary: str = Field(
+        description="3-4 sentence summary of the filing's key content and purpose"
+    )
+
+    key_information: list[str] | None = Field(
+        default=None,
+        description="List of 3-7 key pieces of information from the filing",
+    )
+
+    # Named financial_data here, whereas Filing10K / Filing10Q call it financial_metrics.
+    financial_data: FinancialMetrics | None = Field(
+        default=None, description="Any relevant financial metrics present in the filing"
+    )
+
+    material_events: list[str] | None = Field(
+        default=None,
+        description="List of any material events or transactions described",
+    )
+
+    parties_involved: list[str] | None = Field(
+        default=None,
+        description="Other parties mentioned (companies, executives, investors, etc.)",
+    )
+
+    investment_relevance: str | None = Field(
+        default=None,
+        description="Brief note on why this filing might be relevant for investment analysis",
+    )
+
+
+# Default schema for backward compatibility - now uses 10-K as the base
+class ExtractionSchema(Filing10K):
+    """Default extraction schema - uses 10-K structure for backward compatibility"""
+
+    pass
+
+
+# Mapping of filing types to their schemas
+FILING_SCHEMAS = {
+    "10-K": Filing10K,
+    "10-Q": Filing10Q,
+    "8-K": Filing8K,
+    "other": FilingOther,
+}
+
+
+# This is only used if USE_REMOTE_EXTRACTION_SCHEMA is False.
+EXTRACT_CONFIG = ExtractConfig(
+    extraction_mode=ExtractMode.PREMIUM,
+    system_prompt=None,
+    # advanced. Only compatible with Premium mode.
+    use_reasoning=False,
+    cite_sources=False,
+    confidence_scores=True,
+)
+
+
+SCHEMA: Type[BaseModel] | None = (
+    None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema
+)
@@ -0,0 +1,36 @@
+from typing import Any
+from workflows import Workflow, step
+from workflows.events import StartEvent, StopEvent
+
+import jsonref
+
+from .config import EXTRACTED_DATA_COLLECTION, FILING_SCHEMAS
+
+
+# Stop event returned by MetadataWorkflow.
+class MetadataResponse(StopEvent):
+    # JSON schema per filing type, keyed by the FILING_SCHEMAS keys
+    schemas: dict[str, dict[str, Any]]
+    # value of EXTRACTED_DATA_COLLECTION
+    extracted_data_collection: str
+
+
+class MetadataWorkflow(Workflow):
+    """
+    Simple single step workflow to expose configuration to the UI, such as all JSON schemas and collection name.
+    """
+
+    @step
+    async def get_metadata(self, _: StartEvent) -> MetadataResponse:
+        # Convert all filing schemas to JSON schemas
+        schemas = {}
+        for filing_type, schema_class in FILING_SCHEMAS.items():
+            json_schema = schema_class.model_json_schema()
+            # Resolve any $ref references
+            json_schema = jsonref.replace_refs(json_schema, proxies=False)
+            schemas[filing_type] = json_schema
+
+        return MetadataResponse(
+            schemas=schemas,
+            extracted_data_collection=EXTRACTED_DATA_COLLECTION,
+        )
+
+
+# Module-level instance; pyproject.toml registers it as the "metadata" workflow
+# via "extraction_review.metadata_workflow:workflow". Created with timeout=None.
+workflow = MetadataWorkflow(timeout=None)
@@ -0,0 +1,401 @@
+import asyncio
+import hashlib
+import logging
+import os
+from pathlib import Path
+import tempfile
+from typing import Any, Literal
+
+import httpx
+from llama_cloud import ClassificationResult, ExtractRun
+from llama_cloud.types import ClassifierRule, ClassifyParsingConfiguration
+from llama_cloud_services.extract import SourceText
+from llama_cloud_services.beta.agent_data import ExtractedData, InvalidExtractionData
+from pydantic import BaseModel
+from workflows import Context, Workflow, step
+from workflows.events import Event, StartEvent, StopEvent
+
+from .clients import (
+    get_classifier_client,
+    get_llama_cloud_client,
+    get_data_client,
+    get_extract_agent,
+)
+from .config import FILING_SCHEMAS
+
+logger = logging.getLogger(__name__)
+
+
+# Start event of ProcessFileWorkflow; run_file stores its file_id in workflow state.
+class FileEvent(StartEvent):
+    file_id: str
+
+
+# Emitted by run_file and consumed by download_file; carries no payload
+# (the file ID travels in workflow state).
+class DownloadFileEvent(Event):
+    pass
+
+
+# Payload-free event. NOTE(review): no step in the visible part of this module
+# emits or consumes it — confirm whether it is still needed.
+class FileDownloadedEvent(Event):
+    pass
+
+
+# Emitted by download_file once the file is on disk; consumed by classify_file.
+class ClassifyFileEvent(Event):
+    pass
+
+
+# Emitted by classify_file and consumed by process_file.
+# confidence and reasoning stay None when classification fell back to "other".
+class FileClassifiedEvent(Event):
+    filing_type: str
+    confidence: float | None = None
+    reasoning: str | None = None
+
+
+# Progress / error notification written to the workflow event stream for the UI.
+class UIToast(Event):
+    level: Literal["info", "warning", "error"]
+    message: str
+
+
+# One of the two events process_file is declared to return; carries the extracted data.
+class ExtractedEvent(Event):
+    data: ExtractedData
+
+
+# The other event process_file is declared to return; data is held as a plain dict.
+# Presumably used when the extraction result does not validate against the
+# schema — confirm against process_file.
+class ExtractedInvalidEvent(Event):
+    data: ExtractedData[dict[str, Any]]
+
+
+# Typed workflow state shared between the steps of ProcessFileWorkflow.
+# Every field starts as None and is filled in as the steps run.
+class ExtractionState(BaseModel):
+    # set by run_file
+    file_id: str | None = None
+    # set by download_file
+    file_path: str | None = None
+    filename: str | None = None
+    # set by classify_file ("other" when classification fails)
+    filing_type: str | None = None
+    classification_confidence: float | None = None
+    classification_reasoning: str | None = None
+
+
+class ProcessFileWorkflow(Workflow):
+    """
+    Given a file path, this workflow will process a single file through the custom extraction logic.
+    """
+
+    @step()
+    async def run_file(self, event: FileEvent, ctx: Context) -> DownloadFileEvent:
+        logger.info(f"Running file {event.file_id}")
+        async with ctx.store.edit_state() as state:
+            state.file_id = event.file_id
+        return DownloadFileEvent()
+
+    @step()
+    async def download_file(
+        self, event: DownloadFileEvent, ctx: Context[ExtractionState]
+    ) -> ClassifyFileEvent:
+        """Download the file reference from the cloud storage"""
+        state = await ctx.store.get_state()
+        if state.file_id is None:
+            raise ValueError("File ID is not set")
+        try:
+            file_metadata = await get_llama_cloud_client().files.get_file(
+                id=state.file_id
+            )
+            file_url = await get_llama_cloud_client().files.read_file_content(
+                state.file_id
+            )
+
+            temp_dir = tempfile.gettempdir()
+            filename = file_metadata.name
+            file_path = os.path.join(temp_dir, filename)
+            client = httpx.AsyncClient()
+            # Report progress to the UI
+            logger.info(f"Downloading file {file_url.url} to {file_path}")
+
+            async with client.stream("GET", file_url.url) as response:
+                with open(file_path, "wb") as f:
+                    async for chunk in response.aiter_bytes():
+                        f.write(chunk)
+            logger.info(f"Downloaded file {file_url.url} to {file_path}")
+            async with ctx.store.edit_state() as state:
+                state.file_path = file_path
+                state.filename = filename
+            return ClassifyFileEvent()
+
+        except Exception as e:
+            logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True)
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="error",
+                    message=f"Error downloading file {state.file_id}: {e}",
+                )
+            )
+            raise e
+
+    @step()
+    async def classify_file(
+        self, event: ClassifyFileEvent, ctx: Context[ExtractionState]
+    ) -> FileClassifiedEvent:
+        """Classify the SEC filing document type"""
+        state = await ctx.store.get_state()
+        if state.file_path is None or state.filename is None:
+            raise ValueError("File path or filename is not set")
+
+        try:
+            logger.info(f"Classifying file {state.filename}")
+            ctx.write_event_to_stream(
+                UIToast(level="info", message=f"Classifying file {state.filename}")
+            )
+
+            # Initialize the classifier
+
+            classifier = get_classifier_client()
+
+            # Define classification rules for SEC filing types
+            rules = [
+                ClassifierRule(
+                    type="10-K",
+                    description=(
+                        "Form 10-K is an annual report filed by public companies with the SEC. "
+                        "It provides a comprehensive summary of a company's financial performance for the year, "
+                        "including audited financial statements, management's discussion and analysis (MD&A), "
+                        "risk factors, business description, and executive compensation. "
+                        "Look for: 'Form 10-K', 'Annual Report', fiscal year references, audited financials."
+                    ),
+                ),
+                ClassifierRule(
+                    type="10-Q",
+                    description=(
+                        "Form 10-Q is a quarterly report filed by public companies with the SEC. "
+                        "It provides unaudited financial statements and management discussion for a specific quarter. "
+                        "Contains quarterly financial data, updates on business operations, and material changes. "
+                        "Look for: 'Form 10-Q', 'Quarterly Report', quarter references (Q1, Q2, Q3), unaudited statements."
+                    ),
+                ),
+                ClassifierRule(
+                    type="8-K",
+                    description=(
+                        "Form 8-K is a current report filed to announce material events or corporate changes. "
+                        "Used to notify investors of significant events like mergers, acquisitions, leadership changes, "
+                        "earnings releases, or other material corporate events that shareholders should know about. "
+                        "Look for: 'Form 8-K', 'Current Report', Item numbers (e.g., Item 1.01, Item 5.02), event dates, "
+                        "specific triggering events."
+                    ),
+                ),
+                ClassifierRule(
+                    type="other",
+                    description=(
+                        "Any other SEC filing type not covered by 10-K, 10-Q, or 8-K. "
+                        "This includes forms such as S-1 (IPO registration), DEF 14A (proxy statement), "
+                        "13F (institutional holdings), SC 13D (beneficial ownership), and other SEC forms."
+                    ),
+                ),
+            ]
+
+            # Configure parsing - only parse first few pages for classification
+            parsing_config = ClassifyParsingConfiguration(
+                max_pages=5,  # Only parse first 5 pages for faster classification
+            )
+
+            # Classify the file
+            results = await classifier.aclassify_file_paths(
+                rules=rules,
+                file_input_paths=[state.file_path],
+                parsing_configuration=parsing_config,
+            )
+
+            # Extract classification result
+            if results.items and len(results.items) > 0:
+                item = results.items[0]
+                result: ClassificationResult | None = item.result
+                if result:
+                    filing_type = result.type
+                    confidence = result.confidence
+                    reasoning = result.reasoning
+
+                    logger.info(
+                        f"Classified {state.filename} as {filing_type} "
+                        f"(confidence: {confidence}, reasoning: {reasoning})"
+                    )
+                    ctx.write_event_to_stream(
+                        UIToast(
+                            level="info",
+                            message=f"Classified as {filing_type} SEC filing",
+                        )
+                    )
+
+                    async with ctx.store.edit_state() as state:
+                        state.filing_type = filing_type
+                        state.classification_confidence = confidence
+                        state.classification_reasoning = reasoning
+
+                    return FileClassifiedEvent(
+                        filing_type=filing_type,
+                        confidence=confidence,
+                        reasoning=reasoning,
+                    )
+                else:
+                    # Classification failed, default to "other"
+                    logger.warning(
+                        f"Classification failed for {state.filename}, defaulting to 'other'"
+                    )
+                    ctx.write_event_to_stream(
+                        UIToast(
+                            level="warning",
+                            message="Classification uncertain, using default schema",
+                        )
+                    )
+                    async with ctx.store.edit_state() as state:
+                        state.filing_type = "other"
+                    return FileClassifiedEvent(filing_type="other")
+            else:
+                # No results, default to "other"
+                logger.warning(f"No classification results for {state.filename}")
+                async with ctx.store.edit_state() as state:
+                    state.filing_type = "other"
+                return FileClassifiedEvent(filing_type="other")
+
+        except Exception as e:
+            logger.error(f"Error classifying file {state.filename}: {e}", exc_info=True)
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="warning",
+                    message=f"Classification failed, using default schema: {e}",
+                )
+            )
+            # On error, default to "other" and continue
+            async with ctx.store.edit_state() as state:
+                state.filing_type = "other"
+            return FileClassifiedEvent(filing_type="other")
+
+    @step()
+    async def process_file(
+        self, event: FileClassifiedEvent, ctx: Context[ExtractionState]
+    ) -> ExtractedEvent | ExtractedInvalidEvent:
+        """Runs the extraction against the file"""
+        state = await ctx.store.get_state()
+        if state.file_path is None or state.filename is None:
+            raise ValueError("File path or filename is not set")
+        try:
+            # Get the appropriate schema based on classification
+            filing_type = (state.filing_type or "other").upper()
+            schema = FILING_SCHEMAS.get(filing_type, FILING_SCHEMAS["other"])
+
+            logger.info(f"Using schema for filing type: {filing_type}")
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="info",
+                    message=f"Extracting data using {filing_type} schema",
+                )
+            )
+
+            agent = get_extract_agent()
+            # Update the agent's data schema for this specific filing type
+            agent.data_schema = schema
+            # track the content of the file, so as to be able to de-duplicate
+            file_content = Path(state.file_path).read_bytes()
+            file_hash = hashlib.sha256(file_content).hexdigest()
+            source_text = SourceText(
+                file=state.file_path,
+                filename=state.filename,
+            )
+            logger.info(f"Extracting data from file {state.filename}")
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="info", message=f"Extracting data from file {state.filename}"
+                )
+            )
+            extracted_result: ExtractRun = await agent.aextract(source_text)
+            try:
+                logger.info(f"Extracted data: {extracted_result}")
+                data = ExtractedData.from_extraction_result(
+                    result=extracted_result,
+                    schema=schema,
+                    file_hash=file_hash,
+                )
+                # Add classification information to the extracted data
+                if data.metadata is None:
+                    data.metadata = {}
+                data.metadata["classification"] = filing_type
+                data.metadata["classification_confidence"] = (
+                    state.classification_confidence
+                )
+                data.metadata["classification_reasoning"] = (
+                    state.classification_reasoning
+                )
+                return ExtractedEvent(data=data)
+            except InvalidExtractionData as e:
+                logger.error(f"Error validating extracted data: {e}", exc_info=True)
+                return ExtractedInvalidEvent(data=e.invalid_item)
+        except Exception as e:
+            logger.error(
+                f"Error extracting data from file {state.filename}: {e}",
+                exc_info=True,
+            )
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="error",
+                    message=f"Error extracting data from file {state.filename}: {e}",
+                )
+            )
+            raise e
+
+    @step()
+    async def record_extracted_data(
+        self, event: ExtractedEvent | ExtractedInvalidEvent, ctx: Context
+    ) -> StopEvent:
+        """Records the extracted data to the agent data API"""
+        try:
+            logger.info(f"Recorded extracted data for file {event.data.file_name}")
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="info",
+                    message=f"Recorded extracted data for file {event.data.file_name}",
+                )
+            )
+            # remove past data when reprocessing the same file
+            if event.data.file_hash:
+                existing_data = await get_data_client().untyped_search(
+                    filter={
+                        "file_hash": {
+                            "eq": event.data.file_hash,
+                        },
+                    },
+                )
+                if existing_data.items:
+                    logger.info(
+                        f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}"
+                    )
+                    await asyncio.gather(
+                        *[
+                            get_data_client().delete_item(item.id)
+                            for item in existing_data.items
+                        ]
+                    )
+            # finally, save the new data
+            item_id = await get_data_client().create_item(event.data)
+            return StopEvent(
+                result=item_id.id,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error recording extracted data for file {event.data.file_name}: {e}",
+                exc_info=True,
+            )
+            ctx.write_event_to_stream(
+                UIToast(
+                    level="error",
+                    message=f"Error recording extracted data for file {event.data.file_name}: {e}",
+                )
+            )
+            raise e
+
+
+workflow = ProcessFileWorkflow(timeout=None)
+
+if __name__ == "__main__":
+    from dotenv import load_dotenv
+
+    load_dotenv()
+    logging.basicConfig(level=logging.INFO)
+
+    async def main():
+        file = await get_llama_cloud_client().files.upload_file(
+            upload_file=Path("test.pdf").open("rb")
+        )
+        await workflow.run(start_event=FileEvent(file_id=file.id))
+
+    asyncio.run(main())
@@ -0,0 +1,57 @@
+"""
+Selects a locally defined schema, or queries the remote extraction agent for the schema.
+"""
+
+import asyncio
+import jsonref
+from .clients import get_extract_agent
+from .config import USE_REMOTE_EXTRACTION_SCHEMA, ExtractionSchema
+from typing import Any, Type
+from pydantic import BaseModel
+from pydantic import create_model, Field
+
+
+# Cached extraction schema: the local ExtractionSchema, or None when
+# USE_REMOTE_EXTRACTION_SCHEMA is set, in which case get_extraction_schema()
+# fills it in from the extraction agent on first use.
+SCHEMA: Type[BaseModel] | None = (
+    None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema
+)
+
+
+# Guards the one-time population of SCHEMA in get_extraction_schema().
+_schema_lock = asyncio.Lock()
+
+
+async def get_extraction_schema() -> Type[BaseModel]:
+    global SCHEMA
+    if SCHEMA is not None:
+        return SCHEMA
+    async with _schema_lock:
+        if SCHEMA is not None:
+            return SCHEMA
+        agent = get_extract_agent()
+        SCHEMA = model_from_schema(agent.data_schema)
+        return SCHEMA
+
+
+async def get_extraction_schema_json() -> dict[str, Any]:
+    json_schema = (await get_extraction_schema()).model_json_schema()
+    json_schema = jsonref.replace_refs(json_schema, proxies=False)
+    return json_schema
+
+
+def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]:
+    """
+    Converts a JSON schema back to a Pydantic model.
+    """
+    typemap = {
+        "string": str,
+        "integer": int,
+        "number": float,
+        "boolean": bool,
+        "array": list,
+        "object": dict,
+    }
+    fields = {}
+    for prop, meta in schema.get("properties", {}).items():
+        py_type = typemap.get(meta.get("type"), Any)
+        default = ... if prop in schema.get("required", []) else None
+        fields[prop] = (py_type, Field(default, description=meta.get("description")))
+    return create_model(schema.get("title", "DynamicModel"), **fields)
@@ -0,0 +1,2 @@
+def test_placeholder():
+    """Placeholder test with no assertions."""
+    pass
@@ -0,0 +1,43 @@
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.*
+.yarn/*
+!.yarn/patches
+!.yarn/plugins
+!.yarn/releases
+!.yarn/versions
+
+# testing
+/coverage
+
+# next.js
+/.next/
+/out/
+/dist/
+
+# production
+/build
+
+# misc
+.DS_Store
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# env files (can opt-in for committing if needed)
+.env*
+
+# vercel
+.vercel
+
+# typescript
+*.tsbuildinfo
+next-env.d.ts
+
@@ -0,0 +1,7 @@
+# Data Extraction UI
+
+This is a simple Vite + React template that builds on the `@llamaindex/ui` component library
+for displaying tables of extracted data.
+
+Ideally, run this with `llamactl` from the parent directory (see [README.md](../README.md)).
+You can also run it standalone with `npm run dev`, but workflow integrations will not work.
@@ -0,0 +1,21 @@
+{
+  "$schema": "https://ui.shadcn.com/schema.json",
+  "style": "new-york",
+  "rsc": true,
+  "tsx": true,
+  "tailwind": {
+    "config": "",
+    "css": "src/index.css",
+    "baseColor": "zinc",
+    "cssVariables": true,
+    "prefix": ""
+  },
+  "aliases": {
+    "components": "@/components",
+    "utils": "@/lib/utils",
+    "ui": "@/components/ui",
+    "lib": "@/lib",
+    "hooks": "@/hooks"
+  },
+  "iconLibrary": "lucide"
+}
@@ -0,0 +1,12 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Review</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
@@ -0,0 +1,45 @@
+{
+  "name": "extraction-review-ui",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "tsc && vite build",
+    "preview": "vite preview",
+    "lint": "tsc --noEmit",
+    "format": "prettier --write src",
+    "format-check": "prettier --check src",
+    "all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build",
+    "all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build"
+  },
+  "dependencies": {
+    "@babel/runtime": "^7.27.6",
+    "@lezer/highlight": "^1.2.1",
+    "@llamaindex/ui": "^2.1.2",
+    "@radix-ui/themes": "^3.2.1",
+    "class-variance-authority": "^0.7.1",
+    "clsx": "^2.1.1",
+    "llama-cloud-services": "^0.3.4",
+    "lucide-react": "^0.514.0",
+    "react": "^18.3.0",
+    "react-dom": "^18.3.0",
+    "react-router-dom": "^6.30.0",
+    "sonner": "^2.0.5",
+    "tw-animate-css": "^1.3.5"
+  },
+  "devDependencies": {
+    "@tailwindcss/postcss": "^4.1.10",
+    "@types/node": "^20",
+    "@types/react": "^19",
+    "@types/react-dom": "^19",
+    "@vitejs/plugin-react": "^4.3.4",
+    "postcss": "^8.5.5",
+    "prettier": "^3.6.2",
+    "tailwind-merge": "^3.3.1",
+    "tailwindcss": "^4.1.8",
+    "typescript": "^5",
+    "vite": "^6.0.5"
+  },
+  "packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912"
+}
@@ -0,0 +1,7 @@
+// PostCSS configuration: registers the Tailwind CSS PostCSS plugin with default options.
+const config = {
+  plugins: {
+    "@tailwindcss/postcss": {},
+  },
+};
+
+export default config; 
@@ -0,0 +1,70 @@
+import React from "react";
+import { Routes, Route } from "react-router-dom";
+import { Theme } from "@radix-ui/themes";
+import {
+  Breadcrumb,
+  BreadcrumbItem,
+  BreadcrumbList,
+  BreadcrumbSeparator,
+} from "@llamaindex/ui";
+import { Link } from "react-router-dom";
+import { Toaster } from "@llamaindex/ui";
+import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext";
+import { MetadataProvider } from "@/lib/MetadataProvider";
+
+// Import pages
+import HomePage from "./pages/HomePage";
+import ItemPage from "./pages/ItemPage";
+
+/**
+ * Application shell: wraps the routed pages in the theme, metadata and
+ * toolbar providers and renders the shared toolbar above them.
+ */
+export default function App() {
+  return (
+    <Theme>
+      <MetadataProvider>
+        <ToolbarProvider>
+          <div className="grid grid-rows-[auto_1fr] h-screen">
+            <Toolbar />
+            <main className="overflow-auto">
+              <Routes>
+                <Route path="/" element={<HomePage />} />
+                <Route path="/item/:itemId" element={<ItemPage />} />
+              </Routes>
+            </main>
+          </div>
+          <Toaster />
+        </ToolbarProvider>
+      </MetadataProvider>
+    </Theme>
+  );
+}
+
+/**
+ * Page header showing the breadcrumb trail followed by the action buttons
+ * currently held in the toolbar context.
+ */
+const Toolbar = () => {
+  const toolbar = useToolbar();
+
+  // A crumb is a link only when it has a target and is not the current page.
+  const trail = toolbar.breadcrumbs.map((crumb, position) => {
+    const label =
+      crumb.href && !crumb.isCurrentPage ? (
+        <Link to={crumb.href} className="font-medium text-base">
+          {crumb.label}
+        </Link>
+      ) : (
+        <span className={`font-medium ${position === 0 ? "text-base" : ""}`}>
+          {crumb.label}
+        </span>
+      );
+
+    return (
+      <React.Fragment key={position}>
+        {position > 0 && <BreadcrumbSeparator />}
+        <BreadcrumbItem>{label}</BreadcrumbItem>
+      </React.Fragment>
+    );
+  });
+
+  return (
+    <header className="sticky top-0 z-50 flex h-16 shrink-0 items-center gap-2 border-b px-4 bg-white/95 backdrop-blur supports-[backdrop-filter]:bg-white/60">
+      <Breadcrumb>
+        <BreadcrumbList>{trail}</BreadcrumbList>
+      </Breadcrumb>
+      {toolbar.buttons}
+    </header>
+  );
+};
@@ -0,0 +1,120 @@
+@import "tailwindcss";
+@import "tw-animate-css";
+
+@custom-variant dark (&:is(.dark *));
+
+@theme inline {
+  --radius-sm: calc(var(--radius) - 4px);
+  --radius-md: calc(var(--radius) - 2px);
+  --radius-lg: var(--radius);
+  --radius-xl: calc(var(--radius) + 4px);
+  --color-background: var(--background);
+  --color-foreground: var(--foreground);
+  --color-card: var(--card);
+  --color-card-foreground: var(--card-foreground);
+  --color-popover: var(--popover);
+  --color-popover-foreground: var(--popover-foreground);
+  --color-primary: var(--primary);
+  --color-primary-foreground: var(--primary-foreground);
+  --color-secondary: var(--secondary);
+  --color-secondary-foreground: var(--secondary-foreground);
+  --color-muted: var(--muted);
+  --color-muted-foreground: var(--muted-foreground);
+  --color-accent: var(--accent);
+  --color-accent-foreground: var(--accent-foreground);
+  --color-destructive: var(--destructive);
+  --color-border: var(--border);
+  --color-input: var(--input);
+  --color-ring: var(--ring);
+  --color-chart-1: var(--chart-1);
+  --color-chart-2: var(--chart-2);
+  --color-chart-3: var(--chart-3);
+  --color-chart-4: var(--chart-4);
+  --color-chart-5: var(--chart-5);
+  --color-sidebar: var(--sidebar);
+  --color-sidebar-foreground: var(--sidebar-foreground);
+  --color-sidebar-primary: var(--sidebar-primary);
+  --color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
+  --color-sidebar-accent: var(--sidebar-accent);
+  --color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
+  --color-sidebar-border: var(--sidebar-border);
+  --color-sidebar-ring: var(--sidebar-ring);
+}
+
+:root {
+  --radius: 0.625rem;
+  --card: oklch(1 0 0);
+  --card-foreground: oklch(0.141 0.005 285.823);
+  --popover: oklch(1 0 0);
+  --popover-foreground: oklch(0.141 0.005 285.823);
+  --primary: oklch(0.21 0.006 285.885);
+  --primary-foreground: oklch(0.985 0 0);
+  --secondary: oklch(0.967 0.001 286.375);
+  --secondary-foreground: oklch(0.21 0.006 285.885);
+  --muted: oklch(0.967 0.001 286.375);
+  --muted-foreground: oklch(0.552 0.016 285.938);
+  --accent: oklch(0.967 0.001 286.375);
+  --accent-foreground: oklch(0.21 0.006 285.885);
+  --destructive: oklch(0.577 0.245 27.325);
+  --border: oklch(0.92 0.004 286.32);
+  --input: oklch(0.92 0.004 286.32);
+  --ring: oklch(0.705 0.015 286.067);
+  --chart-1: oklch(0.646 0.222 41.116);
+  --chart-2: oklch(0.6 0.118 184.704);
+  --chart-3: oklch(0.398 0.07 227.392);
+  --chart-4: oklch(0.828 0.189 84.429);
+  --chart-5: oklch(0.769 0.188 70.08);
+  --sidebar: oklch(0.985 0 0);
+  --sidebar-foreground: oklch(0.141 0.005 285.823);
+  --sidebar-primary: oklch(0.21 0.006 285.885);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.967 0.001 286.375);
+  --sidebar-accent-foreground: oklch(0.21 0.006 285.885);
+  --sidebar-border: oklch(0.92 0.004 286.32);
+  --sidebar-ring: oklch(0.705 0.015 286.067);
+  --background: oklch(1 0 0);
+  --foreground: oklch(0.141 0.005 285.823);
+}
+
+.dark {
+  --background: oklch(0.141 0.005 285.823);
+  --foreground: oklch(0.985 0 0);
+  --card: oklch(0.21 0.006 285.885);
+  --card-foreground: oklch(0.985 0 0);
+  --popover: oklch(0.21 0.006 285.885);
+  --popover-foreground: oklch(0.985 0 0);
+  --primary: oklch(0.92 0.004 286.32);
+  --primary-foreground: oklch(0.21 0.006 285.885);
+  --secondary: oklch(0.274 0.006 286.033);
+  --secondary-foreground: oklch(0.985 0 0);
+  --muted: oklch(0.274 0.006 286.033);
+  --muted-foreground: oklch(0.705 0.015 286.067);
+  --accent: oklch(0.274 0.006 286.033);
+  --accent-foreground: oklch(0.985 0 0);
+  --destructive: oklch(0.704 0.191 22.216);
+  --border: oklch(1 0 0 / 10%);
+  --input: oklch(1 0 0 / 15%);
+  --ring: oklch(0.552 0.016 285.938);
+  --chart-1: oklch(0.488 0.243 264.376);
+  --chart-2: oklch(0.696 0.17 162.48);
+  --chart-3: oklch(0.769 0.188 70.08);
+  --chart-4: oklch(0.627 0.265 303.9);
+  --chart-5: oklch(0.645 0.246 16.439);
+  --sidebar: oklch(0.21 0.006 285.885);
+  --sidebar-foreground: oklch(0.985 0 0);
+  --sidebar-primary: oklch(0.488 0.243 264.376);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.274 0.006 286.033);
+  --sidebar-accent-foreground: oklch(0.985 0 0);
+  --sidebar-border: oklch(1 0 0 / 10%);
+  --sidebar-ring: oklch(0.552 0.016 285.938);
+}
+
+@layer base {
+  * {
+    @apply border-border outline-ring/50;
+  }
+  body {
+    @apply bg-background text-foreground;
+  }
+}
@@ -0,0 +1,71 @@
+import { createContext, useContext, ReactNode, useMemo } from "react";
+import { ApiProvider, ApiClients } from "@llamaindex/ui";
+import { useMetadata, Metadata } from "./useMetadata";
+import { createBaseWorkflowClient, createClients } from "./client";
+import { Clock, XCircle } from "lucide-react";
+
+/** Value shared through MetadataContext once the metadata has loaded. */
+interface MetadataContextValue {
+  // Result of the metadata workflow.
+  metadata: Metadata;
+  // API clients created from that metadata.
+  clients: ApiClients;
+}
+
+const MetadataContext = createContext<MetadataContextValue | null>(null);
+
+/**
+ * Provides a workflows-only API client to the inner provider, which loads the
+ * metadata and then supplies the full client set to `children`.
+ */
+export function MetadataProvider({ children }: { children: ReactNode }) {
+  // Created once for the lifetime of the provider.
+  const baseClients = useMemo<ApiClients>(
+    () => ({ workflowsClient: createBaseWorkflowClient() }) as ApiClients,
+    [],
+  );
+
+  return (
+    <ApiProvider clients={baseClients}>
+      <InnerMetadataProvider>{children}</InnerMetadataProvider>
+    </ApiProvider>
+  );
+}
+
+/**
+ * Loads the metadata, then renders `children` inside MetadataContext and an
+ * ApiProvider configured with clients built from that metadata. Renders a
+ * loading or error screen instead while the metadata is unavailable.
+ */
+function InnerMetadataProvider({ children }: { children: ReactNode }) {
+  const { metadata, loading, error } = useMetadata();
+  // Rebuild the clients only when the metadata object changes.
+  const clients = useMemo(
+    () => (metadata ? createClients(metadata) : undefined),
+    [metadata],
+  );
+
+  if (loading) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
+          <div className="text-sm text-gray-500">Loading configuration...</div>
+        </div>
+      </div>
+    );
+  }
+
+  // Not loading, but nothing usable either: report the error (or a generic one).
+  if (error || !metadata || !clients) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
+          <div className="text-sm text-gray-500">
+            Error loading configuration: {error || "Unknown error"}
+          </div>
+        </div>
+      </div>
+    );
+  }
+
+  return (
+    <MetadataContext.Provider value={{ metadata, clients }}>
+      <ApiProvider clients={clients}>{children}</ApiProvider>
+    </MetadataContext.Provider>
+  );
+}
+
+/**
+ * Returns the loaded metadata and API clients.
+ * Throws when called outside of a MetadataProvider.
+ */
+export function useMetadataContext() {
+  const value = useContext(MetadataContext);
+  if (value) {
+    return value;
+  }
+  throw new Error("useMetadataContext must be used within MetadataProvider");
+}
@@ -0,0 +1,41 @@
+import React from "react";
+import { APP_TITLE } from "./config";
+
+/** One entry of the breadcrumb trail shown in the toolbar. */
+export interface BreadcrumbItem {
+  // Text displayed for the entry.
+  label: string;
+  // Route the entry links to; entries without it are rendered as plain text.
+  href?: string;
+  // Marks the entry for the page being viewed; such entries are not rendered as links.
+  isCurrentPage?: boolean;
+}
+
+/**
+ * Context carrying the toolbar's buttons and breadcrumbs together with their
+ * setters. The default value is empty and its setters are no-ops, which is
+ * what consumers get when no provider is mounted above them.
+ */
+export const ToolbarCtx = React.createContext<{
+  buttons: React.ReactNode[];
+  setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void;
+  breadcrumbs: BreadcrumbItem[];
+  setBreadcrumbs: (items: BreadcrumbItem[]) => void;
+}>({
+  buttons: [],
+  setButtons: () => {},
+  breadcrumbs: [],
+  setBreadcrumbs: () => {},
+});
+
+/**
+ * Owns the toolbar state (action buttons and breadcrumb trail) and shares it
+ * with descendants through ToolbarCtx. The trail starts with the app title
+ * linking to the home route.
+ */
+export const ToolbarProvider = ({
+  children,
+}: {
+  children: React.ReactNode;
+}) => {
+  const initialTrail: BreadcrumbItem[] = [{ label: APP_TITLE, href: "/" }];
+  const [buttons, setButtons] = React.useState<React.ReactNode[]>([]);
+  const [breadcrumbs, setBreadcrumbs] =
+    React.useState<BreadcrumbItem[]>(initialTrail);
+  const contextValue = { buttons, setButtons, breadcrumbs, setBreadcrumbs };
+
+  return (
+    <ToolbarCtx.Provider value={contextValue}>{children}</ToolbarCtx.Provider>
+  );
+};
+
+/** Convenience hook returning the current ToolbarCtx value. */
+export const useToolbar = () => React.useContext(ToolbarCtx);
@@ -0,0 +1,51 @@
+import { ExtractedData } from "llama-cloud-services/beta/agent";
+import {
+  ApiClients,
+  createWorkflowsClient,
+  createWorkflowsConfig,
+  createCloudAgentClient,
+  cloudApiClient,
+} from "@llamaindex/ui";
+import { AGENT_NAME } from "./config";
+import type { Metadata } from "./useMetadata";
+
+// Optional settings read from the Vite environment.
+const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY;
+const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL;
+const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID;
+
+// Configure the platform client
+cloudApiClient.setConfig({
+  ...(apiBaseUrl && { baseUrl: apiBaseUrl }),
+  headers: {
+    // Optionally send a backend API token scoped to a project as a bearer
+    // token, e.g. for local development.
+    ...(platformToken && { authorization: `Bearer ${platformToken}` }),
+    // This header is required for requests to correctly scope to the agent's project
+    // when authenticating with a user cookie
+    ...(projectId && { "Project-Id": projectId }),
+  },
+});
+
+/** Creates a workflows client rooted at this deployment's base path. */
+export function createBaseWorkflowClient(): ReturnType<
+  typeof createWorkflowsClient
+> {
+  const deploymentConfig = createWorkflowsConfig({
+    baseUrl: `/deployments/${AGENT_NAME}/`,
+  });
+  return createWorkflowsClient(deploymentConfig);
+}
+
+/**
+ * Builds the full client set: the workflows client, the shared cloud API
+ * client, and an agent data client bound to the collection named in `metadata`.
+ */
+export function createClients(metadata: Metadata): ApiClients {
+  // `window` does not exist outside the browser.
+  const currentUrl =
+    typeof window === "undefined" ? undefined : window.location.href;
+
+  const clients = {
+    workflowsClient: createBaseWorkflowClient(),
+    cloudApiClient,
+    agentDataClient: createCloudAgentClient<ExtractedData<any>>({
+      client: cloudApiClient,
+      windowUrl: currentUrl,
+      collection: metadata.extracted_data_collection,
+    }),
+  };
+  return clients as ApiClients;
+}
@@ -0,0 +1,2 @@
+// Title of the application.
+export const APP_TITLE = "Extraction Review";
+// Deployment name, read from the Vite environment.
+export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME;
@@ -0,0 +1,39 @@
+import type {
+  ExtractedData,
+  TypedAgentData,
+} from "llama-cloud-services/beta/agent";
+
+/**
+ * Serializes `data` as pretty-printed JSON and triggers a browser download
+ * of it under `filename`.
+ */
+export function downloadJSON<T>(
+  data: T,
+  filename: string = "extraction-results.json",
+) {
+  const payload = new Blob([JSON.stringify(data, null, 2)], {
+    type: "application/json",
+  });
+  const objectUrl = URL.createObjectURL(payload);
+
+  // A temporary anchor is the way to start a download with a chosen name.
+  const anchor = document.createElement("a");
+  anchor.download = filename;
+  anchor.href = objectUrl;
+  document.body.appendChild(anchor);
+  anchor.click();
+
+  // Remove the anchor again and release the object URL.
+  document.body.removeChild(anchor);
+  URL.revokeObjectURL(objectUrl);
+}
+
+/**
+ * Downloads one extracted data item as JSON, named after its file name
+ * (or "item") and the date part of its creation timestamp.
+ */
+export function downloadExtractedDataItem<T>(
+  item: TypedAgentData<ExtractedData<T>>,
+) {
+  const baseName = item.data.file_name || "item";
+  // Keep only the part of the ISO timestamp before the "T".
+  const [datePart] = item.createdAt.toISOString().split("T");
+
+  downloadJSON(item, `${baseName}-${datePart}.json`);
+}
@@ -0,0 +1,41 @@
+import { useWorkflowHandler, useWorkflowRun } from "@llamaindex/ui";
+import { useEffect, useState } from "react";
+
+/** Payload of the metadata workflow's response event. */
+export interface Metadata {
+  // Schemas keyed by name.
+  schemas: Record<string, any>;
+  // Name of the collection holding the extracted data.
+  extracted_data_collection: string;
+}
+
+// NOTE(review): not referenced by useMetadata below, which may return
+// `metadata` as undefined — confirm whether this type is still needed.
+export interface UseMetadataResult {
+  metadata: Metadata;
+  loading: boolean;
+  error: string | undefined;
+}
+
+/**
+ * Starts the "metadata" workflow once on mount and exposes its result.
+ *
+ * Returns the metadata (undefined until the workflow's MetadataResponse event
+ * has been received), a loading flag, and the error message if starting the
+ * workflow failed.
+ */
+export function useMetadata() {
+  const run = useWorkflowRun();
+  const [handlerId, setHandlerId] = useState<string | undefined>(undefined);
+  const handler = useWorkflowHandler(handlerId ?? "");
+  const [error, setError] = useState<string | undefined>(undefined);
+  const [starting, setStarting] = useState(true);
+
+  useEffect(() => {
+    setStarting(true);
+    run
+      .runWorkflow("metadata", {})
+      .then((handlerSummary) => {
+        setHandlerId(handlerSummary.handler_id);
+      })
+      .catch((error) => {
+        // Rejections are not guaranteed to be Error instances.
+        setError(error instanceof Error ? error.message : String(error));
+      })
+      .finally(() => {
+        setStarting(false);
+      });
+  }, []);
+  const stopEvent = handler.events.find((event) =>
+    event.type.endsWith("MetadataResponse"),
+  );
+  const metadata = stopEvent?.data as Metadata | undefined;
+  // The run request resolving only means the workflow was started. Keep
+  // reporting "loading" until its response event has arrived; otherwise
+  // callers briefly see neither metadata nor an error.
+  const loading = starting || (handlerId !== undefined && !metadata && !error);
+  return { metadata, loading, error };
+}
@@ -0,0 +1,6 @@
+import { clsx, type ClassValue } from "clsx";
+import { twMerge } from "tailwind-merge";
+
+/** Combines class name inputs with clsx and passes the result through twMerge. */
+export function cn(...inputs: ClassValue[]) {
+  return twMerge(clsx(inputs));
+}
@@ -0,0 +1,14 @@
+import { StrictMode } from "react";
+import { createRoot } from "react-dom/client";
+import { HashRouter } from "react-router-dom";
+import App from "./App";
+import "@llamaindex/ui/styles.css";
+import "./index.css";
+
+// Mount the app into the #root element of index.html, inside a HashRouter.
+createRoot(document.getElementById("root")!).render(
+  <StrictMode>
+    <HashRouter>
+      <App />
+    </HashRouter>
+  </StrictMode>,
+);
@@ -0,0 +1,23 @@
+.main {
+  padding: 1rem;
+}
+
+.grid {
+  display: flex;
+  flex-direction: row;
+  gap: 1rem;
+  margin-bottom: 1rem;
+  & > * {
+    flex: 1;
+  }
+}
+
+.commandBar {
+  display: flex;
+  justify-content: flex-end;
+  margin-bottom: 1rem;
+}
+
+.progressBar {
+  margin-bottom: 1rem;
+}
@@ -0,0 +1,88 @@
+import {
+  ItemCount,
+  WorkflowTrigger,
+  WorkflowProgressBar,
+  ExtractedDataItemGrid,
+  useWorkflowHandlerList,
+} from "@llamaindex/ui";
+import type { TypedAgentData } from "llama-cloud-services/beta/agent";
+import styles from "./HomePage.module.css";
+import { useNavigate } from "react-router-dom";
+import { useEffect, useState } from "react";
+
+/**
+ * Home page: renders the task list, remounting it whenever a running
+ * "process-file" task finishes.
+ */
+export default function HomePage() {
+  const { taskKey } = useTaskCompletedState();
+  return <TaskList key={taskKey} />;
+}
+
+/**
+ * Returns a key that increments when a task is completed, can be used to force a re-render of the task list
+ *
+ * Named with the `use` prefix because it calls hooks itself, so the Rules of
+ * Hooks (and the lint rules enforcing them) apply to it.
+ */
+function useTaskCompletedState() {
+  const { handlers } = useWorkflowHandlerList("process-file");
+  const runningNow = handlers.filter(
+    (handler) => handler.status === "running",
+  ).length;
+  const [runningTaskCount, setRunningTaskCount] = useState(runningNow);
+  const [taskKey, setTaskKey] = useState(0);
+  useEffect(() => {
+    if (runningNow < runningTaskCount) {
+      // forcefully reload task list after a task is completed
+      setTaskKey((key) => key + 1);
+    }
+    setRunningTaskCount(runningNow);
+  }, [runningNow]);
+  return { runningTaskCount, taskKey };
+}
+
+/**
+ * Home page content: item counters, the upload trigger and progress bar for
+ * the "process-file" workflow, and the grid of extracted items. Clicking a
+ * row navigates to that item's page.
+ *
+ * NOTE(review): `styles.page` has no matching class in HomePage.module.css
+ * (which defines .main, .grid, .commandBar and .progressBar).
+ */
+function TaskList() {
+  const navigate = useNavigate();
+  const goToItem = (item: TypedAgentData) => {
+    navigate(`/item/${item.id}`);
+  };
+  return (
+    <div className={styles.page}>
+      <main className={styles.main}>
+        <div className={styles.grid}>
+          <ItemCount title="Total Items" />
+          <ItemCount
+            title="Reviewed"
+            filter={{
+              status: { eq: "approved" },
+            }}
+          />
+          <ItemCount
+            title="Needs Review"
+            filter={{
+              status: { eq: "pending_review" },
+            }}
+          />
+        </div>
+        <div className={styles.commandBar}>
+          <WorkflowTrigger
+            workflowName="process-file"
+            customWorkflowInput={(files) => {
+              // Only the first uploaded file is passed to the workflow.
+              return {
+                file_id: files[0].fileId,
+              };
+            }}
+          />
+        </div>
+        <WorkflowProgressBar
+          className={styles.progressBar}
+          workflowName="process-file"
+        />
+        <ExtractedDataItemGrid
+          onRowClick={goToItem}
+          builtInColumns={{
+            fileName: true,
+            status: true,
+            createdAt: true,
+            itemsToReview: true,
+            actions: true,
+          }}
+        />
+      </main>
+    </div>
+  );
+}
@@ -0,0 +1,189 @@
+import { useEffect, useState } from "react";
+import {
+  AcceptReject,
+  ExtractedDataDisplay,
+  FilePreview,
+  useItemData,
+  type Highlight,
+  Button,
+} from "@llamaindex/ui";
+import { Clock, XCircle, Download } from "lucide-react";
+import { useParams } from "react-router-dom";
+import { useToolbar } from "@/lib/ToolbarContext";
+import { useNavigate } from "react-router-dom";
+import { modifyJsonSchema } from "@llamaindex/ui/lib";
+import { APP_TITLE } from "@/lib/config";
+import { downloadExtractedDataItem } from "@/lib/export";
+import { useMetadataContext } from "@/lib/MetadataProvider";
+
+/**
+ * Looks up the schema registered for a classification. Tries the exact key
+ * first, then ignores case: the classification is upper-cased by the caller,
+ * so a schema registered under a lower-case key would otherwise never match.
+ */
+function findSchemaForClassification(
+  schemas: Record<string, any>,
+  classification: string,
+) {
+  if (schemas[classification]) {
+    return schemas[classification];
+  }
+  const wanted = classification.toUpperCase();
+  const matchingKey = Object.keys(schemas).find(
+    (key) => key.toUpperCase() === wanted,
+  );
+  return matchingKey ? schemas[matchingKey] : undefined;
+}
+
+/**
+ * Review page for a single extracted item: file preview on the left, the
+ * classification summary and editable extracted data on the right. Also
+ * installs the item's breadcrumb and its toolbar buttons (export and
+ * accept/reject) while mounted.
+ */
+export default function ItemPage() {
+  const { itemId } = useParams<{ itemId: string }>();
+  const { setButtons, setBreadcrumbs } = useToolbar();
+  const [highlight, setHighlight] = useState<Highlight | undefined>(undefined);
+  const { metadata } = useMetadataContext();
+  const navigate = useNavigate();
+
+  // Use the hook to fetch item data (initially with a default schema)
+  const itemHookData = useItemData<any>({
+    jsonSchema: modifyJsonSchema(metadata.schemas["10-K"] || {}, {}),
+    itemId: itemId as string,
+    isMock: false,
+  });
+
+  const {
+    item: itemData,
+    updateData,
+    loading: isLoading,
+    error,
+  } = itemHookData;
+
+  // Determine the correct schema based on classification
+  const classification = (
+    (itemData?.data?.metadata?.classification as string | undefined) || "10-K"
+  ).toUpperCase();
+  const matchedSchema = findSchemaForClassification(
+    metadata.schemas,
+    classification,
+  );
+  const correctSchema = matchedSchema || metadata.schemas["10-K"];
+
+  // Schema handed to the display; bumping schemaKey remounts the display
+  // whenever a different schema is applied.
+  const [schemaKey, setSchemaKey] = useState(0);
+  const [appliedSchema, setAppliedSchema] = useState(correctSchema);
+
+  useEffect(() => {
+    if (matchedSchema) {
+      setAppliedSchema(modifyJsonSchema(matchedSchema, {}));
+      setSchemaKey((key) => key + 1);
+    }
+  }, [classification, metadata.schemas]);
+
+  // Update breadcrumb when item data loads
+  useEffect(() => {
+    const fileName = itemHookData.item?.data?.file_name;
+    if (fileName) {
+      setBreadcrumbs([
+        { label: APP_TITLE, href: "/" },
+        {
+          label: fileName,
+          isCurrentPage: true,
+        },
+      ]);
+    }
+
+    return () => {
+      // Reset to default breadcrumb when leaving the page
+      setBreadcrumbs([{ label: APP_TITLE, href: "/" }]);
+    };
+  }, [itemHookData.item?.data?.file_name, setBreadcrumbs]);
+
+  useEffect(() => {
+    setButtons(() => [
+      // The toolbar renders this array directly, so the element needs a key.
+      <div key="item-actions" className="ml-auto flex items-center gap-2">
+        <Button
+          variant="outline"
+          size="sm"
+          onClick={() => {
+            if (itemData) {
+              downloadExtractedDataItem(itemData);
+            }
+          }}
+          disabled={!itemData}
+        >
+          <Download className="h-4 w-4 mr-2" />
+          Export JSON
+        </Button>
+        <AcceptReject<any>
+          itemData={itemHookData}
+          onComplete={() => navigate("/")}
+        />
+      </div>,
+    ]);
+    return () => {
+      setButtons(() => []);
+    };
+    // itemData is a dependency so the export button picks up the loaded item.
+  }, [itemHookData.data, itemData, setButtons]);
+
+  const classificationReasoning = itemData?.data?.metadata
+    ?.classification_reasoning as string | undefined;
+
+  if (isLoading) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
+          <div className="text-sm text-gray-500">Loading item...</div>
+        </div>
+      </div>
+    );
+  }
+
+  if (error || !itemData) {
+    return (
+      <div className="flex h-screen items-center justify-center">
+        <div className="text-center">
+          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
+          <div className="text-sm text-gray-500">
+            Error loading item: {error || "Item not found"}
+          </div>
+        </div>
+      </div>
+    );
+  }
+
+  return (
+    <div className="flex h-full bg-gray-50">
+      {/* Left Side - File Preview */}
+      <div className="w-1/2 border-r border-gray-200 bg-white">
+        {itemData.data.file_id && (
+          <FilePreview
+            fileId={itemData.data.file_id}
+            onBoundingBoxClick={(box, pageNumber) => {
+              console.log("Bounding box clicked:", box, "on page:", pageNumber);
+            }}
+            highlight={highlight}
+          />
+        )}
+      </div>
+
+      {/* Right Side - Review Panel */}
+      <div className="flex-1 bg-white h-full overflow-y-auto">
+        <div className="p-4 space-y-4">
+          {/* Classification Info */}
+          {classification && (
+            <div className="bg-blue-50 border border-blue-200 rounded-lg p-3 mb-4">
+              <div className="text-sm font-semibold text-blue-900">
+                Document Type: {classification}
+              </div>
+              {classificationReasoning && (
+                <div className="text-xs text-blue-600 mt-1">
+                  {classificationReasoning}
+                </div>
+              )}
+            </div>
+          )}
+          {/* Extracted Data */}
+          <ExtractedDataDisplay<any>
+            key={schemaKey}
+            extractedData={itemData.data}
+            title="Extracted Data"
+            onChange={(updatedData) => {
+              updateData(updatedData);
+            }}
+            onClickField={(args) => {
+              // TODO: set multiple highlights
+              setHighlight({
+                page: args.metadata?.citation?.[0]?.page ?? 1,
+                x: 100,
+                y: 100,
+                width: 0,
+                height: 0,
+              });
+            }}
+            jsonSchema={appliedSchema}
+          />
+        </div>
+      </div>
+    </div>
+  );
+}
@@ -0,0 +1,15 @@
+/// <reference types="vite/client" />
+
+/**
+ * Typed view of the values the UI reads from `import.meta.env`.
+ *
+ * Keys marked optional may be absent at build time; the others are declared
+ * as always-present strings.
+ */
+interface ImportMetaEnv {
+  readonly VITE_LLAMA_CLOUD_API_KEY?: string;
+  readonly VITE_LLAMA_CLOUD_BASE_URL?: string;
+
+  // injected from llama_deploy
+  // NOTE(review): the `define` block in vite.config.ts does not emit this
+  // key; it emits VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH instead. Kept for
+  // backward compatibility; confirm whether anything still sets it.
+  readonly VITE_LLAMA_DEPLOY_BASE_PATH: string;
+  // Key actually emitted by vite.config.ts. Optional because it is derived
+  // from an environment variable that may be unset.
+  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH?: string;
+  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string;
+  // NOTE(review): vite.config.ts only defines this key when
+  // LLAMA_DEPLOY_PROJECT_ID is set, so it may be undefined at runtime
+  // despite the non-optional type. Confirm before relying on it.
+  readonly VITE_LLAMA_DEPLOY_PROJECT_ID: string;
+}
+
+// Declares `import.meta.env` with the ImportMetaEnv shape so that reads of
+// the keys above are type-checked.
+interface ImportMeta {
+  readonly env: ImportMetaEnv;
+}
@@ -0,0 +1,31 @@
+{
+  /* TypeScript compiler settings for this Vite + React project. */
+  "compilerOptions": {
+    "target": "ES2020",
+    "useDefineForClassFields": true,
+    "lib": ["ES2020", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    /* tsc only type-checks here; no output files are written */
+    "noEmit": true,
+    "jsx": "react-jsx",
+    
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    
+    /* Path mapping */
+    /* "@/..." resolves into ./src; vite.config.ts declares the same alias */
+    "baseUrl": ".",
+    "paths": {
+      "@/*": ["./src/*"]
+    }
+  },
+  /* NOTE(review): "src/vite-env.d.ts" is already matched by "src" */
+  "include": ["src", "vite.config.ts", "src/vite-env.d.ts"],
+  "exclude": ["node_modules"]
+}
@@ -0,0 +1,43 @@
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react";
+import path from "path";
+
+// https://vitejs.dev/config/
+/**
+ * Vite configuration factory.
+ *
+ * Reads deployment settings from the process environment and exposes them
+ * to client code as `import.meta.env.VITE_*` compile-time constants.
+ *
+ * Environment variables read:
+ * - LLAMA_DEPLOY_DEPLOYMENT_NAME: exposed as VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME
+ * - LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH: used as `base` and exposed as
+ *   VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH
+ * - LLAMA_DEPLOY_PROJECT_ID: exposed as VITE_LLAMA_DEPLOY_PROJECT_ID, only when set
+ * - LLAMA_CLOUD_BASE_URL: exposed as VITE_LLAMA_CLOUD_BASE_URL, only when set
+ * - PORT: dev server port, defaulting to 3000
+ */
+export default defineConfig(() => {
+  const deploymentName = process.env.LLAMA_DEPLOY_DEPLOYMENT_NAME;
+  const basePath = process.env.LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH;
+  const projectId = process.env.LLAMA_DEPLOY_PROJECT_ID;
+  const baseUrl = process.env.LLAMA_CLOUD_BASE_URL;
+
+  // Fall back to 3000 when PORT is unset, empty, or not numeric, so that
+  // NaN is never handed to the dev server.
+  const rawPort = process.env.PORT;
+  const parsedPort = rawPort ? Number(rawPort) : NaN;
+  const port = Number.isNaN(parsedPort) ? 3000 : parsedPort;
+
+  return {
+    plugins: [react()],
+    resolve: {
+      alias: {
+        "@": path.resolve(__dirname, "./src"),
+      },
+    },
+    server: {
+      port: port,
+      host: true,
+    },
+    build: {
+      outDir: "dist",
+      sourcemap: true,
+    },
+    base: basePath,
+    define: {
+      // Primary define uses NAME
+      "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME": JSON.stringify(
+        deploymentName
+      ),
+      "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH": JSON.stringify(basePath),
+      // The two keys below are only defined when their env var is set.
+      ...(projectId && {
+        "import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID":
+          JSON.stringify(projectId),
+      }),
+      ...(baseUrl && {
+        "import.meta.env.VITE_LLAMA_CLOUD_BASE_URL": JSON.stringify(baseUrl),
+      }),
+    },
+  };
+});
@@ -0,0 +1,2 @@
+# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
+{{ _copier_answers|to_nice_yaml -}}