Add back classify-extract-sec

This commit is contained in:
Adrian Lyjak
2025-11-04 17:29:54 -05:00
parent 9433b0dab5
commit 726d6c0e4c
35 changed files with 2057 additions and 2 deletions
+3
View File
@@ -0,0 +1,3 @@
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
_commit: v0.2.1
_src_path: https://github.com/run-llama/template-workflow-data-extraction
+2
View File
@@ -0,0 +1,2 @@
# copy to .env and place any needed secrets here. LLAMA_CLOUD_API_KEY will be automatically set
# OPENAI_API_KEY=sk-xxx
+7
View File
@@ -0,0 +1,7 @@
.env
__pycache__
workflows.db
.venv
package-lock.json
node_modules
+67 -2
View File
@@ -1,2 +1,67 @@
# template-workflow-classify-extract-sec
Llama Index Workflow Template
# SEC Filing Data Extraction and Analysis
A LlamaAgents application for extracting structured information from SEC filings using LlamaClassify and LlamaExtract. This application automatically classifies SEC documents (10-K, 10-Q, 8-K, or other) and extracts relevant financial and business information tailored to each filing type.
## Features
- **Intelligent Classification**: Uses LlamaClassify to automatically identify SEC filing types (10-K, 10-Q, 8-K, other)
- **Dynamic Schema Selection**: Applies specialized extraction schemas based on document type
- **Comprehensive Data Extraction**: Extracts filing-specific information:
- **10-K**: Annual reports with financial metrics, risk factors, business descriptions, executive information
- **10-Q**: Quarterly reports with period-over-period comparisons and updates
- **8-K**: Current reports with material event information and impact analysis
- **Other**: Catch-all for S-1, DEF 14A, 13F, and other filing types
- **Agent Data Storage**: Stores extracted data in LlamaCloud Agent Data for easy querying and analysis
- **UI Integration**: Web interface for reviewing and managing extracted data
## Configuration
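All main configuration lives in `src/extraction_review/config.py`: the extraction agent name, the Agent Data collection name, the per-filing-type extraction schemas (`FILING_SCHEMAS`), and the extraction settings (`EXTRACT_CONFIG`).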
## How It Works
The application uses a multi-step workflow powered by LlamaIndex:
1. **File Upload**: User uploads an SEC filing document through the UI
2. **Download**: File is downloaded from LlamaCloud storage
3. **Classification**: LlamaClassify analyzes the first 5 pages to determine filing type (10-K, 10-Q, 8-K, or other)
4. **Schema Selection**: Appropriate extraction schema is selected based on classification
5. **Extraction**: LlamaExtract processes the document using the selected schema
6. **Storage**: Extracted data is stored in Agent Data with deduplication by file hash
7. **Review**: UI displays extracted data for review and editing
### Workflows
The application includes two main workflows:
- **`process-file`** (`src/extraction_review/process_file.py`): Main workflow for processing SEC filings
- Steps: download → classify → extract → store
- Uses typed context to pass state between steps
- Streams progress updates to UI via `UIToast` events
- **`metadata`** (`src/extraction_review/metadata_workflow.py`): Exposes configuration metadata to UI
- Returns JSON schema and collection name for dynamic UI generation
## Linting and Type Checking
The Python and JavaScript packages each provide scripts to lint, format, and type check the code.
To check and fix python code:
```bash
uv run hatch run lint
uv run hatch run typecheck
uv run hatch run test
# run all at once
uv run hatch run all-fix
```
To check and fix javascript code, within the `ui` directory:
```bash
pnpm run lint    # type checks with `tsc --noEmit`
pnpm run format
pnpm run build
# run all at once
pnpm run all-fix
```
+51
View File
@@ -0,0 +1,51 @@
[project]
name = "extraction-review"
version = "0.1.0"
description = "Extracts data"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"llama-cloud-services>=0.6.69",
"llama-index-workflows>=2.2.0,<3.0.0",
"python-dotenv>=1.1.0",
"jsonref>=1.1.0",
"click>=8.2.1,<8.3.0",
"httpx>=0.28.1",
"llama-index-core>=0.14.0",
]
[dependency-groups]
dev = [
"ruff>=0.11.10",
"typescript>=0.0.12",
"ty>=0.0.1a16",
"pytest>=8.4.1",
"hatch>=1.14.1",
"llamactl>=0.3.0"
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.envs.default.scripts]
"format" = "ruff format ."
"format-check" = "ruff format --check ."
"lint" = "ruff check --fix ."
"lint-check" = ["ruff check ."]
typecheck = "ty check src"
test = "pytest"
"all-check" = ["format-check", "lint-check", "test"]
"all-fix" = ["format", "lint", "test"]
[tool.llamadeploy]
env_files = [".env"]
llama_cloud = true
[tool.llamadeploy.workflows]
process-file = "extraction_review.process_file:workflow"
metadata = "extraction_review.metadata_workflow:workflow"
[tool.llamadeploy.ui]
directory = "ui"
View File
+88
View File
@@ -0,0 +1,88 @@
import functools
import os
from typing import Any
import httpx
from llama_cloud_services import ExtractionAgent, LlamaExtract
from llama_cloud.core.api_error import ApiError
from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData
from llama_cloud_services.beta.classifier.client import ClassifyClient
from llama_cloud.client import AsyncLlamaCloud
import logging
from extraction_review.config import (
EXTRACT_CONFIG,
EXTRACTED_DATA_COLLECTION,
EXTRACTION_AGENT_NAME,
USE_REMOTE_EXTRACTION_SCHEMA,
ExtractionSchema,
)
logger = logging.getLogger(__name__)
# Deployed agents take their name from the deployment name.
# Note: make sure that an agent deployment with this name actually exists,
# otherwise calls to get or set data will fail. There is no fallback here, so
# this is None when the variable is unset (e.g. local development); add an
# `or "<name>"` default if you need a specific agent name while developing.
agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME")
# Required for all LlamaCloud calls; importing this module raises KeyError when it is missing.
api_key = os.environ["LLAMA_CLOUD_API_KEY"]
# Optional override, for running against an environment other than production.
base_url = os.getenv("LLAMA_CLOUD_BASE_URL")
# Optional project scope; forwarded to the extract, classify and HTTP clients below.
project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID")
@functools.lru_cache(maxsize=None)
def get_extract_agent() -> ExtractionAgent:
    """Return the extraction agent named ``EXTRACTION_AGENT_NAME``, creating it if missing.

    The result is memoised, so every caller shares one agent object.

    When ``USE_REMOTE_EXTRACTION_SCHEMA`` is false, an existing agent has its
    schema and config overwritten with the local ``ExtractionSchema`` and
    ``EXTRACT_CONFIG``. A 404 from the lookup leads to the agent being created
    from the local schema; any other ``ApiError`` is re-raised.
    """
    client = LlamaExtract(api_key=api_key, base_url=base_url, project_id=project_id)
    try:
        agent = client.get_agent(EXTRACTION_AGENT_NAME)
        if not USE_REMOTE_EXTRACTION_SCHEMA:
            agent.data_schema = ExtractionSchema
            agent.config = EXTRACT_CONFIG
        return agent
    except ApiError as err:
        # Only "not found" is recoverable; everything else propagates unchanged.
        if err.status_code != 404:
            raise
        if USE_REMOTE_EXTRACTION_SCHEMA:
            logger.warning(
                "Extraction agent does not exist, creating a new one from the local schema"
            )
        return client.create_agent(
            name=EXTRACTION_AGENT_NAME,
            data_schema=ExtractionSchema,
            config=EXTRACT_CONFIG,
        )
@functools.lru_cache(maxsize=None)
def get_data_client() -> AsyncAgentDataClient:
    """Return the memoised Agent Data client for the extracted-data collection."""
    record_type = ExtractedData[Any]
    cloud_client = get_llama_cloud_client()
    return AsyncAgentDataClient(
        client=cloud_client,
        type=record_type,
        collection=EXTRACTED_DATA_COLLECTION,
        deployment_name=agent_name,
    )
@functools.lru_cache(maxsize=None)
def get_llama_cloud_client():
    """Return the memoised async LlamaCloud client.

    The underlying HTTP client uses a 60 second timeout and sends a
    ``Project-Id`` header only when ``project_id`` is set.
    """
    extra_headers = {"Project-Id": project_id} if project_id else None
    http_client = httpx.AsyncClient(timeout=60, headers=extra_headers)
    return AsyncLlamaCloud(
        base_url=base_url,
        token=api_key,
        httpx_client=http_client,
    )
@functools.lru_cache(maxsize=None)
def get_classifier_client():
    """Return the memoised classifier client, built on the shared LlamaCloud client."""
    cloud_client = get_llama_cloud_client()
    return ClassifyClient(client=cloud_client, project_id=project_id)
+362
View File
@@ -0,0 +1,362 @@
"""
For simple configuration of the extraction review application, just customize this file.
If you need more control, feel free to edit the rest of the application
"""
from __future__ import annotations
import os
from typing import Type
from llama_cloud import ExtractConfig
from llama_cloud_services.extract import ExtractMode
from pydantic import BaseModel, Field
# If you change this to True, the schema and extraction configuration will be fetched from the remote extraction agent
# rather than using the ExtractionSchema and configuration defined below.
USE_REMOTE_EXTRACTION_SCHEMA: bool = False
# The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments.
# Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist.
EXTRACTION_AGENT_NAME: str = (
    os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "extraction-review"
)
# The name of the collection to use for storing extracted data. This will be qualified by the agent name.
# When developing locally, this will use the _public collection (shared within the project), otherwise agent
# data is isolated to each agent
EXTRACTED_DATA_COLLECTION: str = "sec-filing-extraction"
# SEC filing classification types. These match the keys of FILING_SCHEMAS below.
SEC_FILING_TYPES = ["10-K", "10-Q", "8-K", "other"]
# Base class for common fields across all SEC filings.
# NOTE(review): pydantic copies class docstrings and Field descriptions into the generated
# JSON schema, so they are presumably part of what the extractor is given - edit with care.
class BaseSECFiling(BaseModel):
    """Common fields present in all SEC filings"""

    # The only required field in the base schema; everything else defaults to None.
    company_name: str = Field(
        description="The full legal name of the company filing the document"
    )
    ticker_symbol: str | None = Field(
        default=None,
        description="The stock ticker symbol of the company. May not be present for all filings.",
    )
    cik: str | None = Field(
        default=None,
        description="Central Index Key - the unique identifier assigned by the SEC to the company",
    )
    # Dates are kept as plain strings; the YYYY-MM-DD format is requested in the description only.
    filing_date: str | None = Field(
        default=None,
        description="The date the document was filed with the SEC (format: YYYY-MM-DD)",
    )
    fiscal_year_end: str | None = Field(
        default=None,
        description="The fiscal year end date for the company (format: YYYY-MM-DD)",
    )
    sic_code: str | None = Field(
        default=None,
        description="Standard Industrial Classification code for the company's industry",
    )
# Financial metrics that appear in multiple filing types (used by Filing10K, Filing10Q and FilingOther).
# Every value is an optional free-form string, so currency and units are kept as written.
class FinancialMetrics(BaseModel):
    """Key financial metrics extracted from statements"""

    total_revenue: str | None = Field(
        default=None,
        description="Total revenue/sales for the period. Include currency and amount (e.g., '$1.2B USD')",
    )
    net_income: str | None = Field(
        default=None,
        description="Net income/profit for the period. Include currency and amount",
    )
    total_assets: str | None = Field(
        default=None,
        description="Total assets as of the balance sheet date. Include currency and amount",
    )
    total_liabilities: str | None = Field(
        default=None,
        description="Total liabilities as of the balance sheet date. Include currency and amount",
    )
    stockholders_equity: str | None = Field(
        default=None,
        description="Total stockholders' equity. Include currency and amount",
    )
    cash_and_equivalents: str | None = Field(
        default=None,
        description="Cash and cash equivalents. Include currency and amount",
    )
    earnings_per_share: str | None = Field(
        default=None, description="Earnings per share (EPS) for the period"
    )
# Risk factor entry. In this file it is referenced only by Filing10K.risk_factors;
# Filing10Q summarises risk changes as a plain string instead.
class RiskFactor(BaseModel):
    """Individual risk factor identified in the filing"""

    # Both fields are required.
    category: str = Field(
        description="Category of risk (e.g., 'Market Risk', 'Operational Risk', 'Legal Risk')"
    )
    description: str = Field(description="Brief description of the specific risk")
# 10-K: Annual Report. Registered under the "10-K" key of FILING_SCHEMAS and also
# the base of the default ExtractionSchema.
class Filing10K(BaseSECFiling):
    """
    Form 10-K is an annual report required by the SEC that provides a comprehensive
    summary of a company's financial performance.
    """

    # Defaulted rather than constrained: nothing in the type enforces the literal value.
    document_type: str = Field(default="10-K", description="Should always be '10-K'")
    fiscal_year: int | None = Field(
        default=None,
        description="The fiscal year covered by this annual report (e.g., 2023)",
    )
    # Business overview
    business_description: str | None = Field(
        default=None,
        description="A 2-3 sentence summary of the company's business and operations",
    )
    # Financial data
    financial_metrics: FinancialMetrics | None = Field(
        default=None, description="Key financial metrics from the annual statements"
    )
    # Risk factors
    risk_factors: list[RiskFactor] | None = Field(
        default=None,
        description="List of material risk factors disclosed in the filing. Extract 3-5 most significant risks.",
    )
    # Management discussion
    management_discussion_summary: str | None = Field(
        default=None,
        description="2-3 sentence summary of Management's Discussion and Analysis (MD&A) section",
    )
    # Legal proceedings
    legal_proceedings: list[str] | None = Field(
        default=None,
        description="List of significant legal proceedings or litigation mentioned",
    )
    # Executive officers
    executive_officers: list[str] | None = Field(
        default=None,
        description="Names and titles of key executive officers (CEO, CFO, etc.)",
    )
    # Auditor information
    auditor_name: str | None = Field(
        default=None,
        description="Name of the independent registered public accounting firm",
    )
    # Key insights
    key_highlights: list[str] | None = Field(
        default=None,
        description="3-5 key highlights or notable items from the annual report",
    )
# 10-Q: Quarterly Report. Registered under the "10-Q" key of FILING_SCHEMAS.
class Filing10Q(BaseSECFiling):
    """
    Form 10-Q is a quarterly report that provides a continuing view of a company's
    financial position during the year.
    """

    # Defaulted rather than constrained: nothing in the type enforces the literal value.
    document_type: str = Field(default="10-Q", description="Should always be '10-Q'")
    fiscal_quarter: str | None = Field(
        default=None,
        description="The fiscal quarter covered (e.g., 'Q1 2024', 'Q2 2023')",
    )
    fiscal_year: int | None = Field(
        default=None, description="The fiscal year for this quarter (e.g., 2024)"
    )
    period_end_date: str | None = Field(
        default=None,
        description="The end date of the quarterly period (format: YYYY-MM-DD)",
    )
    # Financial data
    financial_metrics: FinancialMetrics | None = Field(
        default=None, description="Key financial metrics from the quarterly statements"
    )
    # Comparison to prior periods (free-form strings, not numeric percentages)
    year_over_year_revenue_change: str | None = Field(
        default=None,
        description="Year-over-year revenue change percentage or description (e.g., 'up 15%')",
    )
    quarter_over_quarter_revenue_change: str | None = Field(
        default=None,
        description="Quarter-over-quarter revenue change percentage or description",
    )
    # Management discussion
    management_discussion_summary: str | None = Field(
        default=None,
        description="2-3 sentence summary of Management's Discussion and Analysis for the quarter",
    )
    # Risk factors: a single summary string here, unlike the RiskFactor list on Filing10K
    material_changes_to_risks: str | None = Field(
        default=None,
        description="Summary of any material changes to risk factors since the last 10-K",
    )
    # Legal updates
    legal_proceedings_updates: list[str] | None = Field(
        default=None,
        description="Updates to legal proceedings or new litigation since last filing",
    )
    # Key insights
    key_highlights: list[str] | None = Field(
        default=None,
        description="3-5 key highlights or notable items from the quarterly report",
    )
# 8-K: Current Report. Registered under the "8-K" key of FILING_SCHEMAS.
class Filing8K(BaseSECFiling):
    """
    Form 8-K is a current report used to notify investors of significant events
    that shareholders should know about.
    """

    # Defaulted rather than constrained: nothing in the type enforces the literal value.
    document_type: str = Field(default="8-K", description="Should always be '8-K'")
    # Event information
    event_date: str | None = Field(
        default=None,
        description="The date of the event being reported (format: YYYY-MM-DD)",
    )
    event_type: str | None = Field(
        default=None,
        description="Type of event (e.g., 'Merger/Acquisition', 'Leadership Change', 'Earnings Release', 'Material Agreement')",
    )
    item_numbers: list[str] | None = Field(
        default=None,
        description="Item numbers from the 8-K form (e.g., ['1.01', '5.02']) indicating which sections are included",
    )
    # Event description; event_summary is required (no default), unlike most other fields.
    event_summary: str = Field(
        description="2-4 sentence summary describing the material event being reported"
    )
    event_details: str | None = Field(
        default=None,
        description="More detailed description of the event and its implications",
    )
    # Financial impact
    estimated_financial_impact: str | None = Field(
        default=None,
        description="Estimated financial impact of the event, if disclosed",
    )
    # Related parties
    related_parties: list[str] | None = Field(
        default=None,
        description="Names of other companies, individuals, or entities involved in the event",
    )
    # Exhibits filed
    material_exhibits: list[str] | None = Field(
        default=None,
        description="Description of significant exhibits filed with the 8-K (e.g., 'Press Release', 'Material Agreement')",
    )
    # Forward-looking statements
    contains_forward_looking_statements: bool | None = Field(
        default=None,
        description="Whether the filing contains forward-looking statements",
    )
    # Key takeaways
    investment_implications: str | None = Field(
        default=None,
        description="1-2 sentence assessment of potential implications for investors",
    )
# Other filings catch-all. Registered under the "other" key of FILING_SCHEMAS, which the
# workflow also uses as its fallback when classification fails.
class FilingOther(BaseSECFiling):
    """
    Catch-all schema for other SEC filing types (e.g., S-1, DEF 14A, 13F, etc.)
    """

    # Required here (no default), because the concrete form type is not known in advance.
    document_type: str = Field(
        description="The type of SEC filing (e.g., 'S-1', 'DEF 14A', '13F', 'SC 13D')"
    )
    filing_purpose: str | None = Field(
        default=None,
        description="The purpose of this filing type (e.g., 'IPO Registration', 'Proxy Statement', 'Insider Holdings')",
    )
    # Required (no default).
    summary: str = Field(
        description="3-4 sentence summary of the filing's key content and purpose"
    )
    key_information: list[str] | None = Field(
        default=None,
        description="List of 3-7 key pieces of information from the filing",
    )
    # Note the field name differs from `financial_metrics` used by Filing10K / Filing10Q.
    financial_data: FinancialMetrics | None = Field(
        default=None, description="Any relevant financial metrics present in the filing"
    )
    material_events: list[str] | None = Field(
        default=None,
        description="List of any material events or transactions described",
    )
    parties_involved: list[str] | None = Field(
        default=None,
        description="Other parties mentioned (companies, executives, investors, etc.)",
    )
    investment_relevance: str | None = Field(
        default=None,
        description="Brief note on why this filing might be relevant for investment analysis",
    )
# Default schema for backward compatibility - now uses 10-K as the base.
# It adds no fields of its own; it exists so that code importing `ExtractionSchema`
# keeps working.
class ExtractionSchema(Filing10K):
    """Default extraction schema - uses 10-K structure for backward compatibility"""

    pass
# Mapping of filing types to their schemas. The keys are the same strings as
# SEC_FILING_TYPES; lookups are case-sensitive ("other" is lower-case).
FILING_SCHEMAS = {
    "10-K": Filing10K,
    "10-Q": Filing10Q,
    "8-K": Filing8K,
    "other": FilingOther,
}
# This is only used if USE_REMOTE_EXTRACTION_SCHEMA is False.
# NOTE(review): the agent-creation path in clients.py appears to pass this config
# as well when the remote agent is missing - confirm before relying on the line above.
EXTRACT_CONFIG = ExtractConfig(
    extraction_mode=ExtractMode.PREMIUM,
    system_prompt=None,
    # advanced. Only compatible with Premium mode.
    use_reasoning=False,
    cite_sources=False,
    confidence_scores=True,
)
# Local schema, or None when the schema has to be fetched from the remote agent.
SCHEMA: Type[BaseModel] | None = (
    None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema
)
@@ -0,0 +1,36 @@
from typing import Any
from workflows import Workflow, step
from workflows.events import StartEvent, StopEvent
import jsonref
from .config import EXTRACTED_DATA_COLLECTION, FILING_SCHEMAS
# Stop event returned by MetadataWorkflow.
class MetadataResponse(StopEvent):
    # JSON schema per filing type, keyed by the FILING_SCHEMAS keys.
    schemas: dict[str, dict[str, Any]]
    # Name of the collection that holds the extracted data.
    extracted_data_collection: str
class MetadataWorkflow(Workflow):
    """
    Simple single step workflow to expose configuration to the UI, such as all JSON schemas and collection name.
    """

    @step
    async def get_metadata(self, _: StartEvent) -> MetadataResponse:
        # Build one JSON schema per filing type, with every $ref inlined
        # (proxies=False asks jsonref for plain objects instead of lazy proxies).
        resolved_schemas = {
            filing_type: jsonref.replace_refs(
                schema_class.model_json_schema(), proxies=False
            )
            for filing_type, schema_class in FILING_SCHEMAS.items()
        }
        return MetadataResponse(
            extracted_data_collection=EXTRACTED_DATA_COLLECTION,
            schemas=resolved_schemas,
        )
# Module-level instance referenced from pyproject.toml as "extraction_review.metadata_workflow:workflow".
workflow = MetadataWorkflow(timeout=None)
+401
View File
@@ -0,0 +1,401 @@
import asyncio
import hashlib
import logging
import os
from pathlib import Path
import tempfile
from typing import Any, Literal
import httpx
from llama_cloud import ClassificationResult, ExtractRun
from llama_cloud.types import ClassifierRule, ClassifyParsingConfiguration
from llama_cloud_services.extract import SourceText
from llama_cloud_services.beta.agent_data import ExtractedData, InvalidExtractionData
from pydantic import BaseModel
from workflows import Context, Workflow, step
from workflows.events import Event, StartEvent, StopEvent
from .clients import (
get_classifier_client,
get_llama_cloud_client,
get_data_client,
get_extract_agent,
)
from .config import FILING_SCHEMAS
# Module logger, named after this module.
logger = logging.getLogger(__name__)
# Start event of ProcessFileWorkflow.
class FileEvent(StartEvent):
    # Identifier of the uploaded file; used to fetch its metadata and content.
    file_id: str
# Emitted by run_file to trigger download_file. Carries no payload; the file id travels in workflow state.
class DownloadFileEvent(Event):
    pass
# NOTE(review): no step in this module emits or consumes this event - possibly a leftover; confirm before removing.
class FileDownloadedEvent(Event):
    pass
# Emitted by download_file once the file is on disk; triggers classify_file.
class ClassifyFileEvent(Event):
    pass
# Emitted by classify_file; triggers process_file.
class FileClassifiedEvent(Event):
    # Classified filing type, or "other" when classification fell back.
    filing_type: str
    # Left as None on the fallback paths.
    confidence: float | None = None
    reasoning: str | None = None
# Progress / error notification written to the workflow event stream by the steps.
class UIToast(Event):
    level: Literal["info", "warning", "error"]
    message: str
# Emitted by process_file when the extraction result validated against the chosen schema.
class ExtractedEvent(Event):
    data: ExtractedData
# Emitted by process_file when validation raised InvalidExtractionData; carries the raw, unvalidated item.
class ExtractedInvalidEvent(Event):
    data: ExtractedData[dict[str, Any]]
# Typed workflow state shared between the steps of ProcessFileWorkflow.
# Every field starts as None and is filled in as the steps run.
class ExtractionState(BaseModel):
    # Set by run_file.
    file_id: str | None = None
    # Set by download_file: local path of the downloaded copy and the remote file name.
    file_path: str | None = None
    filename: str | None = None
    # Set by classify_file; confidence and reasoning stay None on the fallback paths.
    filing_type: str | None = None
    classification_confidence: float | None = None
    classification_reasoning: str | None = None
class ProcessFileWorkflow(Workflow):
    """
    Given a file id, this workflow will process a single file through the custom extraction logic.

    Steps: run_file -> download_file -> classify_file -> process_file -> record_extracted_data.
    State is shared between steps through ``ExtractionState``; progress and errors
    are reported to the event stream as ``UIToast`` events.
    """

    @step()
    async def run_file(
        self, event: FileEvent, ctx: Context[ExtractionState]
    ) -> DownloadFileEvent:
        """Store the incoming file id in workflow state and start the download."""
        logger.info(f"Running file {event.file_id}")
        async with ctx.store.edit_state() as state:
            state.file_id = event.file_id
        return DownloadFileEvent()

    @step()
    async def download_file(
        self, event: DownloadFileEvent, ctx: Context[ExtractionState]
    ) -> ClassifyFileEvent:
        """Download the file reference from the cloud storage

        The file is written into the system temp directory. Any failure is
        logged, reported as an error toast and re-raised.

        Raises:
            ValueError: if no file id has been stored in state.
        """
        state = await ctx.store.get_state()
        if state.file_id is None:
            raise ValueError("File ID is not set")

        try:
            cloud = get_llama_cloud_client()
            file_metadata = await cloud.files.get_file(id=state.file_id)
            file_url = await cloud.files.read_file_content(state.file_id)

            filename = file_metadata.name
            # The name comes from remote metadata: keep only its last path
            # component so it cannot point outside the temp directory.
            safe_name = os.path.basename(filename.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                safe_name = state.file_id
            file_path = os.path.join(tempfile.gettempdir(), safe_name)

            # Report progress to the UI
            logger.info(f"Downloading file {file_url.url} to {file_path}")

            # Context managers close both the client and the response stream.
            async with httpx.AsyncClient() as client:
                async with client.stream("GET", file_url.url) as response:
                    # Fail on 4xx/5xx instead of saving an error body as the document.
                    response.raise_for_status()
                    with open(file_path, "wb") as f:
                        async for chunk in response.aiter_bytes():
                            f.write(chunk)

            logger.info(f"Downloaded file {file_url.url} to {file_path}")
            async with ctx.store.edit_state() as editable:
                editable.file_path = file_path
                editable.filename = filename
            return ClassifyFileEvent()

        except Exception as e:
            logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True)
            ctx.write_event_to_stream(
                UIToast(
                    level="error",
                    message=f"Error downloading file {state.file_id}: {e}",
                )
            )
            raise

    @staticmethod
    def _classification_rules() -> list[ClassifierRule]:
        """Build the classifier rules, one per supported SEC filing type."""
        return [
            ClassifierRule(
                type="10-K",
                description=(
                    "Form 10-K is an annual report filed by public companies with the SEC. "
                    "It provides a comprehensive summary of a company's financial performance for the year, "
                    "including audited financial statements, management's discussion and analysis (MD&A), "
                    "risk factors, business description, and executive compensation. "
                    "Look for: 'Form 10-K', 'Annual Report', fiscal year references, audited financials."
                ),
            ),
            ClassifierRule(
                type="10-Q",
                description=(
                    "Form 10-Q is a quarterly report filed by public companies with the SEC. "
                    "It provides unaudited financial statements and management discussion for a specific quarter. "
                    "Contains quarterly financial data, updates on business operations, and material changes. "
                    "Look for: 'Form 10-Q', 'Quarterly Report', quarter references (Q1, Q2, Q3), unaudited statements."
                ),
            ),
            ClassifierRule(
                type="8-K",
                description=(
                    "Form 8-K is a current report filed to announce material events or corporate changes. "
                    "Used to notify investors of significant events like mergers, acquisitions, leadership changes, "
                    "earnings releases, or other material corporate events that shareholders should know about. "
                    "Look for: 'Form 8-K', 'Current Report', Item numbers (e.g., Item 1.01, Item 5.02), event dates, "
                    "specific triggering events."
                ),
            ),
            ClassifierRule(
                type="other",
                description=(
                    "Any other SEC filing type not covered by 10-K, 10-Q, or 8-K. "
                    "This includes forms such as S-1 (IPO registration), DEF 14A (proxy statement), "
                    "13F (institutional holdings), SC 13D (beneficial ownership), and other SEC forms."
                ),
            ),
        ]

    async def _default_to_other(
        self, ctx: Context[ExtractionState]
    ) -> FileClassifiedEvent:
        """Record the fallback filing type "other" in state and return the matching event."""
        async with ctx.store.edit_state() as editable:
            editable.filing_type = "other"
        return FileClassifiedEvent(filing_type="other")

    @step()
    async def classify_file(
        self, event: ClassifyFileEvent, ctx: Context[ExtractionState]
    ) -> FileClassifiedEvent:
        """Classify the SEC filing document type

        Classification is best-effort: a missing result or any exception
        falls back to the "other" filing type rather than failing the run.

        Raises:
            ValueError: if the file has not been downloaded yet.
        """
        state = await ctx.store.get_state()
        if state.file_path is None or state.filename is None:
            raise ValueError("File path or filename is not set")

        try:
            logger.info(f"Classifying file {state.filename}")
            ctx.write_event_to_stream(
                UIToast(level="info", message=f"Classifying file {state.filename}")
            )

            classifier = get_classifier_client()

            # Configure parsing - only parse first few pages for classification
            parsing_config = ClassifyParsingConfiguration(
                max_pages=5,  # Only parse first 5 pages for faster classification
            )

            results = await classifier.aclassify_file_paths(
                rules=self._classification_rules(),
                file_input_paths=[state.file_path],
                parsing_configuration=parsing_config,
            )

            if not results.items:
                # No results, default to "other"
                logger.warning(f"No classification results for {state.filename}")
                return await self._default_to_other(ctx)

            result: ClassificationResult | None = results.items[0].result
            if not result:
                # Classification failed, default to "other"
                logger.warning(
                    f"Classification failed for {state.filename}, defaulting to 'other'"
                )
                ctx.write_event_to_stream(
                    UIToast(
                        level="warning",
                        message="Classification uncertain, using default schema",
                    )
                )
                return await self._default_to_other(ctx)

            filing_type = result.type
            confidence = result.confidence
            reasoning = result.reasoning

            logger.info(
                f"Classified {state.filename} as {filing_type} "
                f"(confidence: {confidence}, reasoning: {reasoning})"
            )
            ctx.write_event_to_stream(
                UIToast(
                    level="info",
                    message=f"Classified as {filing_type} SEC filing",
                )
            )

            async with ctx.store.edit_state() as editable:
                editable.filing_type = filing_type
                editable.classification_confidence = confidence
                editable.classification_reasoning = reasoning

            return FileClassifiedEvent(
                filing_type=filing_type,
                confidence=confidence,
                reasoning=reasoning,
            )

        except Exception as e:
            logger.error(f"Error classifying file {state.filename}: {e}", exc_info=True)
            ctx.write_event_to_stream(
                UIToast(
                    level="warning",
                    message=f"Classification failed, using default schema: {e}",
                )
            )
            # On error, default to "other" and continue
            return await self._default_to_other(ctx)

    @step()
    async def process_file(
        self, event: FileClassifiedEvent, ctx: Context[ExtractionState]
    ) -> ExtractedEvent | ExtractedInvalidEvent:
        """Runs the extraction against the file

        Returns ``ExtractedEvent`` when the result validates against the
        selected schema and ``ExtractedInvalidEvent`` when it does not. Any
        other failure is logged, reported as an error toast and re-raised.

        Raises:
            ValueError: if the file has not been downloaded yet.
        """
        state = await ctx.store.get_state()
        if state.file_path is None or state.filename is None:
            raise ValueError("File path or filename is not set")

        try:
            # Resolve the classified type to a key of FILING_SCHEMAS, ignoring
            # case and surrounding whitespace. Unknown types map to "other".
            # Using the canonical key keeps the stored classification equal to
            # a schema key, which upper-casing broke for "other".
            requested = (state.filing_type or "other").strip().lower()
            filing_type = next(
                (key for key in FILING_SCHEMAS if key.lower() == requested),
                "other",
            )
            schema = FILING_SCHEMAS[filing_type]

            logger.info(f"Using schema for filing type: {filing_type}")
            ctx.write_event_to_stream(
                UIToast(
                    level="info",
                    message=f"Extracting data using {filing_type} schema",
                )
            )

            agent = get_extract_agent()
            # Update the agent's data schema for this specific filing type
            # NOTE(review): get_extract_agent() is memoised, so this mutates an
            # object shared by all runs; concurrent runs with different filing
            # types could presumably overwrite each other's schema - confirm.
            agent.data_schema = schema

            # track the content of the file, so as to be able to de-duplicate
            file_content = Path(state.file_path).read_bytes()
            file_hash = hashlib.sha256(file_content).hexdigest()
            source_text = SourceText(
                file=state.file_path,
                filename=state.filename,
            )

            logger.info(f"Extracting data from file {state.filename}")
            ctx.write_event_to_stream(
                UIToast(
                    level="info", message=f"Extracting data from file {state.filename}"
                )
            )
            extracted_result: ExtractRun = await agent.aextract(source_text)

            try:
                logger.info(f"Extracted data: {extracted_result}")
                data = ExtractedData.from_extraction_result(
                    result=extracted_result,
                    schema=schema,
                    file_hash=file_hash,
                )
                # Add classification information to the extracted data
                if data.metadata is None:
                    data.metadata = {}
                data.metadata["classification"] = filing_type
                data.metadata["classification_confidence"] = (
                    state.classification_confidence
                )
                data.metadata["classification_reasoning"] = (
                    state.classification_reasoning
                )
                return ExtractedEvent(data=data)
            except InvalidExtractionData as e:
                # Keep the unvalidated item so it can still be stored and reviewed.
                logger.error(f"Error validating extracted data: {e}", exc_info=True)
                return ExtractedInvalidEvent(data=e.invalid_item)

        except Exception as e:
            logger.error(
                f"Error extracting data from file {state.filename}: {e}",
                exc_info=True,
            )
            ctx.write_event_to_stream(
                UIToast(
                    level="error",
                    message=f"Error extracting data from file {state.filename}: {e}",
                )
            )
            raise

    @step()
    async def record_extracted_data(
        self, event: ExtractedEvent | ExtractedInvalidEvent, ctx: Context
    ) -> StopEvent:
        """Records the extracted data to the agent data API

        Items already stored with the same file hash are deleted first, then
        the new item is created. The stop event's result is the new item's id.
        Failures are logged, reported as an error toast and re-raised.
        """
        try:
            data_client = get_data_client()

            # remove past data when reprocessing the same file
            if event.data.file_hash:
                existing_data = await data_client.untyped_search(
                    filter={
                        "file_hash": {
                            "eq": event.data.file_hash,
                        },
                    },
                )
                if existing_data.items:
                    logger.info(
                        f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}"
                    )
                    await asyncio.gather(
                        *[
                            data_client.delete_item(item.id)
                            for item in existing_data.items
                        ]
                    )

            # finally, save the new data
            item_id = await data_client.create_item(event.data)

            # Announce success only once the item has actually been stored.
            logger.info(f"Recorded extracted data for file {event.data.file_name}")
            ctx.write_event_to_stream(
                UIToast(
                    level="info",
                    message=f"Recorded extracted data for file {event.data.file_name}",
                )
            )
            return StopEvent(
                result=item_id.id,
            )

        except Exception as e:
            logger.error(
                f"Error recording extracted data for file {event.data.file_name}: {e}",
                exc_info=True,
            )
            ctx.write_event_to_stream(
                UIToast(
                    level="error",
                    message=f"Error recording extracted data for file {event.data.file_name}: {e}",
                )
            )
            raise
# Module-level instance referenced from pyproject.toml as "extraction_review.process_file:workflow".
workflow = ProcessFileWorkflow(timeout=None)

if __name__ == "__main__":
    # Manual smoke test: upload ./test.pdf and run it through the workflow.
    # NOTE(review): this module uses relative imports, so it presumably has to be
    # started with `python -m extraction_review.process_file` - confirm.
    from dotenv import load_dotenv

    load_dotenv()
    logging.basicConfig(level=logging.INFO)

    async def main():
        # Close the file handle once the upload has finished.
        with Path("test.pdf").open("rb") as pdf:
            file = await get_llama_cloud_client().files.upload_file(upload_file=pdf)
        await workflow.run(start_event=FileEvent(file_id=file.id))

    asyncio.run(main())
+57
View File
@@ -0,0 +1,57 @@
"""
Selects a locally defined schema, or queries the remote extraction agent for the schema.
"""
import asyncio
import jsonref
from .clients import get_extract_agent
from .config import USE_REMOTE_EXTRACTION_SCHEMA, ExtractionSchema
from typing import Any, Type
from pydantic import BaseModel
from pydantic import create_model, Field
# Cached schema: the local ExtractionSchema, or None until it has been fetched
# from the remote agent by get_extraction_schema().
SCHEMA: Type[BaseModel] | None = (
    None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema
)
# Serialises the first remote fetch so that concurrent callers do not each build the model.
_schema_lock = asyncio.Lock()
async def get_extraction_schema() -> Type[BaseModel]:
    """Return the extraction schema, building it from the remote agent on first use.

    The result is cached in the module-level ``SCHEMA``. The cache is checked
    once without the lock and once more after acquiring it, so only the first
    caller performs the remote lookup.
    """
    global SCHEMA
    if SCHEMA is None:
        async with _schema_lock:
            # Another coroutine may have filled the cache while we waited.
            if SCHEMA is None:
                remote_agent = get_extract_agent()
                SCHEMA = model_from_schema(remote_agent.data_schema)
    return SCHEMA
async def get_extraction_schema_json() -> dict[str, Any]:
    """Return the extraction schema as a JSON schema dict with all ``$ref`` entries inlined."""
    model = await get_extraction_schema()
    raw_schema = model.model_json_schema()
    return jsonref.replace_refs(raw_schema, proxies=False)
def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]:
    """
    Converts a JSON schema back to a Pydantic model.

    Only the top-level ``properties`` are converted, each to a flat Python type;
    nested objects and array item types are not reconstructed. A ``type`` that
    is missing or unrecognised maps to ``Any``.

    ``type`` may also be a list such as ``["string", "null"]``. The single
    non-null entry selects the Python type, and the field becomes optional when
    ``"null"`` is listed. Lists naming several non-null types map to ``Any``.

    Args:
        schema: JSON schema dict; ``properties``, ``required`` and ``title`` are read.

    Returns:
        A dynamically created model named after ``title`` (default ``"DynamicModel"``).
        Properties listed in ``required`` have no default; all others default to None.
    """
    from typing import Optional

    typemap = {
        "string": str,
        "integer": int,
        "number": float,
        "boolean": bool,
        "array": list,
        "object": dict,
    }
    # Read once instead of on every iteration.
    required = set(schema.get("required", []))
    fields: dict[str, Any] = {}
    for prop, meta in schema.get("properties", {}).items():
        json_type = meta.get("type")
        nullable = False
        if isinstance(json_type, list):
            # A list is unhashable and would make the typemap lookup raise TypeError.
            nullable = "null" in json_type
            concrete = [t for t in json_type if t != "null"]
            json_type = concrete[0] if len(concrete) == 1 else None
        py_type = typemap.get(json_type, Any) if isinstance(json_type, str) else Any
        if nullable and py_type is not Any:
            py_type = Optional[py_type]
        default = ... if prop in required else None
        fields[prop] = (py_type, Field(default, description=meta.get("description")))
    return create_model(schema.get("title", "DynamicModel"), **fields)
+2
View File
@@ -0,0 +1,2 @@
def test_placeholder():
    """Placeholder so the test run collects at least one test; it asserts nothing."""
    return None
+43
View File
@@ -0,0 +1,43 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/versions
# testing
/coverage
# next.js
/.next/
/out/
/dist/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# env files (can opt-in for committing if needed)
.env*
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts
+7
View File
@@ -0,0 +1,7 @@
# Data Extraction UI
This is a simple Vite + React template that builds on the @llamaindex/ui component library
for displaying tables of extracted data.
Ideally, run this with `llamactl` from the parent directory (see [README.md](../README.md)).
You can also run it standalone with `npm run dev`, but workflow integrations will not work.
+21
View File
@@ -0,0 +1,21 @@
{
"$schema": "https://ui.shadcn.com/schema.json",
"style": "new-york",
"rsc": true,
"tsx": true,
"tailwind": {
"config": "",
"css": "src/index.css",
"baseColor": "zinc",
"cssVariables": true,
"prefix": ""
},
"aliases": {
"components": "@/components",
"utils": "@/lib/utils",
"ui": "@/components/ui",
"lib": "@/lib",
"hooks": "@/hooks"
},
"iconLibrary": "lucide"
}
+12
View File
@@ -0,0 +1,12 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Review</title>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>
+45
View File
@@ -0,0 +1,45 @@
{
"name": "extraction-review-ui",
"version": "0.1.0",
"private": true,
"type": "module",
"scripts": {
"dev": "vite",
"build": "tsc && vite build",
"preview": "vite preview",
"lint": "tsc --noEmit",
"format": "prettier --write src",
"format-check": "prettier --check src",
"all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build",
"all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build"
},
"dependencies": {
"@babel/runtime": "^7.27.6",
"@lezer/highlight": "^1.2.1",
"@llamaindex/ui": "^2.1.2",
"@radix-ui/themes": "^3.2.1",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"llama-cloud-services": "^0.3.4",
"lucide-react": "^0.514.0",
"react": "^18.3.0",
"react-dom": "^18.3.0",
"react-router-dom": "^6.30.0",
"sonner": "^2.0.5",
"tw-animate-css": "^1.3.5"
},
"devDependencies": {
"@tailwindcss/postcss": "^4.1.10",
"@types/node": "^20",
"@types/react": "^19",
"@types/react-dom": "^19",
"@vitejs/plugin-react": "^4.3.4",
"postcss": "^8.5.5",
"prettier": "^3.6.2",
"tailwind-merge": "^3.3.1",
"tailwindcss": "^4.1.8",
"typescript": "^5",
"vite": "^6.0.5"
},
"packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912"
}
+7
View File
@@ -0,0 +1,7 @@
// PostCSS configuration: registers the "@tailwindcss/postcss" plugin with default (empty) options.
const config = {
  plugins: {
    "@tailwindcss/postcss": {},
  },
};
export default config;
+70
View File
@@ -0,0 +1,70 @@
import React from "react";
import { Routes, Route } from "react-router-dom";
import { Theme } from "@radix-ui/themes";
import {
Breadcrumb,
BreadcrumbItem,
BreadcrumbList,
BreadcrumbSeparator,
} from "@llamaindex/ui";
import { Link } from "react-router-dom";
import { Toaster } from "@llamaindex/ui";
import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext";
import { MetadataProvider } from "@/lib/MetadataProvider";
// Import pages
import HomePage from "./pages/HomePage";
import ItemPage from "./pages/ItemPage";
/**
 * Root component.
 *
 * Wraps the routed pages in the Radix `Theme`, the metadata provider and the
 * toolbar provider, and lays the screen out as a toolbar row above a
 * scrollable main area. `/` renders the home page and `/item/:itemId` the
 * item review page.
 */
export default function App() {
  return (
    <Theme>
      <MetadataProvider>
        <ToolbarProvider>
          <div className="grid grid-rows-[auto_1fr] h-screen">
            <Toolbar />
            <main className="overflow-auto">
              <Routes>
                <Route path="/" element={<HomePage />} />
                <Route path="/item/:itemId" element={<ItemPage />} />
              </Routes>
            </main>
          </div>
          <Toaster />
        </ToolbarProvider>
      </MetadataProvider>
    </Theme>
  );
}
/**
 * Sticky page header: shows the breadcrumb trail followed by the buttons the
 * current page registered through the toolbar context.
 */
const Toolbar = () => {
  const { buttons, breadcrumbs } = useToolbar();

  const trail = breadcrumbs.map((crumb, position) => {
    // A crumb is a link only when it has a target and is not the current page.
    let label: React.ReactNode;
    if (crumb.href && !crumb.isCurrentPage) {
      label = (
        <Link to={crumb.href} className="font-medium text-base">
          {crumb.label}
        </Link>
      );
    } else {
      label = (
        <span className={`font-medium ${position === 0 ? "text-base" : ""}`}>
          {crumb.label}
        </span>
      );
    }

    return (
      <React.Fragment key={position}>
        {position > 0 && <BreadcrumbSeparator />}
        <BreadcrumbItem>{label}</BreadcrumbItem>
      </React.Fragment>
    );
  });

  return (
    <header className="sticky top-0 z-50 flex h-16 shrink-0 items-center gap-2 border-b px-4 bg-white/95 backdrop-blur supports-[backdrop-filter]:bg-white/60">
      <Breadcrumb>
        <BreadcrumbList>{trail}</BreadcrumbList>
      </Breadcrumb>
      {buttons}
    </header>
  );
};
+120
View File
@@ -0,0 +1,120 @@
@import "tailwindcss";
@import "tw-animate-css";
@custom-variant dark (&:is(.dark *));
@theme inline {
--radius-sm: calc(var(--radius) - 4px);
--radius-md: calc(var(--radius) - 2px);
--radius-lg: var(--radius);
--radius-xl: calc(var(--radius) + 4px);
--color-background: var(--background);
--color-foreground: var(--foreground);
--color-card: var(--card);
--color-card-foreground: var(--card-foreground);
--color-popover: var(--popover);
--color-popover-foreground: var(--popover-foreground);
--color-primary: var(--primary);
--color-primary-foreground: var(--primary-foreground);
--color-secondary: var(--secondary);
--color-secondary-foreground: var(--secondary-foreground);
--color-muted: var(--muted);
--color-muted-foreground: var(--muted-foreground);
--color-accent: var(--accent);
--color-accent-foreground: var(--accent-foreground);
--color-destructive: var(--destructive);
--color-border: var(--border);
--color-input: var(--input);
--color-ring: var(--ring);
--color-chart-1: var(--chart-1);
--color-chart-2: var(--chart-2);
--color-chart-3: var(--chart-3);
--color-chart-4: var(--chart-4);
--color-chart-5: var(--chart-5);
--color-sidebar: var(--sidebar);
--color-sidebar-foreground: var(--sidebar-foreground);
--color-sidebar-primary: var(--sidebar-primary);
--color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
--color-sidebar-accent: var(--sidebar-accent);
--color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
--color-sidebar-border: var(--sidebar-border);
--color-sidebar-ring: var(--sidebar-ring);
}
:root {
--radius: 0.625rem;
--card: oklch(1 0 0);
--card-foreground: oklch(0.141 0.005 285.823);
--popover: oklch(1 0 0);
--popover-foreground: oklch(0.141 0.005 285.823);
--primary: oklch(0.21 0.006 285.885);
--primary-foreground: oklch(0.985 0 0);
--secondary: oklch(0.967 0.001 286.375);
--secondary-foreground: oklch(0.21 0.006 285.885);
--muted: oklch(0.967 0.001 286.375);
--muted-foreground: oklch(0.552 0.016 285.938);
--accent: oklch(0.967 0.001 286.375);
--accent-foreground: oklch(0.21 0.006 285.885);
--destructive: oklch(0.577 0.245 27.325);
--border: oklch(0.92 0.004 286.32);
--input: oklch(0.92 0.004 286.32);
--ring: oklch(0.705 0.015 286.067);
--chart-1: oklch(0.646 0.222 41.116);
--chart-2: oklch(0.6 0.118 184.704);
--chart-3: oklch(0.398 0.07 227.392);
--chart-4: oklch(0.828 0.189 84.429);
--chart-5: oklch(0.769 0.188 70.08);
--sidebar: oklch(0.985 0 0);
--sidebar-foreground: oklch(0.141 0.005 285.823);
--sidebar-primary: oklch(0.21 0.006 285.885);
--sidebar-primary-foreground: oklch(0.985 0 0);
--sidebar-accent: oklch(0.967 0.001 286.375);
--sidebar-accent-foreground: oklch(0.21 0.006 285.885);
--sidebar-border: oklch(0.92 0.004 286.32);
--sidebar-ring: oklch(0.705 0.015 286.067);
--background: oklch(1 0 0);
--foreground: oklch(0.141 0.005 285.823);
}
.dark {
--background: oklch(0.141 0.005 285.823);
--foreground: oklch(0.985 0 0);
--card: oklch(0.21 0.006 285.885);
--card-foreground: oklch(0.985 0 0);
--popover: oklch(0.21 0.006 285.885);
--popover-foreground: oklch(0.985 0 0);
--primary: oklch(0.92 0.004 286.32);
--primary-foreground: oklch(0.21 0.006 285.885);
--secondary: oklch(0.274 0.006 286.033);
--secondary-foreground: oklch(0.985 0 0);
--muted: oklch(0.274 0.006 286.033);
--muted-foreground: oklch(0.705 0.015 286.067);
--accent: oklch(0.274 0.006 286.033);
--accent-foreground: oklch(0.985 0 0);
--destructive: oklch(0.704 0.191 22.216);
--border: oklch(1 0 0 / 10%);
--input: oklch(1 0 0 / 15%);
--ring: oklch(0.552 0.016 285.938);
--chart-1: oklch(0.488 0.243 264.376);
--chart-2: oklch(0.696 0.17 162.48);
--chart-3: oklch(0.769 0.188 70.08);
--chart-4: oklch(0.627 0.265 303.9);
--chart-5: oklch(0.645 0.246 16.439);
--sidebar: oklch(0.21 0.006 285.885);
--sidebar-foreground: oklch(0.985 0 0);
--sidebar-primary: oklch(0.488 0.243 264.376);
--sidebar-primary-foreground: oklch(0.985 0 0);
--sidebar-accent: oklch(0.274 0.006 286.033);
--sidebar-accent-foreground: oklch(0.985 0 0);
--sidebar-border: oklch(1 0 0 / 10%);
--sidebar-ring: oklch(0.552 0.016 285.938);
}
@layer base {
* {
@apply border-border outline-ring/50;
}
body {
@apply bg-background text-foreground;
}
}
+71
View File
@@ -0,0 +1,71 @@
import { createContext, useContext, ReactNode, useMemo } from "react";
import { ApiProvider, ApiClients } from "@llamaindex/ui";
import { useMetadata, Metadata } from "./useMetadata";
import { createBaseWorkflowClient, createClients } from "./client";
import { Clock, XCircle } from "lucide-react";
/** Value shared through MetadataContext once the metadata has been loaded. */
interface MetadataContextValue {
  /** Metadata returned by the "metadata" workflow. */
  metadata: Metadata;
  /** API clients built from that metadata. */
  clients: ApiClients;
}
// Defaults to null so useMetadataContext can detect use outside the provider.
const MetadataContext = createContext<MetadataContextValue | null>(null);
/**
 * Outer provider: supplies a workflows-only client set so the inner provider
 * can run the metadata workflow, then delegates to InnerMetadataProvider.
 */
export function MetadataProvider({ children }: { children: ReactNode }) {
  // Created once per mount; the empty dependency list keeps the client stable.
  const bootstrapClients = useMemo<ApiClients>(
    () => ({ workflowsClient: createBaseWorkflowClient() }) as ApiClients,
    [],
  );

  return (
    <ApiProvider clients={bootstrapClients}>
      <InnerMetadataProvider>{children}</InnerMetadataProvider>
    </ApiProvider>
  );
}
/**
 * Loads the deployment metadata and, once it is available, exposes it together
 * with the full set of API clients to `children`. Until then a loading or an
 * error screen is rendered instead of `children`.
 */
function InnerMetadataProvider({ children }: { children: ReactNode }) {
  const { metadata, loading, error } = useMetadata();
  // Clients are derived from the metadata, so they are rebuilt only when it changes.
  // This hook must stay above the early returns below to keep hook order stable.
  const clients = useMemo(
    () => (metadata ? createClients(metadata) : undefined),
    [metadata],
  );

  if (loading) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
          <div className="text-sm text-gray-500">Loading configuration...</div>
        </div>
      </div>
    );
  }

  // Covers a reported error as well as the case where no metadata is available.
  if (error || !metadata || !clients) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
          <div className="text-sm text-gray-500">
            Error loading configuration: {error || "Unknown error"}
          </div>
        </div>
      </div>
    );
  }

  // The nested ApiProvider supplies the metadata-derived clients to everything below it.
  return (
    <MetadataContext.Provider value={{ metadata, clients }}>
      <ApiProvider clients={clients}>{children}</ApiProvider>
    </MetadataContext.Provider>
  );
}
/**
 * Returns the loaded metadata and API clients.
 *
 * @throws Error when called outside of a MetadataProvider.
 */
export function useMetadataContext() {
  const value = useContext(MetadataContext);
  if (value === null) {
    throw new Error("useMetadataContext must be used within MetadataProvider");
  }
  return value;
}
+41
View File
@@ -0,0 +1,41 @@
import React from "react";
import { APP_TITLE } from "./config";
/** One entry of the breadcrumb trail shown in the toolbar. */
export interface BreadcrumbItem {
  /** Text displayed for the entry. */
  label: string;
  /** Route the entry links to; the toolbar renders plain text when it is absent. */
  href?: string;
  /** When true the toolbar renders the entry as plain text even if `href` is set. */
  isCurrentPage?: boolean;
}
/**
 * Context carrying the toolbar contents (buttons and breadcrumbs) and their
 * setters. The default value has empty lists and no-op setters, so reading it
 * without a ToolbarProvider does not throw.
 */
export const ToolbarCtx = React.createContext<{
  buttons: React.ReactNode[];
  // Takes an updater that receives the previous buttons and returns the next ones.
  setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void;
  breadcrumbs: BreadcrumbItem[];
  setBreadcrumbs: (items: BreadcrumbItem[]) => void;
}>({
  buttons: [],
  setButtons: () => {},
  breadcrumbs: [],
  setBreadcrumbs: () => {},
});
/**
 * Holds the toolbar state (buttons and breadcrumb trail) and shares it with
 * descendants through ToolbarCtx. The trail starts with a single entry that
 * links to the home route.
 */
export const ToolbarProvider = ({
  children,
}: {
  children: React.ReactNode;
}) => {
  const [buttons, setButtons] = React.useState<React.ReactNode[]>([]);
  const homeTrail: BreadcrumbItem[] = [{ label: APP_TITLE, href: "/" }];
  const [breadcrumbs, setBreadcrumbs] =
    React.useState<BreadcrumbItem[]>(homeTrail);

  const contextValue = { buttons, setButtons, breadcrumbs, setBreadcrumbs };

  return (
    <ToolbarCtx.Provider value={contextValue}>{children}</ToolbarCtx.Provider>
  );
};
/** Convenience hook returning the current ToolbarCtx value. */
export const useToolbar = () => React.useContext(ToolbarCtx);
+51
View File
@@ -0,0 +1,51 @@
import { ExtractedData } from "llama-cloud-services/beta/agent";
import {
ApiClients,
createWorkflowsClient,
createWorkflowsConfig,
createCloudAgentClient,
cloudApiClient,
} from "@llamaindex/ui";
import { AGENT_NAME } from "./config";
import type { Metadata } from "./useMetadata";
// Optional settings read from Vite env variables; each may be undefined.
const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY;
const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL;
const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID;
// Configure the platform client. Runs once, as a side effect of importing this module.
// Each setting is only included when its env variable is set.
cloudApiClient.setConfig({
  ...(apiBaseUrl && { baseUrl: apiBaseUrl }),
  headers: {
    // Optionally authenticate with a backend API token scoped to a project.
    // NOTE(review): the original comment ended mid-sentence ("For local development,");
    // presumably the token is meant for local development — confirm.
    ...(platformToken && { authorization: `Bearer ${platformToken}` }),
    // This header is required for requests to correctly scope to the agent's project
    // when authenticating with a user cookie
    ...(projectId && { "Project-Id": projectId }),
  },
});
/**
 * Creates a workflows client whose base URL points at this deployment
 * (`/deployments/<AGENT_NAME>/`).
 */
export function createBaseWorkflowClient(): ReturnType<
  typeof createWorkflowsClient
> {
  const deploymentConfig = createWorkflowsConfig({
    baseUrl: `/deployments/${AGENT_NAME}/`,
  });
  return createWorkflowsClient(deploymentConfig);
}
/**
 * Builds the full client set for the app: the workflows client, the shared
 * cloud API client and an agent data client bound to the collection named in
 * the metadata.
 *
 * @param metadata - Metadata whose `extracted_data_collection` selects the collection.
 */
export function createClients(metadata: Metadata): ApiClients {
  const workflowsClient = createBaseWorkflowClient();

  // `window` is guarded so the module can also be evaluated where it is not defined.
  const currentUrl =
    typeof window !== "undefined" ? window.location.href : undefined;
  const agentDataClient = createCloudAgentClient<ExtractedData<any>>({
    client: cloudApiClient,
    windowUrl: currentUrl,
    collection: metadata.extracted_data_collection,
  });

  return { workflowsClient, cloudApiClient, agentDataClient } as ApiClients;
}
+2
View File
@@ -0,0 +1,2 @@
// Label of the root breadcrumb shown in the toolbar.
export const APP_TITLE = "Extraction Review";
// Deployment name taken from the Vite env; used to build the workflows client base URL.
export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME;
+39
View File
@@ -0,0 +1,39 @@
import type {
ExtractedData,
TypedAgentData,
} from "llama-cloud-services/beta/agent";
/**
 * Serializes `data` as pretty-printed JSON and triggers a browser download.
 *
 * @param data - Value passed to `JSON.stringify` (2-space indentation).
 * @param filename - Name offered for the downloaded file.
 */
export function downloadJSON<T>(
  data: T,
  filename: string = "extraction-results.json",
) {
  const payload = new Blob([JSON.stringify(data, null, 2)], {
    type: "application/json",
  });
  const objectUrl = URL.createObjectURL(payload);

  // A temporary anchor carrying the `download` attribute is clicked programmatically.
  const anchor = document.createElement("a");
  anchor.href = objectUrl;
  anchor.download = filename;
  document.body.appendChild(anchor);
  anchor.click();

  // Remove the temporary anchor and release the object URL.
  document.body.removeChild(anchor);
  URL.revokeObjectURL(objectUrl);
}
/**
 * Downloads a whole extracted-data item as JSON.
 *
 * The file is named `<file_name>-<YYYY-MM-DD>.json`, using the item's file
 * name (or "item" when it has none) and the date part of its creation time.
 */
export function downloadExtractedDataItem<T>(
  item: TypedAgentData<ExtractedData<T>>,
) {
  const baseName = item.data.file_name || "item";
  // The ISO timestamp is split at "T" and only the leading date part is kept.
  const createdDate = item.createdAt.toISOString().split("T")[0];
  downloadJSON(item, `${baseName}-${createdDate}.json`);
}
+41
View File
@@ -0,0 +1,41 @@
import { useWorkflowHandler, useWorkflowRun } from "@llamaindex/ui";
import { useEffect, useState } from "react";
/** Payload of the `MetadataResponse` event emitted by the "metadata" workflow. */
export interface Metadata {
  /** JSON schemas keyed by name; the item page looks them up by classification (e.g. "10-K"). */
  schemas: Record<string, any>;
  /** Collection name handed to the cloud agent data client. */
  extracted_data_collection: string;
}
/**
 * Describes the value returned by `useMetadata`.
 *
 * NOTE(review): `useMetadata` casts its result to `Metadata | undefined`, so
 * `metadata` can be undefined at runtime although it is declared as always
 * present here — confirm and widen the type if that is intended.
 */
export interface UseMetadataResult {
  metadata: Metadata;
  loading: boolean;
  error: string | undefined;
}
/**
 * Starts the "metadata" workflow once on mount and exposes its result.
 *
 * `loading` stays true until the workflow has either delivered its
 * `MetadataResponse` event or failed to start. Previously `loading` turned
 * false as soon as the run was started, so callers briefly saw
 * "not loading, no error, no metadata" and rendered an error screen.
 *
 * @returns `metadata` (undefined until the response event arrives), `loading`
 * and `error` (message of a failed start, otherwise undefined).
 */
export function useMetadata() {
  const run = useWorkflowRun();
  const [handlerId, setHandlerId] = useState<string | undefined>(undefined);
  const handler = useWorkflowHandler(handlerId ?? "");
  const [error, setError] = useState<string | undefined>(undefined);
  const [starting, setStarting] = useState(true);

  useEffect(() => {
    setStarting(true);
    run
      .runWorkflow("metadata", {})
      .then((handlerSummary) => {
        setHandlerId(handlerSummary.handler_id);
      })
      .catch((reason: unknown) => {
        // Always store a string so a non-Error rejection is still reported.
        setError(reason instanceof Error ? reason.message : String(reason));
      })
      .finally(() => {
        setStarting(false);
      });
    // Intentionally run once on mount.
  }, []);

  const stopEvent = handler.events.find((event) =>
    event.type.endsWith("MetadataResponse"),
  );
  const metadata = stopEvent?.data as Metadata | undefined;

  // The run has started but its result event has not been observed yet.
  // NOTE(review): if the workflow ends without emitting MetadataResponse this
  // stays true; consider also checking the handler's status if the hook exposes it.
  const awaitingResult =
    handlerId !== undefined && metadata === undefined && error === undefined;

  return { metadata, loading: starting || awaitingResult, error };
}
+6
View File
@@ -0,0 +1,6 @@
import { clsx, type ClassValue } from "clsx";
import { twMerge } from "tailwind-merge";
/**
 * Combines class name inputs with `clsx` and passes the result through
 * `twMerge`.
 */
export function cn(...inputs: ClassValue[]) {
  const combined = clsx(inputs);
  return twMerge(combined);
}
+14
View File
@@ -0,0 +1,14 @@
import { StrictMode } from "react";
import { createRoot } from "react-dom/client";
import { HashRouter } from "react-router-dom";
import App from "./App";
import "@llamaindex/ui/styles.css";
import "./index.css";
// Application entry point: mounts <App /> inside a HashRouter under React StrictMode.
// The non-null assertion assumes the host page contains an element with id "root".
createRoot(document.getElementById("root")!).render(
  <StrictMode>
    <HashRouter>
      <App />
    </HashRouter>
  </StrictMode>,
);
+23
View File
@@ -0,0 +1,23 @@
.main {
padding: 1rem;
}
.grid {
display: flex;
flex-direction: row;
gap: 1rem;
margin-bottom: 1rem;
& > * {
flex: 1;
}
}
.commandBar {
display: flex;
justify-content: flex-end;
margin-bottom: 1rem;
}
.progressBar {
margin-bottom: 1rem;
}
+88
View File
@@ -0,0 +1,88 @@
import {
ItemCount,
WorkflowTrigger,
WorkflowProgressBar,
ExtractedDataItemGrid,
useWorkflowHandlerList,
} from "@llamaindex/ui";
import type { TypedAgentData } from "llama-cloud-services/beta/agent";
import styles from "./HomePage.module.css";
import { useNavigate } from "react-router-dom";
import { useEffect, useState } from "react";
/**
 * Home page: renders the task list and remounts it whenever a running
 * "process-file" task finishes, so the list reloads its data.
 */
export default function HomePage() {
  const { taskKey } = useTaskCompletedState();
  // A changed key makes React discard and recreate TaskList.
  return <TaskList key={taskKey} />;
}

/**
 * Returns a key that increments when a task is completed, can be used to force a re-render of the task list
 *
 * Named with the `use` prefix because it calls hooks itself; this lets the
 * Rules of Hooks lint checks apply to it and to its callers.
 */
function useTaskCompletedState() {
  const { handlers } = useWorkflowHandlerList("process-file");
  const runningCount = handlers.filter(
    (handler) => handler.status === "running",
  ).length;
  const [runningTaskCount, setRunningTaskCount] = useState(runningCount);
  const [taskKey, setTaskKey] = useState(0);

  useEffect(() => {
    if (runningCount < runningTaskCount) {
      // forcefully reload task list after a task is completed
      // Updater form: derives the next key from the latest state, not the closure.
      setTaskKey((previous) => previous + 1);
    }
    setRunningTaskCount(runningCount);
    // Only reacts to changes in the number of running tasks.
  }, [runningCount]);

  return { runningTaskCount, taskKey };
}
/**
 * Dashboard body: item counters, the upload trigger and progress bar for the
 * "process-file" workflow, and the grid of extracted items. Clicking a grid
 * row navigates to that item's review page.
 */
function TaskList() {
  const navigate = useNavigate();
  const goToItem = (item: TypedAgentData) => {
    navigate(`/item/${item.id}`);
  };
  // NOTE(review): `styles.page` is referenced below, but the accompanying CSS module
  // appears to define only main/grid/commandBar/progressBar — confirm whether a
  // `.page` rule is missing.
  return (
    <div className={styles.page}>
      <main className={styles.main}>
        <div className={styles.grid}>
          <ItemCount title="Total Items" />
          <ItemCount
            title="Reviewed"
            filter={{
              status: { eq: "approved" },
            }}
          />
          <ItemCount
            title="Needs Review"
            filter={{
              status: { eq: "pending_review" },
            }}
          />
        </div>
        <div className={styles.commandBar}>
          <WorkflowTrigger
            workflowName="process-file"
            // Only the first selected file is passed to the workflow.
            customWorkflowInput={(files) => {
              return {
                file_id: files[0].fileId,
              };
            }}
          />
        </div>
        <WorkflowProgressBar
          className={styles.progressBar}
          workflowName="process-file"
        />
        <ExtractedDataItemGrid
          onRowClick={goToItem}
          builtInColumns={{
            fileName: true,
            status: true,
            createdAt: true,
            itemsToReview: true,
            actions: true,
          }}
        />
      </main>
    </div>
  );
}
+189
View File
@@ -0,0 +1,189 @@
import { useEffect, useState } from "react";
import {
AcceptReject,
ExtractedDataDisplay,
FilePreview,
useItemData,
type Highlight,
Button,
} from "@llamaindex/ui";
import { Clock, XCircle, Download } from "lucide-react";
import { useParams } from "react-router-dom";
import { useToolbar } from "@/lib/ToolbarContext";
import { useNavigate } from "react-router-dom";
import { modifyJsonSchema } from "@llamaindex/ui/lib";
import { APP_TITLE } from "@/lib/config";
import { downloadExtractedDataItem } from "@/lib/export";
import { useMetadataContext } from "@/lib/MetadataProvider";
/**
 * Review page for a single extracted item.
 *
 * Shows the source file on the left and the extracted data on the right,
 * using the JSON schema that matches the item's classification (falling back
 * to the "10-K" schema). While mounted it also contributes the file name to
 * the breadcrumb trail and an export/accept/reject button group to the toolbar.
 */
export default function ItemPage() {
  const { itemId } = useParams<{ itemId: string }>();
  const { setButtons, setBreadcrumbs } = useToolbar();
  const [highlight, setHighlight] = useState<Highlight | undefined>(undefined);
  const { metadata } = useMetadataContext();

  // Use the hook to fetch item data (initially with a default schema)
  const itemHookData = useItemData<any>({
    // We'll update the schema based on classification once data loads
    jsonSchema: modifyJsonSchema(metadata.schemas["10-K"] || {}, {}),
    itemId: itemId as string,
    isMock: false,
  });
  // Destructured up front so the effects below read names that are already declared.
  const {
    item: itemData,
    updateData,
    loading: isLoading,
    error,
  } = itemHookData;

  // Determine the correct schema based on classification
  // (a missing classification is treated as "10-K").
  const classification = (
    (itemData?.data?.metadata?.classification as string | undefined) || "10-K"
  ).toUpperCase();
  const correctSchema =
    metadata.schemas[classification] || metadata.schemas["10-K"];

  // Update the schema in itemHookData if classification is available
  const [schemaKey, setSchemaKey] = useState(0);
  const [appliedSchema, setAppliedSchema] = useState(correctSchema);
  useEffect(() => {
    if (classification && metadata.schemas[classification]) {
      setAppliedSchema(modifyJsonSchema(metadata.schemas[classification], {}));
      // Updater form: bumps the key from the latest state rather than the
      // value captured by this effect's closure.
      setSchemaKey((previous) => previous + 1);
    }
  }, [classification, metadata.schemas]);

  const navigate = useNavigate();

  // Update breadcrumb when item data loads
  useEffect(() => {
    const fileName = itemData?.data?.file_name;
    if (fileName) {
      setBreadcrumbs([
        { label: APP_TITLE, href: "/" },
        {
          label: fileName,
          isCurrentPage: true,
        },
      ]);
    }
    return () => {
      // Reset to default breadcrumb when leaving the page
      setBreadcrumbs([{ label: APP_TITLE, href: "/" }]);
    };
  }, [itemData?.data?.file_name, setBreadcrumbs]);

  // Register the toolbar buttons for this page and remove them on unmount.
  useEffect(() => {
    setButtons(() => [
      // The toolbar renders this array directly, so the element needs a key.
      <div key="item-actions" className="ml-auto flex items-center gap-2">
        <Button
          variant="outline"
          size="sm"
          onClick={() => {
            if (itemData) {
              downloadExtractedDataItem(itemData);
            }
          }}
          disabled={!itemData}
        >
          <Download className="h-4 w-4 mr-2" />
          Export JSON
        </Button>
        <AcceptReject<any>
          itemData={itemHookData}
          onComplete={() => navigate("/")}
        />
      </div>,
    ]);
    return () => {
      setButtons(() => []);
    };
    // NOTE(review): the closure also reads itemData, itemHookData and navigate;
    // confirm that `itemHookData.data` changes whenever the item does.
  }, [itemHookData.data, setButtons]);

  const classificationReasoning = itemData?.data?.metadata
    ?.classification_reasoning as string | undefined;

  if (isLoading) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
          <div className="text-sm text-gray-500">Loading item...</div>
        </div>
      </div>
    );
  }

  if (error || !itemData) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
          <div className="text-sm text-gray-500">
            Error loading item: {error || "Item not found"}
          </div>
        </div>
      </div>
    );
  }

  return (
    <div className="flex h-full bg-gray-50">
      {/* Left Side - File Preview */}
      <div className="w-1/2 border-r border-gray-200 bg-white">
        {itemData.data.file_id && (
          <FilePreview
            fileId={itemData.data.file_id}
            onBoundingBoxClick={(box, pageNumber) => {
              console.log("Bounding box clicked:", box, "on page:", pageNumber);
            }}
            highlight={highlight}
          />
        )}
      </div>

      {/* Right Side - Review Panel */}
      <div className="flex-1 bg-white h-full overflow-y-auto">
        <div className="p-4 space-y-4">
          {/* Classification Info */}
          {classification && (
            <div className="bg-blue-50 border border-blue-200 rounded-lg p-3 mb-4">
              <div className="text-sm font-semibold text-blue-900">
                Document Type: {classification}
              </div>
              {classificationReasoning && (
                <div className="text-xs text-blue-600 mt-1">
                  {classificationReasoning}
                </div>
              )}
            </div>
          )}

          {/* Extracted Data; the key remounts the display when the schema changes */}
          <ExtractedDataDisplay<any>
            key={schemaKey}
            extractedData={itemData.data}
            title="Extracted Data"
            onChange={(updatedData) => {
              updateData(updatedData);
            }}
            onClickField={(args) => {
              // TODO: set multiple highlights
              // Only the page of the first citation is used; the box itself is a fixed placeholder.
              setHighlight({
                page: args.metadata?.citation?.[0]?.page ?? 1,
                x: 100,
                y: 100,
                width: 0,
                height: 0,
              });
            }}
            jsonSchema={appliedSchema}
          />
        </div>
      </div>
    </div>
  );
}
+15
View File
@@ -0,0 +1,15 @@
/// <reference types="vite/client" />
/**
 * Build-time variables available on `import.meta.env`.
 *
 * Variables that the Vite config only defines when the matching process
 * variable is set are declared optional, so consumers must handle `undefined`.
 */
interface ImportMetaEnv {
  readonly VITE_LLAMA_CLOUD_API_KEY?: string;
  readonly VITE_LLAMA_CLOUD_BASE_URL?: string;
  // injected from llama_deploy
  readonly VITE_LLAMA_DEPLOY_BASE_PATH: string;
  // Name actually defined by the Vite config for the deployment base path.
  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH?: string;
  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string;
  // Only defined when a project id is configured.
  readonly VITE_LLAMA_DEPLOY_PROJECT_ID?: string;
}
// Types `import.meta.env` with the variables declared in ImportMetaEnv.
interface ImportMeta {
  readonly env: ImportMetaEnv;
}
+31
View File
@@ -0,0 +1,31 @@
{
"compilerOptions": {
"target": "ES2020",
"useDefineForClassFields": true,
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "react-jsx",
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
/* Path mapping */
"baseUrl": ".",
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["src", "vite.config.ts", "src/vite-env.d.ts"],
"exclude": ["node_modules"]
}
+43
View File
@@ -0,0 +1,43 @@
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";
import path from "path";
// https://vitejs.dev/config/
/**
 * Vite configuration. Deployment settings are read from process environment
 * variables and exposed to client code as `import.meta.env.VITE_*` constants.
 */
export default defineConfig(({}) => {
  const {
    LLAMA_DEPLOY_DEPLOYMENT_NAME: deploymentName,
    LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH: basePath,
    LLAMA_DEPLOY_PROJECT_ID: projectId,
    LLAMA_CLOUD_BASE_URL: baseUrl,
    PORT: rawPort,
  } = process.env;

  // Dev server port: PORT when set, otherwise 3000.
  const port = rawPort ? Number(rawPort) : 3000;

  // Compile-time replacements. Name and base path are always defined;
  // project id and cloud base URL only when they are configured.
  const define: Record<string, string> = {
    // Primary define uses NAME
    "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME":
      JSON.stringify(deploymentName),
    "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH":
      JSON.stringify(basePath),
  };
  if (projectId) {
    define["import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID"] =
      JSON.stringify(projectId);
  }
  if (baseUrl) {
    define["import.meta.env.VITE_LLAMA_CLOUD_BASE_URL"] =
      JSON.stringify(baseUrl);
  }

  return {
    plugins: [react()],
    resolve: {
      alias: {
        "@": path.resolve(__dirname, "./src"),
      },
    },
    server: {
      port,
      host: true,
    },
    build: {
      outDir: "dist",
      sourcemap: true,
    },
    base: basePath,
    define,
  };
});
+2
View File
@@ -0,0 +1,2 @@
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
{{ _copier_answers|to_nice_yaml -}}