mirror of
https://github.com/run-llama/template-workflow-classify-extract-sec.git
synced 2026-06-30 21:57:55 -04:00
Add back classify-extract-sec
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
|
||||
_commit: v0.2.1
|
||||
_src_path: https://github.com/run-llama/template-workflow-data-extraction
|
||||
@@ -0,0 +1,2 @@
|
||||
# copy to .env and place any needed secrets here. LLAMA_CLOUD_API_KEY will be automatically set
|
||||
# OPENAI_API_KEY=sk-xxx
|
||||
@@ -0,0 +1,7 @@
|
||||
.env
|
||||
__pycache__
|
||||
workflows.db
|
||||
|
||||
.venv
|
||||
package-lock.json
|
||||
node_modules
|
||||
@@ -1,2 +1,67 @@
|
||||
# template-workflow-classify-extract-sec
|
||||
Llama Index Workflow Template
|
||||
# SEC Filing Data Extraction and Analysis
|
||||
|
||||
A LlamaAgents application for extracting structured information from SEC filings using LlamaClassify and LlamaExtract. This application automatically classifies SEC documents (10-K, 10-Q, 8-K, or other) and extracts relevant financial and business information tailored to each filing type.
|
||||
|
||||
## Features
|
||||
|
||||
- **Intelligent Classification**: Uses LlamaClassify to automatically identify SEC filing types (10-K, 10-Q, 8-K, other)
|
||||
- **Dynamic Schema Selection**: Applies specialized extraction schemas based on document type
|
||||
- **Comprehensive Data Extraction**: Extracts filing-specific information:
|
||||
- **10-K**: Annual reports with financial metrics, risk factors, business descriptions, executive information
|
||||
- **10-Q**: Quarterly reports with period-over-period comparisons and updates
|
||||
- **8-K**: Current reports with material event information and impact analysis
|
||||
- **Other**: Catch-all for S-1, DEF 14A, 13F, and other filing types
|
||||
- **Agent Data Storage**: Stores extracted data in LlamaCloud Agent Data for easy querying and analysis
|
||||
- **UI Integration**: Web interface for reviewing and managing extracted data
|
||||
|
||||
## Configuration
|
||||
|
||||
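All main configuration is in `src/extraction_review/config.py`: the extraction agent name, the Agent Data collection name, the per-filing-type extraction schemas, and the extraction settings.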
|
||||
|
||||
## How It Works
|
||||
|
||||
The application uses a multi-step workflow powered by LlamaIndex:
|
||||
|
||||
1. **File Upload**: User uploads an SEC filing document through the UI
|
||||
2. **Download**: File is downloaded from LlamaCloud storage
|
||||
3. **Classification**: LlamaClassify analyzes the first 5 pages to determine filing type (10-K, 10-Q, 8-K, or other)
|
||||
4. **Schema Selection**: Appropriate extraction schema is selected based on classification
|
||||
5. **Extraction**: LlamaExtract processes the document using the selected schema
|
||||
6. **Storage**: Extracted data is stored in Agent Data with deduplication by file hash
|
||||
7. **Review**: UI displays extracted data for review and editing
|
||||
|
||||
### Workflows
|
||||
|
||||
The application includes two main workflows:
|
||||
|
||||
- **`process-file`** (`src/extraction_review/process_file.py`): Main workflow for processing SEC filings
|
||||
- Steps: download → classify → extract → store
|
||||
- Uses typed context to pass state between steps
|
||||
- Streams progress updates to UI via `UIToast` events
|
||||
|
||||
- **`metadata`** (`src/extraction_review/metadata_workflow.py`): Exposes configuration metadata to UI
|
||||
- Returns the JSON schema for each filing type and the collection name for dynamic UI generation
|
||||
|
||||
## Linting and Type Checking
|
||||
|
||||
The Python and JavaScript packages contain helpful scripts to lint, format, and type check the code.
|
||||
|
||||
To check and fix Python code:
|
||||
|
||||
```bash
|
||||
uv run hatch run lint
|
||||
uv run hatch run typecheck
|
||||
uv run hatch run test
|
||||
# run all at once
|
||||
uv run hatch run all-fix
|
||||
```
|
||||
|
||||
To check and fix JavaScript code, within the `ui` directory:
|
||||
|
||||
```bash
|
||||
pnpm run lint
|
||||
pnpm run typecheck
|
||||
pnpm run test
|
||||
# run all at once
|
||||
pnpm run all-fix
|
||||
```
|
||||
@@ -0,0 +1,51 @@
|
||||
[project]
|
||||
name = "extraction-review"
|
||||
version = "0.1.0"
|
||||
description = "Extracts data"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"llama-cloud-services>=0.6.69",
|
||||
"llama-index-workflows>=2.2.0,<3.0.0",
|
||||
"python-dotenv>=1.1.0",
|
||||
"jsonref>=1.1.0",
|
||||
"click>=8.2.1,<8.3.0",
|
||||
"httpx>=0.28.1",
|
||||
"llama-index-core>=0.14.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ruff>=0.11.10",
|
||||
"typescript>=0.0.12",
|
||||
"ty>=0.0.1a16",
|
||||
"pytest>=8.4.1",
|
||||
"hatch>=1.14.1",
|
||||
"llamactl>=0.3.0"
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.envs.default.scripts]
|
||||
"format" = "ruff format ."
|
||||
"format-check" = "ruff format --check ."
|
||||
"lint" = "ruff check --fix ."
|
||||
"lint-check" = ["ruff check ."]
|
||||
typecheck = "ty check src"
|
||||
test = "pytest"
|
||||
"all-check" = ["format-check", "lint-check", "test"]
|
||||
"all-fix" = ["format", "lint", "test"]
|
||||
|
||||
[tool.llamadeploy]
|
||||
env_files = [".env"]
|
||||
llama_cloud = true
|
||||
|
||||
[tool.llamadeploy.workflows]
|
||||
process-file = "extraction_review.process_file:workflow"
|
||||
metadata = "extraction_review.metadata_workflow:workflow"
|
||||
|
||||
[tool.llamadeploy.ui]
|
||||
directory = "ui"
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
import functools
|
||||
import os
|
||||
from typing import Any
|
||||
import httpx
|
||||
|
||||
from llama_cloud_services import ExtractionAgent, LlamaExtract
|
||||
from llama_cloud.core.api_error import ApiError
|
||||
from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData
|
||||
from llama_cloud_services.beta.classifier.client import ClassifyClient
|
||||
from llama_cloud.client import AsyncLlamaCloud
|
||||
import logging
|
||||
|
||||
from extraction_review.config import (
|
||||
EXTRACT_CONFIG,
|
||||
EXTRACTED_DATA_COLLECTION,
|
||||
EXTRACTION_AGENT_NAME,
|
||||
USE_REMOTE_EXTRACTION_SCHEMA,
|
||||
ExtractionSchema,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)

# Deployed agents may infer their name from the deployment name; this is None
# when LLAMA_DEPLOY_DEPLOYMENT_NAME is not set.
# Note: make sure that an agent deployment with this name actually exists,
# otherwise calls to get or set data will fail. For local development you may
# need to append an `or "<name>"` fallback to this lookup.
agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME")
# Required for all llama cloud calls. Indexing os.environ means a missing
# variable raises KeyError as soon as this module is imported.
api_key = os.environ["LLAMA_CLOUD_API_KEY"]
# Get this in case running against a different environment than production.
base_url = os.getenv("LLAMA_CLOUD_BASE_URL")
# Optional project id; None when LLAMA_DEPLOY_PROJECT_ID is not set.
project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID")
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
def get_extract_agent() -> ExtractionAgent:
    """Return the extraction agent named ``EXTRACTION_AGENT_NAME``, creating it if missing.

    The result is memoised by ``lru_cache``, so every caller receives the same
    agent object.

    Returns:
        The existing agent (with the local schema and config applied unless
        ``USE_REMOTE_EXTRACTION_SCHEMA`` is set), or a newly created agent built
        from the local ``ExtractionSchema`` and ``EXTRACT_CONFIG``.

    Raises:
        ApiError: Any API error other than a 404 is propagated unchanged.
    """
    llama_extract = LlamaExtract(
        api_key=api_key,
        base_url=base_url,
        project_id=project_id,
    )

    try:
        agent = llama_extract.get_agent(EXTRACTION_AGENT_NAME)
        if not USE_REMOTE_EXTRACTION_SCHEMA:
            # Local definitions take precedence over whatever the remote agent stores.
            agent.data_schema = ExtractionSchema
            agent.config = EXTRACT_CONFIG
        return agent
    except ApiError as api_error:
        # Only "not found" is recoverable; anything else goes back to the caller.
        if api_error.status_code != 404:
            raise
        if USE_REMOTE_EXTRACTION_SCHEMA:
            logger.warning(
                "Extraction agent does not exist, creating a new one from the local schema"
            )
        return llama_extract.create_agent(
            name=EXTRACTION_AGENT_NAME,
            data_schema=ExtractionSchema,
            config=EXTRACT_CONFIG,
        )
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
def get_data_client() -> AsyncAgentDataClient:
    """Return the memoised Agent Data client for the extracted-data collection.

    The client is bound to the module-level ``agent_name`` and to
    ``EXTRACTED_DATA_COLLECTION``, and reuses the shared LlamaCloud client.
    """
    client_options = dict(
        deployment_name=agent_name,
        collection=EXTRACTED_DATA_COLLECTION,
        type=ExtractedData[Any],
        client=get_llama_cloud_client(),
    )
    return AsyncAgentDataClient(**client_options)
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
def get_llama_cloud_client():
    """Return the memoised ``AsyncLlamaCloud`` client used by the other helpers.

    The underlying HTTP client uses a 60 second timeout and sends a
    ``Project-Id`` header only when ``project_id`` is set.
    """
    project_headers = {"Project-Id": project_id} if project_id else None
    http_client = httpx.AsyncClient(timeout=60, headers=project_headers)
    return AsyncLlamaCloud(
        base_url=base_url,
        token=api_key,
        httpx_client=http_client,
    )
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
def get_classifier_client():
    """Return the memoised ``ClassifyClient`` built on the shared LlamaCloud client."""
    cloud_client = get_llama_cloud_client()
    return ClassifyClient(client=cloud_client, project_id=project_id)
|
||||
@@ -0,0 +1,362 @@
|
||||
"""
|
||||
For simple configuration of the extraction review application, just customize this file.
|
||||
|
||||
If you need more control, feel free to edit the rest of the application
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import os
|
||||
from typing import Type
|
||||
|
||||
from llama_cloud import ExtractConfig
|
||||
from llama_cloud_services.extract import ExtractMode
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# If you change this to True, the schema and extraction configuration will be fetched from the remote extraction agent
# rather than using the ExtractionSchema and configuration defined below.
USE_REMOTE_EXTRACTION_SCHEMA: bool = False
# The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments,
# and falls back to "extraction-review" when LLAMA_DEPLOY_DEPLOYMENT_NAME is unset or empty.
# Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist.
EXTRACTION_AGENT_NAME: str = (
    os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "extraction-review"
)
# The name of the collection to use for storing extracted data. This will be qualified by the agent name.
# When developing locally, this will use the _public collection (shared within the project), otherwise agent
# data is isolated to each agent
EXTRACTED_DATA_COLLECTION: str = "sec-filing-extraction"


# SEC Filing Classification Types. These labels are the same strings used as
# keys of FILING_SCHEMAS further down in this module.
SEC_FILING_TYPES = ["10-K", "10-Q", "8-K", "other"]
|
||||
|
||||
|
||||
# Base class for common fields across all SEC filings
|
||||
class BaseSECFiling(BaseModel):
    """Common fields present in all SEC filings"""

    # Only company_name is required; every other field defaults to None.
    company_name: str = Field(
        description="The full legal name of the company filing the document"
    )
    ticker_symbol: str | None = Field(
        default=None,
        description="The stock ticker symbol of the company. May not be present for all filings.",
    )
    # Typed as a string rather than an integer.
    cik: str | None = Field(
        default=None,
        description="Central Index Key - the unique identifier assigned by the SEC to the company",
    )
    # Dates are plain strings: the YYYY-MM-DD format is requested through the
    # field description only and is not validated by this model.
    filing_date: str | None = Field(
        default=None,
        description="The date the document was filed with the SEC (format: YYYY-MM-DD)",
    )
    fiscal_year_end: str | None = Field(
        default=None,
        description="The fiscal year end date for the company (format: YYYY-MM-DD)",
    )
    sic_code: str | None = Field(
        default=None,
        description="Standard Industrial Classification code for the company's industry",
    )
|
||||
|
||||
|
||||
# Financial metrics that appear in multiple filing types
|
||||
class FinancialMetrics(BaseModel):
    """Key financial metrics extracted from statements"""

    # Every metric is an optional free-form string (e.g. '$1.2B USD'), not a
    # numeric type, so no parsing or unit normalisation happens in this model.
    total_revenue: str | None = Field(
        default=None,
        description="Total revenue/sales for the period. Include currency and amount (e.g., '$1.2B USD')",
    )
    net_income: str | None = Field(
        default=None,
        description="Net income/profit for the period. Include currency and amount",
    )
    total_assets: str | None = Field(
        default=None,
        description="Total assets as of the balance sheet date. Include currency and amount",
    )
    total_liabilities: str | None = Field(
        default=None,
        description="Total liabilities as of the balance sheet date. Include currency and amount",
    )
    stockholders_equity: str | None = Field(
        default=None,
        description="Total stockholders' equity. Include currency and amount",
    )
    cash_and_equivalents: str | None = Field(
        default=None,
        description="Cash and cash equivalents. Include currency and amount",
    )
    earnings_per_share: str | None = Field(
        default=None, description="Earnings per share (EPS) for the period"
    )
|
||||
|
||||
|
||||
# Risk factor for use in 10-K and 10-Q
|
||||
class RiskFactor(BaseModel):
    """Individual risk factor identified in the filing"""

    # Both fields are required (no defaults). category is a free-form string,
    # not an enum; the examples in its description are suggestions only.
    category: str = Field(
        description="Category of risk (e.g., 'Market Risk', 'Operational Risk', 'Legal Risk')"
    )
    description: str = Field(description="Brief description of the specific risk")
|
||||
|
||||
|
||||
# 10-K: Annual Report
|
||||
class Filing10K(BaseSECFiling):
    """
    Form 10-K is an annual report required by the SEC that provides a comprehensive
    summary of a company's financial performance.
    """

    # Defaults to "10-K"; typed as a plain str, so other values are not rejected.
    document_type: str = Field(default="10-K", description="Should always be '10-K'")
    fiscal_year: int | None = Field(
        default=None,
        description="The fiscal year covered by this annual report (e.g., 2023)",
    )

    # Business overview
    business_description: str | None = Field(
        default=None,
        description="A 2-3 sentence summary of the company's business and operations",
    )

    # Financial data
    financial_metrics: FinancialMetrics | None = Field(
        default=None, description="Key financial metrics from the annual statements"
    )

    # Risk factors (the 3-5 count is guidance in the description, not enforced)
    risk_factors: list[RiskFactor] | None = Field(
        default=None,
        description="List of material risk factors disclosed in the filing. Extract 3-5 most significant risks.",
    )

    # Management discussion
    management_discussion_summary: str | None = Field(
        default=None,
        description="2-3 sentence summary of Management's Discussion and Analysis (MD&A) section",
    )

    # Legal proceedings
    legal_proceedings: list[str] | None = Field(
        default=None,
        description="List of significant legal proceedings or litigation mentioned",
    )

    # Executive officers (one free-form string per officer)
    executive_officers: list[str] | None = Field(
        default=None,
        description="Names and titles of key executive officers (CEO, CFO, etc.)",
    )

    # Auditor information
    auditor_name: str | None = Field(
        default=None,
        description="Name of the independent registered public accounting firm",
    )

    # Key insights
    key_highlights: list[str] | None = Field(
        default=None,
        description="3-5 key highlights or notable items from the annual report",
    )
|
||||
|
||||
|
||||
# 10-Q: Quarterly Report
|
||||
class Filing10Q(BaseSECFiling):
    """
    Form 10-Q is a quarterly report that provides a continuing view of a company's
    financial position during the year.
    """

    # Defaults to "10-Q"; typed as a plain str, so other values are not rejected.
    document_type: str = Field(default="10-Q", description="Should always be '10-Q'")
    # Free-form quarter label; fiscal_year below carries the year as an int.
    fiscal_quarter: str | None = Field(
        default=None,
        description="The fiscal quarter covered (e.g., 'Q1 2024', 'Q2 2023')",
    )
    fiscal_year: int | None = Field(
        default=None, description="The fiscal year for this quarter (e.g., 2024)"
    )
    period_end_date: str | None = Field(
        default=None,
        description="The end date of the quarterly period (format: YYYY-MM-DD)",
    )

    # Financial data
    financial_metrics: FinancialMetrics | None = Field(
        default=None, description="Key financial metrics from the quarterly statements"
    )

    # Comparison to prior periods (free-form strings such as 'up 15%', not numbers)
    year_over_year_revenue_change: str | None = Field(
        default=None,
        description="Year-over-year revenue change percentage or description (e.g., 'up 15%')",
    )
    quarter_over_quarter_revenue_change: str | None = Field(
        default=None,
        description="Quarter-over-quarter revenue change percentage or description",
    )

    # Management discussion
    management_discussion_summary: str | None = Field(
        default=None,
        description="2-3 sentence summary of Management's Discussion and Analysis for the quarter",
    )

    # Risk factors (a single summary string, unlike the list[RiskFactor] used by Filing10K)
    material_changes_to_risks: str | None = Field(
        default=None,
        description="Summary of any material changes to risk factors since the last 10-K",
    )

    # Legal updates
    legal_proceedings_updates: list[str] | None = Field(
        default=None,
        description="Updates to legal proceedings or new litigation since last filing",
    )

    # Key insights
    key_highlights: list[str] | None = Field(
        default=None,
        description="3-5 key highlights or notable items from the quarterly report",
    )
|
||||
|
||||
|
||||
# 8-K: Current Report
|
||||
class Filing8K(BaseSECFiling):
    """
    Form 8-K is a current report used to notify investors of significant events
    that shareholders should know about.
    """

    # Defaults to "8-K"; typed as a plain str, so other values are not rejected.
    document_type: str = Field(default="8-K", description="Should always be '8-K'")

    # Event information
    event_date: str | None = Field(
        default=None,
        description="The date of the event being reported (format: YYYY-MM-DD)",
    )
    event_type: str | None = Field(
        default=None,
        description="Type of event (e.g., 'Merger/Acquisition', 'Leadership Change', 'Earnings Release', 'Material Agreement')",
    )
    item_numbers: list[str] | None = Field(
        default=None,
        description="Item numbers from the 8-K form (e.g., ['1.01', '5.02']) indicating which sections are included",
    )

    # Event description
    # event_summary has no default, so it is required in addition to the
    # inherited company_name.
    event_summary: str = Field(
        description="2-4 sentence summary describing the material event being reported"
    )
    event_details: str | None = Field(
        default=None,
        description="More detailed description of the event and its implications",
    )

    # Financial impact
    estimated_financial_impact: str | None = Field(
        default=None,
        description="Estimated financial impact of the event, if disclosed",
    )

    # Related parties
    related_parties: list[str] | None = Field(
        default=None,
        description="Names of other companies, individuals, or entities involved in the event",
    )

    # Exhibits filed
    material_exhibits: list[str] | None = Field(
        default=None,
        description="Description of significant exhibits filed with the 8-K (e.g., 'Press Release', 'Material Agreement')",
    )

    # Forward-looking statements (tri-state: True, False, or None when not determined)
    contains_forward_looking_statements: bool | None = Field(
        default=None,
        description="Whether the filing contains forward-looking statements",
    )

    # Key takeaways
    investment_implications: str | None = Field(
        default=None,
        description="1-2 sentence assessment of potential implications for investors",
    )
|
||||
|
||||
|
||||
# Other filings catch-all
|
||||
class FilingOther(BaseSECFiling):
    """
    Catch-all schema for other SEC filing types (e.g., S-1, DEF 14A, 13F, etc.)
    """

    # Unlike the 10-K / 10-Q / 8-K schemas, document_type has no default here,
    # so it must be supplied.
    document_type: str = Field(
        description="The type of SEC filing (e.g., 'S-1', 'DEF 14A', '13F', 'SC 13D')"
    )

    filing_purpose: str | None = Field(
        default=None,
        description="The purpose of this filing type (e.g., 'IPO Registration', 'Proxy Statement', 'Insider Holdings')",
    )

    # Required: no default is given.
    summary: str = Field(
        description="3-4 sentence summary of the filing's key content and purpose"
    )

    key_information: list[str] | None = Field(
        default=None,
        description="List of 3-7 key pieces of information from the filing",
    )

    # Named financial_data here, whereas Filing10K and Filing10Q call the
    # equivalent field financial_metrics.
    financial_data: FinancialMetrics | None = Field(
        default=None, description="Any relevant financial metrics present in the filing"
    )

    material_events: list[str] | None = Field(
        default=None,
        description="List of any material events or transactions described",
    )

    parties_involved: list[str] | None = Field(
        default=None,
        description="Other parties mentioned (companies, executives, investors, etc.)",
    )

    investment_relevance: str | None = Field(
        default=None,
        description="Brief note on why this filing might be relevant for investment analysis",
    )
|
||||
|
||||
|
||||
# Default schema for backward compatibility - now uses 10-K as the base
|
||||
class ExtractionSchema(Filing10K):
    """Default extraction schema - uses 10-K structure for backward compatibility"""

    # Intentionally empty: every field is inherited from Filing10K.
|
||||
|
||||
|
||||
# Mapping of filing types to their schemas. The keys are the same labels
# listed in SEC_FILING_TYPES.
FILING_SCHEMAS: dict[str, Type[BaseModel]] = {
    "10-K": Filing10K,
    "10-Q": Filing10Q,
    "8-K": Filing8K,
    "other": FilingOther,
}


# This is only used if USE_REMOTE_EXTRACTION_SCHEMA is False.
EXTRACT_CONFIG = ExtractConfig(
    extraction_mode=ExtractMode.PREMIUM,
    system_prompt=None,
    # advanced. Only compatible with Premium mode.
    use_reasoning=False,
    cite_sources=False,
    confidence_scores=True,
)


# The locally defined default schema, or None when the schema is expected to
# come from the remote extraction agent instead.
SCHEMA: Type[BaseModel] | None = (
    None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema
)
|
||||
@@ -0,0 +1,36 @@
|
||||
from typing import Any
|
||||
from workflows import Workflow, step
|
||||
from workflows.events import StartEvent, StopEvent
|
||||
|
||||
import jsonref
|
||||
|
||||
from .config import EXTRACTED_DATA_COLLECTION, FILING_SCHEMAS
|
||||
|
||||
|
||||
# Stop event returned by MetadataWorkflow.
class MetadataResponse(StopEvent):
    # Filing type label -> JSON schema dict for that filing type.
    schemas: dict[str, dict[str, Any]]
    # Name of the collection that holds the extracted data.
    extracted_data_collection: str
|
||||
|
||||
|
||||
class MetadataWorkflow(Workflow):
    """
    Simple single step workflow to expose configuration to the UI, such as all JSON schemas and collection name.
    """

    @step
    async def get_metadata(self, _: StartEvent) -> MetadataResponse:
        """Return one JSON schema per filing type, with ``$ref`` entries resolved, plus the collection name."""
        resolved_schemas = {
            filing_type: jsonref.replace_refs(
                model_cls.model_json_schema(), proxies=False
            )
            for filing_type, model_cls in FILING_SCHEMAS.items()
        }
        return MetadataResponse(
            schemas=resolved_schemas,
            extracted_data_collection=EXTRACTED_DATA_COLLECTION,
        )
|
||||
|
||||
|
||||
# Module-level instance referenced from pyproject.toml as
# "extraction_review.metadata_workflow:workflow"; created with timeout=None.
workflow = MetadataWorkflow(timeout=None)
|
||||
@@ -0,0 +1,401 @@
|
||||
import asyncio
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
from typing import Any, Literal
|
||||
|
||||
import httpx
|
||||
from llama_cloud import ClassificationResult, ExtractRun
|
||||
from llama_cloud.types import ClassifierRule, ClassifyParsingConfiguration
|
||||
from llama_cloud_services.extract import SourceText
|
||||
from llama_cloud_services.beta.agent_data import ExtractedData, InvalidExtractionData
|
||||
from pydantic import BaseModel
|
||||
from workflows import Context, Workflow, step
|
||||
from workflows.events import Event, StartEvent, StopEvent
|
||||
|
||||
from .clients import (
|
||||
get_classifier_client,
|
||||
get_llama_cloud_client,
|
||||
get_data_client,
|
||||
get_extract_agent,
|
||||
)
|
||||
from .config import FILING_SCHEMAS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Start event for the process-file workflow.
class FileEvent(StartEvent):
    # Id of the uploaded file; later passed to the LlamaCloud files API.
    file_id: str
|
||||
|
||||
|
||||
# Payload-free signal emitted by run_file to trigger the download step.
class DownloadFileEvent(Event):
    pass
|
||||
|
||||
|
||||
# Payload-free event. NOTE(review): no step in the visible part of this module
# emits or consumes it (download_file returns ClassifyFileEvent) — confirm
# whether it is still needed.
class FileDownloadedEvent(Event):
    pass
|
||||
|
||||
|
||||
# Payload-free signal emitted by download_file to trigger classification.
class ClassifyFileEvent(Event):
    pass
|
||||
|
||||
|
||||
# Result of the classification step; consumed by the extraction step.
class FileClassifiedEvent(Event):
    # Filing type label; the fallback paths in classify_file use "other".
    filing_type: str
    # Left as None on the fallback paths, which only set filing_type.
    confidence: float | None = None
    reasoning: str | None = None
|
||||
|
||||
|
||||
# Progress or status notification written to the workflow event stream via
# ctx.write_event_to_stream.
class UIToast(Event):
    # Severity of the notification.
    level: Literal["info", "warning", "error"]
    # Text of the notification.
    message: str
|
||||
|
||||
|
||||
# One of the two events the extraction step may return (see process_file's
# return annotation); carries the extracted data.
class ExtractedEvent(Event):
    data: ExtractedData
|
||||
|
||||
|
||||
# The other event the extraction step may return; the payload is typed as a
# raw dict rather than a schema model. NOTE(review): presumably used when the
# result fails schema validation — confirm against process_file.
class ExtractedInvalidEvent(Event):
    data: ExtractedData[dict[str, Any]]
|
||||
|
||||
|
||||
# Typed workflow state shared between the steps of ProcessFileWorkflow.
# Every field starts as None and is filled in as the steps run.
class ExtractionState(BaseModel):
    # Set by run_file from the start event.
    file_id: str | None = None
    # Local path and original name of the downloaded file; set by download_file.
    file_path: str | None = None
    filename: str | None = None
    # Classification outcome; set by classify_file ("other" on fallback).
    filing_type: str | None = None
    classification_confidence: float | None = None
    classification_reasoning: str | None = None
|
||||
|
||||
|
||||
class ProcessFileWorkflow(Workflow):
|
||||
"""
|
||||
Given a file path, this workflow will process a single file through the custom extraction logic.
|
||||
"""
|
||||
|
||||
@step()
async def run_file(
    self, event: FileEvent, ctx: Context[ExtractionState]
) -> DownloadFileEvent:
    """Entry step: record the incoming file id in the workflow state.

    Args:
        event: Start event carrying the id of the file to process.
        ctx: Workflow context; annotated with ``ExtractionState`` like every
            other step so the state type is declared consistently.

    Returns:
        DownloadFileEvent, which triggers the download step.
    """
    logger.info(f"Running file {event.file_id}")
    async with ctx.store.edit_state() as state:
        state.file_id = event.file_id
    return DownloadFileEvent()
|
||||
|
||||
@step()
async def download_file(
    self, event: DownloadFileEvent, ctx: Context[ExtractionState]
) -> ClassifyFileEvent:
    """Download the file referenced by the state's ``file_id`` into the temp directory.

    Looks up the file's metadata and a download URL through the LlamaCloud
    client, streams the content to disk, and stores the local path together
    with the original file name in the workflow state.

    Args:
        event: Trigger event; carries no data.
        ctx: Workflow context whose state must already contain ``file_id``.

    Returns:
        ClassifyFileEvent once the file has been written to disk.

    Raises:
        ValueError: If ``file_id`` has not been set in the state.
        Exception: Any failure while fetching metadata or downloading
            (including a non-success HTTP status) is logged, reported to the
            UI as an error toast, and re-raised.
    """
    state = await ctx.store.get_state()
    if state.file_id is None:
        raise ValueError("File ID is not set")
    try:
        cloud_client = get_llama_cloud_client()
        file_metadata = await cloud_client.files.get_file(id=state.file_id)
        file_url = await cloud_client.files.read_file_content(state.file_id)

        filename = file_metadata.name
        # The name comes from remote metadata. Keep only its final path
        # component so the download cannot land outside the temp directory,
        # and fall back to the file id if nothing usable remains.
        local_name = Path(filename).name or state.file_id
        file_path = os.path.join(tempfile.gettempdir(), local_name)
        logger.info(f"Downloading file {file_url.url} to {file_path}")

        # Context managers guarantee the HTTP client and the response are
        # closed even when the download fails part-way through.
        async with httpx.AsyncClient() as client:
            async with client.stream("GET", file_url.url) as response:
                # Do not save an HTTP error body as if it were the document.
                response.raise_for_status()
                with open(file_path, "wb") as f:
                    async for chunk in response.aiter_bytes():
                        f.write(chunk)
        logger.info(f"Downloaded file {file_url.url} to {file_path}")
        async with ctx.store.edit_state() as editable_state:
            editable_state.file_path = file_path
            # The state keeps the original name, not the sanitised local one.
            editable_state.filename = filename
        return ClassifyFileEvent()

    except Exception as e:
        logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True)
        ctx.write_event_to_stream(
            UIToast(
                level="error",
                message=f"Error downloading file {state.file_id}: {e}",
            )
        )
        # Bare raise re-raises the active exception with its traceback intact.
        raise
|
||||
|
||||
@step()
async def classify_file(
    self, event: ClassifyFileEvent, ctx: Context[ExtractionState]
) -> FileClassifiedEvent:
    """Determine which SEC filing type the downloaded document is.

    Sends the local file to the classifier with one rule per supported filing
    type, parsing at most the first 5 pages. The outcome is stored in the
    workflow state and returned. When the classifier yields no usable result,
    or any exception occurs, the filing type falls back to "other" and the
    workflow continues.

    Args:
        event: Trigger event; carries no data.
        ctx: Workflow context whose state must contain ``file_path`` and ``filename``.

    Returns:
        FileClassifiedEvent with the detected type (plus confidence and
        reasoning when available), or with ``filing_type="other"`` on fallback.

    Raises:
        ValueError: If the file path or file name is missing from the state.
    """
    snapshot = await ctx.store.get_state()
    if snapshot.file_path is None or snapshot.filename is None:
        raise ValueError("File path or filename is not set")

    async def _default_to_other() -> FileClassifiedEvent:
        # Shared fallback: record "other" in the state and report it downstream.
        async with ctx.store.edit_state() as editable:
            editable.filing_type = "other"
        return FileClassifiedEvent(filing_type="other")

    try:
        logger.info(f"Classifying file {snapshot.filename}")
        ctx.write_event_to_stream(
            UIToast(level="info", message=f"Classifying file {snapshot.filename}")
        )

        classifier = get_classifier_client()

        # (label, description) pairs, one per SEC filing type the classifier may choose.
        rule_specs = [
            (
                "10-K",
                "Form 10-K is an annual report filed by public companies with the SEC. "
                "It provides a comprehensive summary of a company's financial performance for the year, "
                "including audited financial statements, management's discussion and analysis (MD&A), "
                "risk factors, business description, and executive compensation. "
                "Look for: 'Form 10-K', 'Annual Report', fiscal year references, audited financials.",
            ),
            (
                "10-Q",
                "Form 10-Q is a quarterly report filed by public companies with the SEC. "
                "It provides unaudited financial statements and management discussion for a specific quarter. "
                "Contains quarterly financial data, updates on business operations, and material changes. "
                "Look for: 'Form 10-Q', 'Quarterly Report', quarter references (Q1, Q2, Q3), unaudited statements.",
            ),
            (
                "8-K",
                "Form 8-K is a current report filed to announce material events or corporate changes. "
                "Used to notify investors of significant events like mergers, acquisitions, leadership changes, "
                "earnings releases, or other material corporate events that shareholders should know about. "
                "Look for: 'Form 8-K', 'Current Report', Item numbers (e.g., Item 1.01, Item 5.02), event dates, "
                "specific triggering events.",
            ),
            (
                "other",
                "Any other SEC filing type not covered by 10-K, 10-Q, or 8-K. "
                "This includes forms such as S-1 (IPO registration), DEF 14A (proxy statement), "
                "13F (institutional holdings), SC 13D (beneficial ownership), and other SEC forms.",
            ),
        ]
        rules = [
            ClassifierRule(type=label, description=text)
            for label, text in rule_specs
        ]

        # Only the first 5 pages are parsed, for faster classification.
        parsing_config = ClassifyParsingConfiguration(max_pages=5)

        results = await classifier.aclassify_file_paths(
            rules=rules,
            file_input_paths=[snapshot.file_path],
            parsing_configuration=parsing_config,
        )

        if not results.items:
            # Nothing came back at all: fall back without a UI toast.
            logger.warning(f"No classification results for {snapshot.filename}")
            return await _default_to_other()

        top_result: ClassificationResult | None = results.items[0].result
        if not top_result:
            # An item came back but carries no result.
            logger.warning(
                f"Classification failed for {snapshot.filename}, defaulting to 'other'"
            )
            ctx.write_event_to_stream(
                UIToast(
                    level="warning",
                    message="Classification uncertain, using default schema",
                )
            )
            return await _default_to_other()

        filing_type = top_result.type
        confidence = top_result.confidence
        reasoning = top_result.reasoning

        logger.info(
            f"Classified {snapshot.filename} as {filing_type} "
            f"(confidence: {confidence}, reasoning: {reasoning})"
        )
        ctx.write_event_to_stream(
            UIToast(
                level="info",
                message=f"Classified as {filing_type} SEC filing",
            )
        )

        async with ctx.store.edit_state() as editable:
            editable.filing_type = filing_type
            editable.classification_confidence = confidence
            editable.classification_reasoning = reasoning

        return FileClassifiedEvent(
            filing_type=filing_type,
            confidence=confidence,
            reasoning=reasoning,
        )

    except Exception as e:
        logger.error(f"Error classifying file {snapshot.filename}: {e}", exc_info=True)
        ctx.write_event_to_stream(
            UIToast(
                level="warning",
                message=f"Classification failed, using default schema: {e}",
            )
        )
        # Classification is best-effort: on error, default to "other" and continue.
        return await _default_to_other()
|
||||
|
||||
@step()
async def process_file(
    self, event: FileClassifiedEvent, ctx: Context[ExtractionState]
) -> ExtractedEvent | ExtractedInvalidEvent:
    """Run the extraction against the file using the schema for its filing type.

    Reads ``file_path``, ``filename``, ``filing_type`` and the classification
    confidence/reasoning from the workflow state, extracts the file with the
    extraction agent, and attaches the classification details to the
    extracted data's metadata.

    Returns:
        ExtractedEvent with the validated data, or ExtractedInvalidEvent
        carrying the invalid item when validation raises
        InvalidExtractionData.

    Raises:
        ValueError: if the file path or filename is missing from the state.
        Exception: any other extraction failure is logged, surfaced as an
            error toast and re-raised unchanged.
    """
    state = await ctx.store.get_state()
    if state.file_path is None or state.filename is None:
        raise ValueError("File path or filename is not set")
    try:
        # Get the appropriate schema based on classification. Unknown (or
        # missing) filing types fall back to the "other" schema.
        filing_type = (state.filing_type or "other").upper()
        schema = FILING_SCHEMAS.get(filing_type, FILING_SCHEMAS["other"])

        logger.info(f"Using schema for filing type: {filing_type}")
        ctx.write_event_to_stream(
            UIToast(
                level="info",
                message=f"Extracting data using {filing_type} schema",
            )
        )

        agent = get_extract_agent()
        # Update the agent's data schema for this specific filing type
        # NOTE(review): if get_extract_agent() returns a shared instance,
        # concurrent runs with different filing types could overwrite each
        # other's schema here - confirm.
        agent.data_schema = schema
        # track the content of the file, so as to be able to de-duplicate
        file_content = Path(state.file_path).read_bytes()
        file_hash = hashlib.sha256(file_content).hexdigest()
        source_text = SourceText(
            file=state.file_path,
            filename=state.filename,
        )
        logger.info(f"Extracting data from file {state.filename}")
        ctx.write_event_to_stream(
            UIToast(
                level="info", message=f"Extracting data from file {state.filename}"
            )
        )
        extracted_result: ExtractRun = await agent.aextract(source_text)
        logger.info(f"Extracted data: {extracted_result}")
        try:
            data = ExtractedData.from_extraction_result(
                result=extracted_result,
                schema=schema,
                file_hash=file_hash,
            )
        except InvalidExtractionData as e:
            logger.error(f"Error validating extracted data: {e}", exc_info=True)
            return ExtractedInvalidEvent(data=e.invalid_item)

        # Add classification information to the extracted data
        if data.metadata is None:
            data.metadata = {}
        data.metadata["classification"] = filing_type
        data.metadata["classification_confidence"] = (
            state.classification_confidence
        )
        data.metadata["classification_reasoning"] = state.classification_reasoning
        return ExtractedEvent(data=data)
    except Exception as e:
        logger.error(
            f"Error extracting data from file {state.filename}: {e}",
            exc_info=True,
        )
        ctx.write_event_to_stream(
            UIToast(
                level="error",
                message=f"Error extracting data from file {state.filename}: {e}",
            )
        )
        # Bare raise keeps the original traceback intact.
        raise
|
||||
|
||||
@step()
async def record_extracted_data(
    self, event: ExtractedEvent | ExtractedInvalidEvent, ctx: Context
) -> StopEvent:
    """Record the extracted data through the agent data client.

    When the event's data carries a ``file_hash``, items already stored with
    the same hash are deleted first, so reprocessing a file replaces its
    previous record. The success log and toast are emitted only after the
    new item has been created.

    Returns:
        StopEvent whose result is the id of the newly created item.

    Raises:
        Exception: any failure is logged, surfaced as an error toast and
            re-raised unchanged.
    """
    try:
        # remove past data when reprocessing the same file
        if event.data.file_hash:
            existing_data = await get_data_client().untyped_search(
                filter={
                    "file_hash": {
                        "eq": event.data.file_hash,
                    },
                },
            )
            if existing_data.items:
                logger.info(
                    f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}"
                )
                await asyncio.gather(
                    *[
                        get_data_client().delete_item(item.id)
                        for item in existing_data.items
                    ]
                )
        # finally, save the new data
        item_id = await get_data_client().create_item(event.data)

        # Announce success only once the item really exists.
        logger.info(f"Recorded extracted data for file {event.data.file_name}")
        ctx.write_event_to_stream(
            UIToast(
                level="info",
                message=f"Recorded extracted data for file {event.data.file_name}",
            )
        )
        return StopEvent(
            result=item_id.id,
        )
    except Exception as e:
        logger.error(
            f"Error recording extracted data for file {event.data.file_name}: {e}",
            exc_info=True,
        )
        ctx.write_event_to_stream(
            UIToast(
                level="error",
                message=f"Error recording extracted data for file {event.data.file_name}: {e}",
            )
        )
        # Bare raise keeps the original traceback intact.
        raise
|
||||
|
||||
|
||||
workflow = ProcessFileWorkflow(timeout=None)
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: upload ./test.pdf and push it through the workflow.
    from dotenv import load_dotenv

    load_dotenv()
    logging.basicConfig(level=logging.INFO)

    async def main():
        # Close the local file handle once the upload call has returned.
        with Path("test.pdf").open("rb") as pdf:
            file = await get_llama_cloud_client().files.upload_file(
                upload_file=pdf
            )
        await workflow.run(start_event=FileEvent(file_id=file.id))

    asyncio.run(main())
|
||||
@@ -0,0 +1,57 @@
|
||||
"""
|
||||
Selects a locally defined schema, or queries the remote extraction agent for the schema.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import jsonref
|
||||
from .clients import get_extract_agent
|
||||
from .config import USE_REMOTE_EXTRACTION_SCHEMA, ExtractionSchema
|
||||
from typing import Any, Type
|
||||
from pydantic import BaseModel
|
||||
from pydantic import create_model, Field
|
||||
|
||||
|
||||
SCHEMA: Type[BaseModel] | None = (
|
||||
None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema
|
||||
)
|
||||
|
||||
|
||||
_schema_lock = asyncio.Lock()
|
||||
|
||||
|
||||
async def get_extraction_schema() -> Type[BaseModel]:
    """Return the extraction schema model, building it on first use.

    When the module-level ``SCHEMA`` is still unset, the model is built from
    the extraction agent's ``data_schema`` and stored in ``SCHEMA``. The build
    happens under ``_schema_lock``, and ``SCHEMA`` is checked again once the
    lock is held so that only the first caller performs the conversion.
    """
    global SCHEMA
    if SCHEMA is None:
        async with _schema_lock:
            # Another task may have filled SCHEMA while we waited for the lock.
            if SCHEMA is None:
                SCHEMA = model_from_schema(get_extract_agent().data_schema)
    return SCHEMA
|
||||
|
||||
|
||||
async def get_extraction_schema_json() -> dict[str, Any]:
    """Return the extraction schema as JSON schema with ``$ref`` entries replaced."""
    model = await get_extraction_schema()
    # proxies=False is passed through to jsonref exactly as before.
    return jsonref.replace_refs(model.model_json_schema(), proxies=False)
|
||||
|
||||
|
||||
def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]:
    """
    Converts a JSON schema back to a Pydantic model.

    Only the top-level ``properties`` are mapped. Each property's JSON type is
    translated through a fixed type map; unknown or missing types become
    ``Any``. Properties listed in ``required`` have no default, all others
    default to ``None``. The model is named after the schema's ``title``
    (``"DynamicModel"`` when absent).
    """
    typemap = {
        "string": str,
        "integer": int,
        "number": float,
        "boolean": bool,
        "array": list,
        "object": dict,
    }
    # Read once; tolerate a missing or explicitly null "required" entry.
    required = set(schema.get("required") or [])
    fields = {}
    for prop, meta in schema.get("properties", {}).items():
        json_type = meta.get("type")
        if isinstance(json_type, list):
            # JSON Schema allows a list of types, e.g. ["string", "null"].
            # A list is unhashable and would crash the lookup below, so use
            # the first non-null entry.
            json_type = next((t for t in json_type if t != "null"), None)
        py_type = typemap.get(json_type, Any)
        default = ... if prop in required else None
        fields[prop] = (py_type, Field(default, description=meta.get("description")))
    return create_model(schema.get("title", "DynamicModel"), **fields)
|
||||
@@ -0,0 +1,2 @@
|
||||
def test_placeholder():
    """Placeholder test with no assertions."""
    return None
|
||||
@@ -0,0 +1,43 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
.pnp.*
|
||||
.yarn/*
|
||||
!.yarn/patches
|
||||
!.yarn/plugins
|
||||
!.yarn/releases
|
||||
!.yarn/versions
|
||||
|
||||
# testing
|
||||
/coverage
|
||||
|
||||
# next.js
|
||||
/.next/
|
||||
/out/
|
||||
/dist/
|
||||
|
||||
# production
|
||||
/build
|
||||
|
||||
# misc
|
||||
.DS_Store
|
||||
*.pem
|
||||
|
||||
# debug
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# env files (can opt-in for committing if needed)
|
||||
.env*
|
||||
|
||||
# vercel
|
||||
.vercel
|
||||
|
||||
# typescript
|
||||
*.tsbuildinfo
|
||||
next-env.d.ts
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
# Data Extraction UI
|
||||
|
||||
This is a simple Vite + React template that builds on the @llamaindex/ui component library
|
||||
for displaying tables of extracted data.
|
||||
|
||||
Ideally run this with `llamactl` in the parent directory (See [README.md](../README.md)),
|
||||
although you can also run it standalone with `npm run dev`; in that case workflow integrations will not work.
|
||||
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"$schema": "https://ui.shadcn.com/schema.json",
|
||||
"style": "new-york",
|
||||
"rsc": true,
|
||||
"tsx": true,
|
||||
"tailwind": {
|
||||
"config": "",
|
||||
"css": "src/index.css",
|
||||
"baseColor": "zinc",
|
||||
"cssVariables": true,
|
||||
"prefix": ""
|
||||
},
|
||||
"aliases": {
|
||||
"components": "@/components",
|
||||
"utils": "@/lib/utils",
|
||||
"ui": "@/components/ui",
|
||||
"lib": "@/lib",
|
||||
"hooks": "@/hooks"
|
||||
},
|
||||
"iconLibrary": "lucide"
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Review</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.tsx"></script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"name": "extraction-review-ui",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "tsc && vite build",
|
||||
"preview": "vite preview",
|
||||
"lint": "tsc --noEmit",
|
||||
"format": "prettier --write src",
|
||||
"format-check": "prettier --check src",
|
||||
"all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build",
|
||||
"all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build"
|
||||
},
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.27.6",
|
||||
"@lezer/highlight": "^1.2.1",
|
||||
"@llamaindex/ui": "^2.1.2",
|
||||
"@radix-ui/themes": "^3.2.1",
|
||||
"class-variance-authority": "^0.7.1",
|
||||
"clsx": "^2.1.1",
|
||||
"llama-cloud-services": "^0.3.4",
|
||||
"lucide-react": "^0.514.0",
|
||||
"react": "^18.3.0",
|
||||
"react-dom": "^18.3.0",
|
||||
"react-router-dom": "^6.30.0",
|
||||
"sonner": "^2.0.5",
|
||||
"tw-animate-css": "^1.3.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@tailwindcss/postcss": "^4.1.10",
|
||||
"@types/node": "^20",
|
||||
"@types/react": "^19",
|
||||
"@types/react-dom": "^19",
|
||||
"@vitejs/plugin-react": "^4.3.4",
|
||||
"postcss": "^8.5.5",
|
||||
"prettier": "^3.6.2",
|
||||
"tailwind-merge": "^3.3.1",
|
||||
"tailwindcss": "^4.1.8",
|
||||
"typescript": "^5",
|
||||
"vite": "^6.0.5"
|
||||
},
|
||||
"packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912"
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
const config = {
|
||||
plugins: {
|
||||
"@tailwindcss/postcss": {},
|
||||
},
|
||||
};
|
||||
|
||||
export default config;
|
||||
@@ -0,0 +1,70 @@
|
||||
import React from "react";
|
||||
import { Routes, Route } from "react-router-dom";
|
||||
import { Theme } from "@radix-ui/themes";
|
||||
import {
|
||||
Breadcrumb,
|
||||
BreadcrumbItem,
|
||||
BreadcrumbList,
|
||||
BreadcrumbSeparator,
|
||||
} from "@llamaindex/ui";
|
||||
import { Link } from "react-router-dom";
|
||||
import { Toaster } from "@llamaindex/ui";
|
||||
import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext";
|
||||
import { MetadataProvider } from "@/lib/MetadataProvider";
|
||||
|
||||
// Import pages
|
||||
import HomePage from "./pages/HomePage";
|
||||
import ItemPage from "./pages/ItemPage";
|
||||
|
||||
/**
 * Application shell: theme, metadata and toolbar providers around a
 * two-row layout (toolbar on top, routed page below) plus the toaster.
 */
export default function App() {
  // Route table, kept separate from the layout markup.
  const pages = (
    <Routes>
      <Route path="/" element={<HomePage />} />
      <Route path="/item/:itemId" element={<ItemPage />} />
    </Routes>
  );

  const layout = (
    <div className="grid grid-rows-[auto_1fr] h-screen">
      <Toolbar />
      <main className="overflow-auto">{pages}</main>
    </div>
  );

  return (
    <Theme>
      <MetadataProvider>
        <ToolbarProvider>
          {layout}
          <Toaster />
        </ToolbarProvider>
      </MetadataProvider>
    </Theme>
  );
}
|
||||
|
||||
/**
 * Sticky header showing the breadcrumb trail and the buttons currently
 * registered in the toolbar context.
 */
const Toolbar = () => {
  const { buttons, breadcrumbs } = useToolbar();

  // A crumb is a router link when it has an href and is not the current
  // page; otherwise it is plain text (only the first crumb gets the base size).
  const renderCrumb = (
    crumb: (typeof breadcrumbs)[number],
    position: number,
  ) => {
    if (crumb.href && !crumb.isCurrentPage) {
      return (
        <Link to={crumb.href} className="font-medium text-base">
          {crumb.label}
        </Link>
      );
    }
    return (
      <span className={`font-medium ${position === 0 ? "text-base" : ""}`}>
        {crumb.label}
      </span>
    );
  };

  const trail = breadcrumbs.map((crumb, position) => (
    <React.Fragment key={position}>
      {position > 0 && <BreadcrumbSeparator />}
      <BreadcrumbItem>{renderCrumb(crumb, position)}</BreadcrumbItem>
    </React.Fragment>
  ));

  return (
    <header className="sticky top-0 z-50 flex h-16 shrink-0 items-center gap-2 border-b px-4 bg-white/95 backdrop-blur supports-[backdrop-filter]:bg-white/60">
      <Breadcrumb>
        <BreadcrumbList>{trail}</BreadcrumbList>
      </Breadcrumb>
      {buttons}
    </header>
  );
};
|
||||
@@ -0,0 +1,120 @@
|
||||
@import "tailwindcss";
|
||||
@import "tw-animate-css";
|
||||
|
||||
@custom-variant dark (&:is(.dark *));
|
||||
|
||||
@theme inline {
|
||||
--radius-sm: calc(var(--radius) - 4px);
|
||||
--radius-md: calc(var(--radius) - 2px);
|
||||
--radius-lg: var(--radius);
|
||||
--radius-xl: calc(var(--radius) + 4px);
|
||||
--color-background: var(--background);
|
||||
--color-foreground: var(--foreground);
|
||||
--color-card: var(--card);
|
||||
--color-card-foreground: var(--card-foreground);
|
||||
--color-popover: var(--popover);
|
||||
--color-popover-foreground: var(--popover-foreground);
|
||||
--color-primary: var(--primary);
|
||||
--color-primary-foreground: var(--primary-foreground);
|
||||
--color-secondary: var(--secondary);
|
||||
--color-secondary-foreground: var(--secondary-foreground);
|
||||
--color-muted: var(--muted);
|
||||
--color-muted-foreground: var(--muted-foreground);
|
||||
--color-accent: var(--accent);
|
||||
--color-accent-foreground: var(--accent-foreground);
|
||||
--color-destructive: var(--destructive);
|
||||
--color-border: var(--border);
|
||||
--color-input: var(--input);
|
||||
--color-ring: var(--ring);
|
||||
--color-chart-1: var(--chart-1);
|
||||
--color-chart-2: var(--chart-2);
|
||||
--color-chart-3: var(--chart-3);
|
||||
--color-chart-4: var(--chart-4);
|
||||
--color-chart-5: var(--chart-5);
|
||||
--color-sidebar: var(--sidebar);
|
||||
--color-sidebar-foreground: var(--sidebar-foreground);
|
||||
--color-sidebar-primary: var(--sidebar-primary);
|
||||
--color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
|
||||
--color-sidebar-accent: var(--sidebar-accent);
|
||||
--color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
|
||||
--color-sidebar-border: var(--sidebar-border);
|
||||
--color-sidebar-ring: var(--sidebar-ring);
|
||||
}
|
||||
|
||||
:root {
|
||||
--radius: 0.625rem;
|
||||
--card: oklch(1 0 0);
|
||||
--card-foreground: oklch(0.141 0.005 285.823);
|
||||
--popover: oklch(1 0 0);
|
||||
--popover-foreground: oklch(0.141 0.005 285.823);
|
||||
--primary: oklch(0.21 0.006 285.885);
|
||||
--primary-foreground: oklch(0.985 0 0);
|
||||
--secondary: oklch(0.967 0.001 286.375);
|
||||
--secondary-foreground: oklch(0.21 0.006 285.885);
|
||||
--muted: oklch(0.967 0.001 286.375);
|
||||
--muted-foreground: oklch(0.552 0.016 285.938);
|
||||
--accent: oklch(0.967 0.001 286.375);
|
||||
--accent-foreground: oklch(0.21 0.006 285.885);
|
||||
--destructive: oklch(0.577 0.245 27.325);
|
||||
--border: oklch(0.92 0.004 286.32);
|
||||
--input: oklch(0.92 0.004 286.32);
|
||||
--ring: oklch(0.705 0.015 286.067);
|
||||
--chart-1: oklch(0.646 0.222 41.116);
|
||||
--chart-2: oklch(0.6 0.118 184.704);
|
||||
--chart-3: oklch(0.398 0.07 227.392);
|
||||
--chart-4: oklch(0.828 0.189 84.429);
|
||||
--chart-5: oklch(0.769 0.188 70.08);
|
||||
--sidebar: oklch(0.985 0 0);
|
||||
--sidebar-foreground: oklch(0.141 0.005 285.823);
|
||||
--sidebar-primary: oklch(0.21 0.006 285.885);
|
||||
--sidebar-primary-foreground: oklch(0.985 0 0);
|
||||
--sidebar-accent: oklch(0.967 0.001 286.375);
|
||||
--sidebar-accent-foreground: oklch(0.21 0.006 285.885);
|
||||
--sidebar-border: oklch(0.92 0.004 286.32);
|
||||
--sidebar-ring: oklch(0.705 0.015 286.067);
|
||||
--background: oklch(1 0 0);
|
||||
--foreground: oklch(0.141 0.005 285.823);
|
||||
}
|
||||
|
||||
.dark {
|
||||
--background: oklch(0.141 0.005 285.823);
|
||||
--foreground: oklch(0.985 0 0);
|
||||
--card: oklch(0.21 0.006 285.885);
|
||||
--card-foreground: oklch(0.985 0 0);
|
||||
--popover: oklch(0.21 0.006 285.885);
|
||||
--popover-foreground: oklch(0.985 0 0);
|
||||
--primary: oklch(0.92 0.004 286.32);
|
||||
--primary-foreground: oklch(0.21 0.006 285.885);
|
||||
--secondary: oklch(0.274 0.006 286.033);
|
||||
--secondary-foreground: oklch(0.985 0 0);
|
||||
--muted: oklch(0.274 0.006 286.033);
|
||||
--muted-foreground: oklch(0.705 0.015 286.067);
|
||||
--accent: oklch(0.274 0.006 286.033);
|
||||
--accent-foreground: oklch(0.985 0 0);
|
||||
--destructive: oklch(0.704 0.191 22.216);
|
||||
--border: oklch(1 0 0 / 10%);
|
||||
--input: oklch(1 0 0 / 15%);
|
||||
--ring: oklch(0.552 0.016 285.938);
|
||||
--chart-1: oklch(0.488 0.243 264.376);
|
||||
--chart-2: oklch(0.696 0.17 162.48);
|
||||
--chart-3: oklch(0.769 0.188 70.08);
|
||||
--chart-4: oklch(0.627 0.265 303.9);
|
||||
--chart-5: oklch(0.645 0.246 16.439);
|
||||
--sidebar: oklch(0.21 0.006 285.885);
|
||||
--sidebar-foreground: oklch(0.985 0 0);
|
||||
--sidebar-primary: oklch(0.488 0.243 264.376);
|
||||
--sidebar-primary-foreground: oklch(0.985 0 0);
|
||||
--sidebar-accent: oklch(0.274 0.006 286.033);
|
||||
--sidebar-accent-foreground: oklch(0.985 0 0);
|
||||
--sidebar-border: oklch(1 0 0 / 10%);
|
||||
--sidebar-ring: oklch(0.552 0.016 285.938);
|
||||
}
|
||||
|
||||
@layer base {
|
||||
* {
|
||||
@apply border-border outline-ring/50;
|
||||
}
|
||||
body {
|
||||
@apply bg-background text-foreground;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
import { createContext, useContext, ReactNode, useMemo } from "react";
|
||||
import { ApiProvider, ApiClients } from "@llamaindex/ui";
|
||||
import { useMetadata, Metadata } from "./useMetadata";
|
||||
import { createBaseWorkflowClient, createClients } from "./client";
|
||||
import { Clock, XCircle } from "lucide-react";
|
||||
|
||||
interface MetadataContextValue {
|
||||
metadata: Metadata;
|
||||
clients: ApiClients;
|
||||
}
|
||||
|
||||
const MetadataContext = createContext<MetadataContextValue | null>(null);
|
||||
|
||||
/**
 * Outer provider: supplies an ApiProvider holding only the workflows client,
 * then renders InnerMetadataProvider around the children.
 */
export function MetadataProvider({ children }: { children: ReactNode }) {
  // Created once per mount (empty dependency list).
  const baseClients: ApiClients = useMemo(
    () => ({ workflowsClient: createBaseWorkflowClient() }) as ApiClients,
    [],
  );

  return (
    <ApiProvider clients={baseClients}>
      <InnerMetadataProvider>{children}</InnerMetadataProvider>
    </ApiProvider>
  );
}
|
||||
|
||||
/**
 * Loads metadata through useMetadata, builds the API clients from it and
 * exposes both through MetadataContext. Renders a loading screen while
 * loading, and an error screen when loading failed or no metadata/clients
 * are available.
 */
function InnerMetadataProvider({ children }: { children: ReactNode }) {
  const { metadata, loading, error } = useMetadata();
  const clients = useMemo(
    () => (metadata ? createClients(metadata) : undefined),
    [metadata],
  );
  // Memoized so context consumers only re-render when metadata or clients
  // actually change, not on every render of this provider.
  const contextValue = useMemo(
    () => (metadata && clients ? { metadata, clients } : null),
    [metadata, clients],
  );

  if (loading) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
          <div className="text-sm text-gray-500">Loading configuration...</div>
        </div>
      </div>
    );
  }

  if (error || !metadata || !clients || !contextValue) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
          <div className="text-sm text-gray-500">
            Error loading configuration: {error || "Unknown error"}
          </div>
        </div>
      </div>
    );
  }

  return (
    <MetadataContext.Provider value={contextValue}>
      <ApiProvider clients={clients}>{children}</ApiProvider>
    </MetadataContext.Provider>
  );
}
|
||||
|
||||
/**
 * Returns the value of MetadataContext.
 * Throws when no provider value is available.
 */
export function useMetadataContext() {
  const value = useContext(MetadataContext);
  if (value) {
    return value;
  }
  throw new Error("useMetadataContext must be used within MetadataProvider");
}
|
||||
@@ -0,0 +1,41 @@
|
||||
import React from "react";
|
||||
import { APP_TITLE } from "./config";
|
||||
|
||||
export interface BreadcrumbItem {
|
||||
label: string;
|
||||
href?: string;
|
||||
isCurrentPage?: boolean;
|
||||
}
|
||||
|
||||
export const ToolbarCtx = React.createContext<{
|
||||
buttons: React.ReactNode[];
|
||||
setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void;
|
||||
breadcrumbs: BreadcrumbItem[];
|
||||
setBreadcrumbs: (items: BreadcrumbItem[]) => void;
|
||||
}>({
|
||||
buttons: [],
|
||||
setButtons: () => {},
|
||||
breadcrumbs: [],
|
||||
setBreadcrumbs: () => {},
|
||||
});
|
||||
|
||||
/**
 * Holds the toolbar state (buttons and breadcrumb trail) and shares it
 * through ToolbarCtx. The trail starts with a single crumb linking to "/".
 */
export const ToolbarProvider = ({
  children,
}: {
  children: React.ReactNode;
}) => {
  const [buttons, setButtons] = React.useState<React.ReactNode[]>([]);
  const [breadcrumbs, setBreadcrumbs] = React.useState<BreadcrumbItem[]>([
    { label: APP_TITLE, href: "/" },
  ]);

  // Memoized so consumers re-render only when buttons or breadcrumbs change;
  // the state setters returned by useState keep a stable identity.
  const value = React.useMemo(
    () => ({ buttons, setButtons, breadcrumbs, setBreadcrumbs }),
    [buttons, breadcrumbs],
  );

  return <ToolbarCtx.Provider value={value}>{children}</ToolbarCtx.Provider>;
};
|
||||
|
||||
export const useToolbar = () => React.useContext(ToolbarCtx);
|
||||
@@ -0,0 +1,51 @@
|
||||
import { ExtractedData } from "llama-cloud-services/beta/agent";
|
||||
import {
|
||||
ApiClients,
|
||||
createWorkflowsClient,
|
||||
createWorkflowsConfig,
|
||||
createCloudAgentClient,
|
||||
cloudApiClient,
|
||||
} from "@llamaindex/ui";
|
||||
import { AGENT_NAME } from "./config";
|
||||
import type { Metadata } from "./useMetadata";
|
||||
|
||||
const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY;
|
||||
const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL;
|
||||
const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID;
|
||||
|
||||
// Configure the platform client
|
||||
cloudApiClient.setConfig({
|
||||
...(apiBaseUrl && { baseUrl: apiBaseUrl }),
|
||||
headers: {
|
||||
// Optionally authenticate with a backend API token scoped to a project (e.g. for local development).
|
||||
...(platformToken && { authorization: `Bearer ${platformToken}` }),
|
||||
// This header is required for requests to correctly scope to the agent's project
|
||||
// when authenticating with a user cookie
|
||||
...(projectId && { "Project-Id": projectId }),
|
||||
},
|
||||
});
|
||||
|
||||
/**
 * Builds a workflows client whose base URL is the deployment path for
 * AGENT_NAME.
 */
export function createBaseWorkflowClient(): ReturnType<
  typeof createWorkflowsClient
> {
  const config = createWorkflowsConfig({
    baseUrl: `/deployments/${AGENT_NAME}/`,
  });
  return createWorkflowsClient(config);
}
|
||||
|
||||
/**
 * Builds the full client set: the workflows client, the shared cloud API
 * client, and an agent data client bound to the metadata's extracted-data
 * collection.
 */
export function createClients(metadata: Metadata): ApiClients {
  const workflowsClient = createBaseWorkflowClient();

  // window is not defined outside the browser, so guard the lookup.
  const windowUrl =
    typeof window !== "undefined" ? window.location.href : undefined;

  const agentDataClient = createCloudAgentClient<ExtractedData<any>>({
    client: cloudApiClient,
    windowUrl,
    collection: metadata.extracted_data_collection,
  });

  return { workflowsClient, cloudApiClient, agentDataClient } as ApiClients;
}
|
||||
@@ -0,0 +1,2 @@
|
||||
export const APP_TITLE = "Extraction Review";
|
||||
export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME;
|
||||
@@ -0,0 +1,39 @@
|
||||
import type {
|
||||
ExtractedData,
|
||||
TypedAgentData,
|
||||
} from "llama-cloud-services/beta/agent";
|
||||
|
||||
/**
 * Downloads data as a JSON file.
 *
 * Serializes `data` with 2-space indentation, wraps it in a Blob and
 * triggers a download by clicking a temporary anchor element.
 *
 * @param data - Value to serialize with JSON.stringify.
 * @param filename - Name suggested for the downloaded file.
 */
export function downloadJSON<T>(
  data: T,
  filename: string = "extraction-results.json",
) {
  const jsonString = JSON.stringify(data, null, 2);
  const blob = new Blob([jsonString], { type: "application/json" });
  const url = URL.createObjectURL(blob);
  const link = document.createElement("a");

  try {
    link.href = url;
    link.download = filename;
    document.body.appendChild(link);
    link.click();
  } finally {
    // Cleanup runs even if appending or clicking throws, so neither the
    // anchor nor the object URL is leaked. remove() is a no-op when the
    // anchor was never attached.
    link.remove();
    URL.revokeObjectURL(url);
  }
}
|
||||
|
||||
/**
 * Downloads an extracted data item as JSON.
 *
 * The file is named `<file_name>-<YYYY-MM-DD>.json`, where the date is the
 * date part of the item's `createdAt` ISO string and `file_name` falls back
 * to "item" when empty.
 */
export function downloadExtractedDataItem<T>(
  item: TypedAgentData<ExtractedData<T>>,
) {
  const baseName = item.data.file_name || "item";
  // Keep only the part of the ISO timestamp before the "T".
  const [datePart] = item.createdAt.toISOString().split("T");

  downloadJSON(item, `${baseName}-${datePart}.json`);
}
|
||||
@@ -0,0 +1,41 @@
|
||||
import { useWorkflowHandler, useWorkflowRun } from "@llamaindex/ui";
|
||||
import { useEffect, useState } from "react";
|
||||
|
||||
export interface Metadata {
|
||||
schemas: Record<string, any>;
|
||||
extracted_data_collection: string;
|
||||
}
|
||||
|
||||
export interface UseMetadataResult {
|
||||
metadata: Metadata;
|
||||
loading: boolean;
|
||||
error: string | undefined;
|
||||
}
|
||||
|
||||
/**
 * Starts the "metadata" workflow once on mount and exposes its result.
 *
 * Metadata is read from the first handler event whose type ends with
 * "MetadataResponse". `loading` stays true until either that event has
 * arrived or an error was recorded, so callers never see a
 * "not loading, no metadata, no error" state while the response is pending.
 *
 * NOTE(review): a handler that finishes without ever emitting a
 * MetadataResponse event keeps `loading` true; confirm whether the handler
 * exposes a failure status that should be mapped to `error`.
 */
export function useMetadata() {
  const run = useWorkflowRun();
  const [handlerId, setHandlerId] = useState<string | undefined>(undefined);
  const handler = useWorkflowHandler(handlerId ?? "");
  const [error, setError] = useState<string | undefined>(undefined);
  const [starting, setStarting] = useState(true);

  useEffect(() => {
    // Ignore results that arrive after unmount (or after a re-run of this
    // effect) so state is never set from a superseded request.
    let cancelled = false;
    setStarting(true);
    run
      .runWorkflow("metadata", {})
      .then((handlerSummary) => {
        if (!cancelled) {
          setHandlerId(handlerSummary.handler_id);
        }
      })
      .catch((err: unknown) => {
        if (!cancelled) {
          // Rejections are not guaranteed to be Error instances.
          const message = err instanceof Error ? err.message : String(err);
          setError(message || "Unknown error");
        }
      })
      .finally(() => {
        if (!cancelled) {
          setStarting(false);
        }
      });
    return () => {
      cancelled = true;
    };
  }, []);

  const stopEvent = handler.events.find((event) =>
    event.type.endsWith("MetadataResponse"),
  );
  const metadata = stopEvent?.data as Metadata | undefined;
  // The workflow start resolving only means a handler exists; the metadata
  // itself arrives later through the handler's events.
  const loading = starting || (error === undefined && metadata === undefined);
  return { metadata, loading, error };
}
|
||||
@@ -0,0 +1,6 @@
|
||||
import { clsx, type ClassValue } from "clsx";
|
||||
import { twMerge } from "tailwind-merge";
|
||||
|
||||
/** Combines class values with clsx, then passes the result through twMerge. */
export function cn(...inputs: ClassValue[]) {
  const combined = clsx(inputs);
  return twMerge(combined);
}
|
||||
@@ -0,0 +1,14 @@
|
||||
import { StrictMode } from "react";
|
||||
import { createRoot } from "react-dom/client";
|
||||
import { HashRouter } from "react-router-dom";
|
||||
import App from "./App";
|
||||
import "@llamaindex/ui/styles.css";
|
||||
import "./index.css";
|
||||
|
||||
createRoot(document.getElementById("root")!).render(
|
||||
<StrictMode>
|
||||
<HashRouter>
|
||||
<App />
|
||||
</HashRouter>
|
||||
</StrictMode>,
|
||||
);
|
||||
@@ -0,0 +1,23 @@
|
||||
.main {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.grid {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
gap: 1rem;
|
||||
margin-bottom: 1rem;
|
||||
& > * {
|
||||
flex: 1;
|
||||
}
|
||||
}
|
||||
|
||||
.commandBar {
|
||||
display: flex;
|
||||
justify-content: flex-end;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.progressBar {
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
import {
|
||||
ItemCount,
|
||||
WorkflowTrigger,
|
||||
WorkflowProgressBar,
|
||||
ExtractedDataItemGrid,
|
||||
useWorkflowHandlerList,
|
||||
} from "@llamaindex/ui";
|
||||
import type { TypedAgentData } from "llama-cloud-services/beta/agent";
|
||||
import styles from "./HomePage.module.css";
|
||||
import { useNavigate } from "react-router-dom";
|
||||
import { useEffect, useState } from "react";
|
||||
|
||||
/**
 * Home page: renders the task list, remounting it (via `key`) whenever a
 * running "process-file" task finishes.
 */
export default function HomePage() {
  const { taskKey } = useTaskCompletedState();
  return <TaskList key={taskKey} />;
}

/**
 * Returns a key that increments when a task is completed, can be used to force a re-render of the task list.
 *
 * Named with the `use` prefix because it calls hooks, so the Rules of Hooks
 * apply to it (and lint tooling can check them).
 */
function useTaskCompletedState() {
  const { handlers } = useWorkflowHandlerList("process-file");
  const runningNow = handlers.filter(
    (handler) => handler.status === "running",
  ).length;
  const [runningTaskCount, setRunningTaskCount] = useState(runningNow);
  const [taskKey, setTaskKey] = useState(0);

  useEffect(() => {
    if (runningNow < runningTaskCount) {
      // forcefully reload task list after a task is completed
      setTaskKey((key) => key + 1);
    }
    // Once the stored count matches, the re-run of this effect is a no-op.
    setRunningTaskCount(runningNow);
  }, [runningNow, runningTaskCount]);

  return { runningTaskCount, taskKey };
}
|
||||
|
||||
/**
 * Dashboard body: item counters, the upload trigger and progress bar for
 * the "process-file" workflow, and the grid of extracted items. Clicking a
 * row navigates to that item's page.
 */
function TaskList() {
  const navigate = useNavigate();

  function goToItem(item: TypedAgentData) {
    navigate(`/item/${item.id}`);
  }

  // Filters for the two status counters.
  const approvedFilter = { status: { eq: "approved" } };
  const pendingFilter = { status: { eq: "pending_review" } };

  // Built-in columns shown in the grid.
  const gridColumns = {
    fileName: true,
    status: true,
    createdAt: true,
    itemsToReview: true,
    actions: true,
  };

  return (
    <div className={styles.page}>
      <main className={styles.main}>
        <div className={styles.grid}>
          <ItemCount title="Total Items" />
          <ItemCount title="Reviewed" filter={approvedFilter} />
          <ItemCount title="Needs Review" filter={pendingFilter} />
        </div>
        <div className={styles.commandBar}>
          <WorkflowTrigger
            workflowName="process-file"
            customWorkflowInput={(files) => ({ file_id: files[0].fileId })}
          />
        </div>
        <WorkflowProgressBar
          className={styles.progressBar}
          workflowName="process-file"
        />
        <ExtractedDataItemGrid
          onRowClick={goToItem}
          builtInColumns={gridColumns}
        />
      </main>
    </div>
  );
}
|
||||
@@ -0,0 +1,189 @@
|
||||
import { useEffect, useState } from "react";
|
||||
import {
|
||||
AcceptReject,
|
||||
ExtractedDataDisplay,
|
||||
FilePreview,
|
||||
useItemData,
|
||||
type Highlight,
|
||||
Button,
|
||||
} from "@llamaindex/ui";
|
||||
import { Clock, XCircle, Download } from "lucide-react";
|
||||
import { useParams } from "react-router-dom";
|
||||
import { useToolbar } from "@/lib/ToolbarContext";
|
||||
import { useNavigate } from "react-router-dom";
|
||||
import { modifyJsonSchema } from "@llamaindex/ui/lib";
|
||||
import { APP_TITLE } from "@/lib/config";
|
||||
import { downloadExtractedDataItem } from "@/lib/export";
|
||||
import { useMetadataContext } from "@/lib/MetadataProvider";
|
||||
|
||||
/**
 * Review page for a single extracted item.
 *
 * Reads `itemId` from the route, loads the item through `useItemData`, and
 * renders a file preview on the left and the editable extracted data on the
 * right. While mounted it also publishes a breadcrumb and toolbar actions
 * ("Export JSON" and accept/reject) through the toolbar context, and clears
 * them again on unmount.
 */
export default function ItemPage() {
  const { itemId } = useParams<{ itemId: string }>();
  const { setButtons, setBreadcrumbs } = useToolbar();
  const [highlight, setHighlight] = useState<Highlight | undefined>(undefined);
  const { metadata } = useMetadataContext();

  // Use the hook to fetch item data (initially with a default schema)
  const itemHookData = useItemData<any>({
    // The classification is only known once the item has loaded, so the
    // "10-K" schema is used as the starting point.
    jsonSchema: modifyJsonSchema(metadata.schemas["10-K"] || {}, {}),
    itemId: itemId as string,
    isMock: false,
  });

  // Destructure up front so `itemData` is declared before the effects and
  // handlers below that close over it.
  const {
    item: itemData,
    updateData,
    loading: isLoading,
    error,
  } = itemHookData;

  // Determine the correct schema based on classification; falls back to
  // "10-K" when the item carries no classification yet.
  const classification = (
    (itemData?.data?.metadata?.classification as string | undefined) || "10-K"
  ).toUpperCase();
  const correctSchema =
    metadata.schemas[classification] || metadata.schemas["10-K"];
  const classificationReasoning = itemData?.data?.metadata
    ?.classification_reasoning as string | undefined;

  // `schemaKey` is used as the React `key` of the data display so that the
  // display is remounted whenever a different schema is applied.
  const [schemaKey, setSchemaKey] = useState(0);
  const [appliedSchema, setAppliedSchema] = useState(correctSchema);

  useEffect(() => {
    if (classification && metadata.schemas[classification]) {
      setAppliedSchema(modifyJsonSchema(metadata.schemas[classification], {}));
      // Functional update: the next key is derived from the latest state
      // instead of a `schemaKey` value captured by this effect's closure.
      setSchemaKey((previousKey) => previousKey + 1);
    }
  }, [classification, metadata.schemas]);

  const navigate = useNavigate();

  // Update breadcrumb when item data loads
  const fileName = itemData?.data?.file_name;
  useEffect(() => {
    if (fileName) {
      setBreadcrumbs([
        { label: APP_TITLE, href: "/" },
        {
          label: fileName,
          isCurrentPage: true,
        },
      ]);
    }

    return () => {
      // Reset to default breadcrumb when leaving the page
      setBreadcrumbs([{ label: APP_TITLE, href: "/" }]);
    };
  }, [fileName, setBreadcrumbs]);

  // Publish the toolbar actions for this page and remove them on cleanup.
  useEffect(() => {
    setButtons(() => [
      // The element lives in an array, so it is given a stable key.
      <div key="item-actions" className="ml-auto flex items-center gap-2">
        <Button
          variant="outline"
          size="sm"
          onClick={() => {
            if (itemData) {
              downloadExtractedDataItem(itemData);
            }
          }}
          disabled={!itemData}
        >
          <Download className="h-4 w-4 mr-2" />
          Export JSON
        </Button>
        <AcceptReject<any>
          itemData={itemHookData}
          onComplete={() => navigate("/")}
        />
      </div>,
    ]);
    return () => {
      setButtons(() => []);
    };
  }, [itemHookData.data, setButtons]);

  if (isLoading) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <Clock className="h-8 w-8 animate-spin mx-auto mb-2" />
          <div className="text-sm text-gray-500">Loading item...</div>
        </div>
      </div>
    );
  }

  if (error || !itemData) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-center">
          <XCircle className="h-8 w-8 text-red-500 mx-auto mb-2" />
          <div className="text-sm text-gray-500">
            Error loading item: {error || "Item not found"}
          </div>
        </div>
      </div>
    );
  }

  return (
    <div className="flex h-full bg-gray-50">
      {/* Left Side - File Preview */}
      <div className="w-1/2 border-r border-gray-200 bg-white">
        {itemData.data.file_id && (
          <FilePreview
            fileId={itemData.data.file_id}
            onBoundingBoxClick={(box, pageNumber) => {
              console.log("Bounding box clicked:", box, "on page:", pageNumber);
            }}
            highlight={highlight}
          />
        )}
      </div>

      {/* Right Side - Review Panel */}
      <div className="flex-1 bg-white h-full overflow-y-auto">
        <div className="p-4 space-y-4">
          {/* Classification Info */}
          {classification && (
            <div className="bg-blue-50 border border-blue-200 rounded-lg p-3 mb-4">
              <div className="text-sm font-semibold text-blue-900">
                Document Type: {classification}
              </div>
              {classificationReasoning && (
                <div className="text-xs text-blue-600 mt-1">
                  {classificationReasoning}
                </div>
              )}
            </div>
          )}
          {/* Extracted Data */}
          <ExtractedDataDisplay<any>
            key={schemaKey}
            extractedData={itemData.data}
            title="Extracted Data"
            onChange={(updatedData) => {
              updateData(updatedData);
            }}
            onClickField={(args) => {
              // TODO: set multiple highlights
              // Only the page of the first citation is used; the box
              // coordinates below are fixed placeholder values.
              setHighlight({
                page: args.metadata?.citation?.[0]?.page ?? 1,
                x: 100,
                y: 100,
                width: 0,
                height: 0,
              });
            }}
            jsonSchema={appliedSchema}
          />
        </div>
      </div>
    </div>
  );
}
|
||||
Vendored
+15
@@ -0,0 +1,15 @@
|
||||
/// <reference types="vite/client" />
|
||||
|
||||
/**
 * Typed view of the `import.meta.env` variables this app reads.
 */
interface ImportMetaEnv {
  readonly VITE_LLAMA_CLOUD_API_KEY?: string;
  readonly VITE_LLAMA_CLOUD_BASE_URL?: string;

  // injected from llama_deploy
  readonly VITE_LLAMA_DEPLOY_BASE_PATH: string;
  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string;
  readonly VITE_LLAMA_DEPLOY_PROJECT_ID: string;

  // Key written by the `define` section of vite.config.ts. Declared as
  // optional because its value comes from an environment variable that may
  // be unset at build time.
  readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH?: string;
}
|
||||
|
||||
/** Augments `import.meta` so that `env` is typed as `ImportMetaEnv`. */
interface ImportMeta {
  readonly env: ImportMetaEnv;
}
|
||||
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2020",
|
||||
"useDefineForClassFields": true,
|
||||
"lib": ["ES2020", "DOM", "DOM.Iterable"],
|
||||
"module": "ESNext",
|
||||
"skipLibCheck": true,
|
||||
|
||||
/* Bundler mode */
|
||||
"moduleResolution": "bundler",
|
||||
"allowImportingTsExtensions": true,
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"noEmit": true,
|
||||
"jsx": "react-jsx",
|
||||
|
||||
/* Linting */
|
||||
"strict": true,
|
||||
"noUnusedLocals": true,
|
||||
"noUnusedParameters": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
|
||||
/* Path mapping */
|
||||
"baseUrl": ".",
|
||||
"paths": {
|
||||
"@/*": ["./src/*"]
|
||||
}
|
||||
},
|
||||
"include": ["src", "vite.config.ts", "src/vite-env.d.ts"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
import { defineConfig } from "vite";
|
||||
import react from "@vitejs/plugin-react";
|
||||
import path from "path";
|
||||
|
||||
// https://vitejs.dev/config/
|
||||
/**
 * Vite configuration, built from environment variables at config-load time.
 *
 * Reads LLAMA_DEPLOY_DEPLOYMENT_NAME, LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH,
 * LLAMA_DEPLOY_PROJECT_ID, LLAMA_CLOUD_BASE_URL and PORT from `process.env`
 * and forwards the first four to client code as `import.meta.env.VITE_*`
 * replacements.
 */
export default defineConfig(() => {
  const deploymentName = process.env.LLAMA_DEPLOY_DEPLOYMENT_NAME;
  const basePath = process.env.LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH;
  const projectId = process.env.LLAMA_DEPLOY_PROJECT_ID;
  const baseUrl = process.env.LLAMA_CLOUD_BASE_URL;

  // Fall back to 3000 when PORT is unset, empty, non-numeric or outside the
  // valid TCP port range, instead of handing NaN to the dev server.
  const DEFAULT_PORT = 3000;
  const requestedPort = process.env.PORT
    ? Number(process.env.PORT)
    : DEFAULT_PORT;
  const isUsablePort =
    Number.isInteger(requestedPort) &&
    requestedPort >= 0 &&
    requestedPort <= 65535;
  const port = isUsablePort ? requestedPort : DEFAULT_PORT;

  return {
    plugins: [react()],
    resolve: {
      alias: {
        "@": path.resolve(__dirname, "./src"),
      },
    },
    server: {
      port,
      host: true,
    },
    build: {
      outDir: "dist",
      sourcemap: true,
    },
    base: basePath,
    define: {
      // Primary define uses NAME
      "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME":
        JSON.stringify(deploymentName),
      "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH":
        JSON.stringify(basePath),
      // The remaining keys are only defined when their source variable is set.
      ...(projectId && {
        "import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID":
          JSON.stringify(projectId),
      }),
      ...(baseUrl && {
        "import.meta.env.VITE_LLAMA_CLOUD_BASE_URL": JSON.stringify(baseUrl),
      }),
    },
  };
});
|
||||
@@ -0,0 +1,2 @@
|
||||
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
|
||||
{{ _copier_answers|to_nice_yaml -}}
|
||||
Reference in New Issue
Block a user