From 726d6c0e4ce652308f9465fe2ca3cabbdf468e41 Mon Sep 17 00:00:00 2001 From: Adrian Lyjak Date: Tue, 4 Nov 2025 17:29:54 -0500 Subject: [PATCH] Add back classify-extract-sec --- .copier-answers.yml | 3 + .env.template | 2 + .gitignore | 7 + README.md | 69 +++- pyproject.toml | 51 +++ src/extraction_review/__init__.py | 0 src/extraction_review/clients.py | 88 +++++ src/extraction_review/config.py | 362 +++++++++++++++++++ src/extraction_review/metadata_workflow.py | 36 ++ src/extraction_review/process_file.py | 401 +++++++++++++++++++++ src/extraction_review/schema.py | 57 +++ tests/test_placeholder.py | 2 + ui/.gitignore | 43 +++ ui/README.md | 7 + ui/components.json | 21 ++ ui/index.html | 12 + ui/package.json | 45 +++ ui/postcss.config.mjs | 7 + ui/src/App.tsx | 70 ++++ ui/src/index.css | 120 ++++++ ui/src/lib/MetadataProvider.tsx | 71 ++++ ui/src/lib/ToolbarContext.tsx | 41 +++ ui/src/lib/client.ts | 51 +++ ui/src/lib/config.ts | 2 + ui/src/lib/export.ts | 39 ++ ui/src/lib/useMetadata.ts | 41 +++ ui/src/lib/utils.ts | 6 + ui/src/main.tsx | 14 + ui/src/pages/HomePage.module.css | 23 ++ ui/src/pages/HomePage.tsx | 88 +++++ ui/src/pages/ItemPage.tsx | 189 ++++++++++ ui/src/vite-env.d.ts | 15 + ui/tsconfig.json | 31 ++ ui/vite.config.ts | 43 +++ {{ _copier_conf.answers_file }}.jinja | 2 + 35 files changed, 2057 insertions(+), 2 deletions(-) create mode 100644 .copier-answers.yml create mode 100644 .env.template create mode 100644 .gitignore create mode 100644 pyproject.toml create mode 100644 src/extraction_review/__init__.py create mode 100644 src/extraction_review/clients.py create mode 100644 src/extraction_review/config.py create mode 100644 src/extraction_review/metadata_workflow.py create mode 100644 src/extraction_review/process_file.py create mode 100644 src/extraction_review/schema.py create mode 100644 tests/test_placeholder.py create mode 100644 ui/.gitignore create mode 100644 ui/README.md create mode 100644 ui/components.json create mode 100644 
ui/index.html create mode 100644 ui/package.json create mode 100644 ui/postcss.config.mjs create mode 100644 ui/src/App.tsx create mode 100644 ui/src/index.css create mode 100644 ui/src/lib/MetadataProvider.tsx create mode 100644 ui/src/lib/ToolbarContext.tsx create mode 100644 ui/src/lib/client.ts create mode 100644 ui/src/lib/config.ts create mode 100644 ui/src/lib/export.ts create mode 100644 ui/src/lib/useMetadata.ts create mode 100644 ui/src/lib/utils.ts create mode 100644 ui/src/main.tsx create mode 100644 ui/src/pages/HomePage.module.css create mode 100644 ui/src/pages/HomePage.tsx create mode 100644 ui/src/pages/ItemPage.tsx create mode 100644 ui/src/vite-env.d.ts create mode 100644 ui/tsconfig.json create mode 100644 ui/vite.config.ts create mode 100644 {{ _copier_conf.answers_file }}.jinja diff --git a/.copier-answers.yml b/.copier-answers.yml new file mode 100644 index 0000000..484e56c --- /dev/null +++ b/.copier-answers.yml @@ -0,0 +1,3 @@ +# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY +_commit: v0.2.1 +_src_path: https://github.com/run-llama/template-workflow-data-extraction diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..f6b663a --- /dev/null +++ b/.env.template @@ -0,0 +1,2 @@ +# copy to .env and place any needed secrets here. LLAMA_CLOUD_API_KEY will be automatically set +# OPENAI_API_KEY=sk-xxx diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1db54c3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.env +__pycache__ +workflows.db + +.venv +package-lock.json +node_modules diff --git a/README.md b/README.md index 6afe5d7..647c647 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,67 @@ -# template-workflow-classify-extract-sec -Llama Index Workflow Template +# SEC Filing Data Extraction and Analysis + +A LlamaAgents application for extracting structured information from SEC filings using LlamaClassify and LlamaExtract. 
This application automatically classifies SEC documents (10-K, 10-Q, 8-K, or other) and extracts relevant financial and business information tailored to each filing type. + +## Features + +- **Intelligent Classification**: Uses LlamaClassify to automatically identify SEC filing types (10-K, 10-Q, 8-K, other) +- **Dynamic Schema Selection**: Applies specialized extraction schemas based on document type +- **Comprehensive Data Extraction**: Extracts filing-specific information: + - **10-K**: Annual reports with financial metrics, risk factors, business descriptions, executive information + - **10-Q**: Quarterly reports with period-over-period comparisons and updates + - **8-K**: Current reports with material event information and impact analysis + - **Other**: Catch-all for S-1, DEF 14A, 13F, and other filing types +- **Agent Data Storage**: Stores extracted data in LlamaCloud Agent Data for easy querying and analysis +- **UI Integration**: Web interface for reviewing and managing extracted data + +## Configuration + +All main configuration is in `src/extraction_review/config.py`: + +## How It Works + +The application uses a multi-step workflow powered by LlamaIndex: + +1. **File Upload**: User uploads an SEC filing document through the UI +2. **Download**: File is downloaded from LlamaCloud storage +3. **Classification**: LlamaClassify analyzes the first 5 pages to determine filing type (10-K, 10-Q, 8-K, or other) +4. **Schema Selection**: Appropriate extraction schema is selected based on classification +5. **Extraction**: LlamaExtract processes the document using the selected schema +6. **Storage**: Extracted data is stored in Agent Data with deduplication by file hash +7. 
**Review**: UI displays extracted data for review and editing + +### Workflows + +The application includes two main workflows: + +- **`process-file`** (`src/extraction_review/process_file.py`): Main workflow for processing SEC filings + - Steps: download → classify → extract → store + - Uses typed context to pass state between steps + - Streams progress updates to UI via `UIToast` events + +- **`metadata`** (`src/extraction_review/metadata_workflow.py`): Exposes configuration metadata to UI + - Returns JSON schema and collection name for dynamic UI generation + +## Linting and Type Checking + +Python and javascript packages contain helpful scripts to lint, format, and type check the code. + +To check and fix python code: + +```bash +uv run hatch run lint +uv run hatch run typecheck +uv run hatch run test +# run all at once +uv run hatch run all-fix +``` + +To check and fix javascript code, within the `ui` directory: + +```bash +pnpm run lint +pnpm run typecheck +pnpm run test +# run all at once +pnpm run all-fix +``` \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ca2e5b2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,51 @@ +[project] +name = "extraction-review" +version = "0.1.0" +description = "Extracts data" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "llama-cloud-services>=0.6.69", + "llama-index-workflows>=2.2.0,<3.0.0", + "python-dotenv>=1.1.0", + "jsonref>=1.1.0", + "click>=8.2.1,<8.3.0", + "httpx>=0.28.1", + "llama-index-core>=0.14.0", +] + +[dependency-groups] +dev = [ + "ruff>=0.11.10", + "typescript>=0.0.12", + "ty>=0.0.1a16", + "pytest>=8.4.1", + "hatch>=1.14.1", + "llamactl>=0.3.0" +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.envs.default.scripts] +"format" = "ruff format ." +"format-check" = "ruff format --check ." +"lint" = "ruff check --fix ." 
+"lint-check" = ["ruff check ."] +typecheck = "ty check src" +test = "pytest" +"all-check" = ["format-check", "lint-check", "test"] +"all-fix" = ["format", "lint", "test"] + +[tool.llamadeploy] +env_files = [".env"] +llama_cloud = true + +[tool.llamadeploy.workflows] +process-file = "extraction_review.process_file:workflow" +metadata = "extraction_review.metadata_workflow:workflow" + +[tool.llamadeploy.ui] +directory = "ui" + diff --git a/src/extraction_review/__init__.py b/src/extraction_review/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/extraction_review/clients.py b/src/extraction_review/clients.py new file mode 100644 index 0000000..71b7a64 --- /dev/null +++ b/src/extraction_review/clients.py @@ -0,0 +1,88 @@ +import functools +import os +from typing import Any +import httpx + +from llama_cloud_services import ExtractionAgent, LlamaExtract +from llama_cloud.core.api_error import ApiError +from llama_cloud_services.beta.agent_data import AsyncAgentDataClient, ExtractedData +from llama_cloud_services.beta.classifier.client import ClassifyClient +from llama_cloud.client import AsyncLlamaCloud +import logging + +from extraction_review.config import ( + EXTRACT_CONFIG, + EXTRACTED_DATA_COLLECTION, + EXTRACTION_AGENT_NAME, + USE_REMOTE_EXTRACTION_SCHEMA, + ExtractionSchema, +) + +logger = logging.getLogger(__name__) + +# deployed agents may infer their name from the deployment name +# Note: Make sure that an agent deployment with this name actually exists +# otherwise calls to get or set data will fail. 
You may need to adjust the `or ` +# name for development +agent_name = os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") +# required for all llama cloud calls +api_key = os.environ["LLAMA_CLOUD_API_KEY"] +# get this in case running against a different environment than production +base_url = os.getenv("LLAMA_CLOUD_BASE_URL") +project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID") + + +@functools.lru_cache(maxsize=None) +def get_extract_agent() -> ExtractionAgent: + extract_api = LlamaExtract( + api_key=api_key, base_url=base_url, project_id=project_id + ) + + try: + existing = extract_api.get_agent(EXTRACTION_AGENT_NAME) + if not USE_REMOTE_EXTRACTION_SCHEMA: + existing.data_schema = ExtractionSchema + existing.config = EXTRACT_CONFIG + return existing + except ApiError as e: + if e.status_code == 404: + if USE_REMOTE_EXTRACTION_SCHEMA: + logger.warning( + "Extraction agent does not exist, creating a new one from the local schema" + ) + return extract_api.create_agent( + name=EXTRACTION_AGENT_NAME, + data_schema=ExtractionSchema, + config=EXTRACT_CONFIG, + ) + else: + raise + + +@functools.lru_cache(maxsize=None) +def get_data_client() -> AsyncAgentDataClient: + return AsyncAgentDataClient( + deployment_name=agent_name, + collection=EXTRACTED_DATA_COLLECTION, + type=ExtractedData[Any], + client=get_llama_cloud_client(), + ) + + +@functools.lru_cache(maxsize=None) +def get_llama_cloud_client(): + return AsyncLlamaCloud( + base_url=base_url, + token=api_key, + httpx_client=httpx.AsyncClient( + timeout=60, headers={"Project-Id": project_id} if project_id else None + ), + ) + + +@functools.lru_cache(maxsize=None) +def get_classifier_client(): + return ClassifyClient( + client=get_llama_cloud_client(), + project_id=project_id, + ) diff --git a/src/extraction_review/config.py b/src/extraction_review/config.py new file mode 100644 index 0000000..bf2782d --- /dev/null +++ b/src/extraction_review/config.py @@ -0,0 +1,362 @@ +""" +For simple configuration of the extraction review 
application, just customize this file. + +If you need more control, feel free to edit the rest of the application +""" + +from __future__ import annotations +import os +from typing import Type + +from llama_cloud import ExtractConfig +from llama_cloud_services.extract import ExtractMode +from pydantic import BaseModel, Field + +# If you change this to true, the schema and extraction configuration will be fetched from the remote extraction agent +# rather than using the ExtractionSchema and configuration defined below. +USE_REMOTE_EXTRACTION_SCHEMA: bool = False +# The name of the extraction agent to use. Prefers the name of this deployment when deployed to isolate environments. +# Note that the application will create a new agent from the below ExtractionSchema if the extraction agent does not yet exist. +EXTRACTION_AGENT_NAME: str = ( + os.getenv("LLAMA_DEPLOY_DEPLOYMENT_NAME") or "extraction-review" +) +# The name of the collection to use for storing extracted data. This will be qualified by the agent name. +# When developing locally, this will use the _public collection (shared within the project), otherwise agent +# data is isolated to each agent +EXTRACTED_DATA_COLLECTION: str = "sec-filing-extraction" + + +# SEC Filing Classification Types +SEC_FILING_TYPES = ["10-K", "10-Q", "8-K", "other"] + + +# Base class for common fields across all SEC filings +class BaseSECFiling(BaseModel): + """Common fields present in all SEC filings""" + + company_name: str = Field( + description="The full legal name of the company filing the document" + ) + ticker_symbol: str | None = Field( + default=None, + description="The stock ticker symbol of the company. 
May not be present for all filings.", + ) + cik: str | None = Field( + default=None, + description="Central Index Key - the unique identifier assigned by the SEC to the company", + ) + filing_date: str | None = Field( + default=None, + description="The date the document was filed with the SEC (format: YYYY-MM-DD)", + ) + fiscal_year_end: str | None = Field( + default=None, + description="The fiscal year end date for the company (format: YYYY-MM-DD)", + ) + sic_code: str | None = Field( + default=None, + description="Standard Industrial Classification code for the company's industry", + ) + + +# Financial metrics that appear in multiple filing types +class FinancialMetrics(BaseModel): + """Key financial metrics extracted from statements""" + + total_revenue: str | None = Field( + default=None, + description="Total revenue/sales for the period. Include currency and amount (e.g., '$1.2B USD')", + ) + net_income: str | None = Field( + default=None, + description="Net income/profit for the period. Include currency and amount", + ) + total_assets: str | None = Field( + default=None, + description="Total assets as of the balance sheet date. Include currency and amount", + ) + total_liabilities: str | None = Field( + default=None, + description="Total liabilities as of the balance sheet date. Include currency and amount", + ) + stockholders_equity: str | None = Field( + default=None, + description="Total stockholders' equity. Include currency and amount", + ) + cash_and_equivalents: str | None = Field( + default=None, + description="Cash and cash equivalents. 
Include currency and amount", + ) + earnings_per_share: str | None = Field( + default=None, description="Earnings per share (EPS) for the period" + ) + + +# Risk factor for use in 10-K and 10-Q +class RiskFactor(BaseModel): + """Individual risk factor identified in the filing""" + + category: str = Field( + description="Category of risk (e.g., 'Market Risk', 'Operational Risk', 'Legal Risk')" + ) + description: str = Field(description="Brief description of the specific risk") + + +# 10-K: Annual Report +class Filing10K(BaseSECFiling): + """ + Form 10-K is an annual report required by the SEC that provides a comprehensive + summary of a company's financial performance. + """ + + document_type: str = Field(default="10-K", description="Should always be '10-K'") + fiscal_year: int | None = Field( + default=None, + description="The fiscal year covered by this annual report (e.g., 2023)", + ) + + # Business overview + business_description: str | None = Field( + default=None, + description="A 2-3 sentence summary of the company's business and operations", + ) + + # Financial data + financial_metrics: FinancialMetrics | None = Field( + default=None, description="Key financial metrics from the annual statements" + ) + + # Risk factors + risk_factors: list[RiskFactor] | None = Field( + default=None, + description="List of material risk factors disclosed in the filing. 
Extract 3-5 most significant risks.", + ) + + # Management discussion + management_discussion_summary: str | None = Field( + default=None, + description="2-3 sentence summary of Management's Discussion and Analysis (MD&A) section", + ) + + # Legal proceedings + legal_proceedings: list[str] | None = Field( + default=None, + description="List of significant legal proceedings or litigation mentioned", + ) + + # Executive officers + executive_officers: list[str] | None = Field( + default=None, + description="Names and titles of key executive officers (CEO, CFO, etc.)", + ) + + # Auditor information + auditor_name: str | None = Field( + default=None, + description="Name of the independent registered public accounting firm", + ) + + # Key insights + key_highlights: list[str] | None = Field( + default=None, + description="3-5 key highlights or notable items from the annual report", + ) + + +# 10-Q: Quarterly Report +class Filing10Q(BaseSECFiling): + """ + Form 10-Q is a quarterly report that provides a continuing view of a company's + financial position during the year. 
+ """ + + document_type: str = Field(default="10-Q", description="Should always be '10-Q'") + fiscal_quarter: str | None = Field( + default=None, + description="The fiscal quarter covered (e.g., 'Q1 2024', 'Q2 2023')", + ) + fiscal_year: int | None = Field( + default=None, description="The fiscal year for this quarter (e.g., 2024)" + ) + period_end_date: str | None = Field( + default=None, + description="The end date of the quarterly period (format: YYYY-MM-DD)", + ) + + # Financial data + financial_metrics: FinancialMetrics | None = Field( + default=None, description="Key financial metrics from the quarterly statements" + ) + + # Comparison to prior periods + year_over_year_revenue_change: str | None = Field( + default=None, + description="Year-over-year revenue change percentage or description (e.g., 'up 15%')", + ) + quarter_over_quarter_revenue_change: str | None = Field( + default=None, + description="Quarter-over-quarter revenue change percentage or description", + ) + + # Management discussion + management_discussion_summary: str | None = Field( + default=None, + description="2-3 sentence summary of Management's Discussion and Analysis for the quarter", + ) + + # Risk factors + material_changes_to_risks: str | None = Field( + default=None, + description="Summary of any material changes to risk factors since the last 10-K", + ) + + # Legal updates + legal_proceedings_updates: list[str] | None = Field( + default=None, + description="Updates to legal proceedings or new litigation since last filing", + ) + + # Key insights + key_highlights: list[str] | None = Field( + default=None, + description="3-5 key highlights or notable items from the quarterly report", + ) + + +# 8-K: Current Report +class Filing8K(BaseSECFiling): + """ + Form 8-K is a current report used to notify investors of significant events + that shareholders should know about. 
+ """ + + document_type: str = Field(default="8-K", description="Should always be '8-K'") + + # Event information + event_date: str | None = Field( + default=None, + description="The date of the event being reported (format: YYYY-MM-DD)", + ) + event_type: str | None = Field( + default=None, + description="Type of event (e.g., 'Merger/Acquisition', 'Leadership Change', 'Earnings Release', 'Material Agreement')", + ) + item_numbers: list[str] | None = Field( + default=None, + description="Item numbers from the 8-K form (e.g., ['1.01', '5.02']) indicating which sections are included", + ) + + # Event description + event_summary: str = Field( + description="2-4 sentence summary describing the material event being reported" + ) + event_details: str | None = Field( + default=None, + description="More detailed description of the event and its implications", + ) + + # Financial impact + estimated_financial_impact: str | None = Field( + default=None, + description="Estimated financial impact of the event, if disclosed", + ) + + # Related parties + related_parties: list[str] | None = Field( + default=None, + description="Names of other companies, individuals, or entities involved in the event", + ) + + # Exhibits filed + material_exhibits: list[str] | None = Field( + default=None, + description="Description of significant exhibits filed with the 8-K (e.g., 'Press Release', 'Material Agreement')", + ) + + # Forward-looking statements + contains_forward_looking_statements: bool | None = Field( + default=None, + description="Whether the filing contains forward-looking statements", + ) + + # Key takeaways + investment_implications: str | None = Field( + default=None, + description="1-2 sentence assessment of potential implications for investors", + ) + + +# Other filings catch-all +class FilingOther(BaseSECFiling): + """ + Catch-all schema for other SEC filing types (e.g., S-1, DEF 14A, 13F, etc.) 
+ """ + + document_type: str = Field( + description="The type of SEC filing (e.g., 'S-1', 'DEF 14A', '13F', 'SC 13D')" + ) + + filing_purpose: str | None = Field( + default=None, + description="The purpose of this filing type (e.g., 'IPO Registration', 'Proxy Statement', 'Insider Holdings')", + ) + + summary: str = Field( + description="3-4 sentence summary of the filing's key content and purpose" + ) + + key_information: list[str] | None = Field( + default=None, + description="List of 3-7 key pieces of information from the filing", + ) + + financial_data: FinancialMetrics | None = Field( + default=None, description="Any relevant financial metrics present in the filing" + ) + + material_events: list[str] | None = Field( + default=None, + description="List of any material events or transactions described", + ) + + parties_involved: list[str] | None = Field( + default=None, + description="Other parties mentioned (companies, executives, investors, etc.)", + ) + + investment_relevance: str | None = Field( + default=None, + description="Brief note on why this filing might be relevant for investment analysis", + ) + + +# Default schema for backward compatibility - now uses 10-K as the base +class ExtractionSchema(Filing10K): + """Default extraction schema - uses 10-K structure for backward compatibility""" + + pass + + +# Mapping of filing types to their schemas +FILING_SCHEMAS = { + "10-K": Filing10K, + "10-Q": Filing10Q, + "8-K": Filing8K, + "other": FilingOther, +} + + +# This is only used if USE_REMOTE_EXTRACTION_SCHEMA is False. +EXTRACT_CONFIG = ExtractConfig( + extraction_mode=ExtractMode.PREMIUM, + system_prompt=None, + # advanced. Only compatible with Premium mode. 
+ use_reasoning=False, + cite_sources=False, + confidence_scores=True, +) + + +SCHEMA: Type[BaseModel] | None = ( + None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema +) diff --git a/src/extraction_review/metadata_workflow.py b/src/extraction_review/metadata_workflow.py new file mode 100644 index 0000000..643b6f5 --- /dev/null +++ b/src/extraction_review/metadata_workflow.py @@ -0,0 +1,36 @@ +from typing import Any +from workflows import Workflow, step +from workflows.events import StartEvent, StopEvent + +import jsonref + +from .config import EXTRACTED_DATA_COLLECTION, FILING_SCHEMAS + + +class MetadataResponse(StopEvent): + schemas: dict[str, dict[str, Any]] + extracted_data_collection: str + + +class MetadataWorkflow(Workflow): + """ + Simple single step workflow to expose configuration to the UI, such as all JSON schemas and collection name. + """ + + @step + async def get_metadata(self, _: StartEvent) -> MetadataResponse: + # Convert all filing schemas to JSON schemas + schemas = {} + for filing_type, schema_class in FILING_SCHEMAS.items(): + json_schema = schema_class.model_json_schema() + # Resolve any $ref references + json_schema = jsonref.replace_refs(json_schema, proxies=False) + schemas[filing_type] = json_schema + + return MetadataResponse( + schemas=schemas, + extracted_data_collection=EXTRACTED_DATA_COLLECTION, + ) + + +workflow = MetadataWorkflow(timeout=None) diff --git a/src/extraction_review/process_file.py b/src/extraction_review/process_file.py new file mode 100644 index 0000000..03b5439 --- /dev/null +++ b/src/extraction_review/process_file.py @@ -0,0 +1,401 @@ +import asyncio +import hashlib +import logging +import os +from pathlib import Path +import tempfile +from typing import Any, Literal + +import httpx +from llama_cloud import ClassificationResult, ExtractRun +from llama_cloud.types import ClassifierRule, ClassifyParsingConfiguration +from llama_cloud_services.extract import SourceText +from llama_cloud_services.beta.agent_data 
import ExtractedData, InvalidExtractionData +from pydantic import BaseModel +from workflows import Context, Workflow, step +from workflows.events import Event, StartEvent, StopEvent + +from .clients import ( + get_classifier_client, + get_llama_cloud_client, + get_data_client, + get_extract_agent, +) +from .config import FILING_SCHEMAS + +logger = logging.getLogger(__name__) + + +class FileEvent(StartEvent): + file_id: str + + +class DownloadFileEvent(Event): + pass + + +class FileDownloadedEvent(Event): + pass + + +class ClassifyFileEvent(Event): + pass + + +class FileClassifiedEvent(Event): + filing_type: str + confidence: float | None = None + reasoning: str | None = None + + +class UIToast(Event): + level: Literal["info", "warning", "error"] + message: str + + +class ExtractedEvent(Event): + data: ExtractedData + + +class ExtractedInvalidEvent(Event): + data: ExtractedData[dict[str, Any]] + + +class ExtractionState(BaseModel): + file_id: str | None = None + file_path: str | None = None + filename: str | None = None + filing_type: str | None = None + classification_confidence: float | None = None + classification_reasoning: str | None = None + + +class ProcessFileWorkflow(Workflow): + """ + Given a file ID, this workflow will process a single file through the custom extraction logic. 
+ """ + + @step() + async def run_file(self, event: FileEvent, ctx: Context) -> DownloadFileEvent: + logger.info(f"Running file {event.file_id}") + async with ctx.store.edit_state() as state: + state.file_id = event.file_id + return DownloadFileEvent() + + @step() + async def download_file( + self, event: DownloadFileEvent, ctx: Context[ExtractionState] + ) -> ClassifyFileEvent: + """Download the file reference from the cloud storage""" + state = await ctx.store.get_state() + if state.file_id is None: + raise ValueError("File ID is not set") + try: + file_metadata = await get_llama_cloud_client().files.get_file( + id=state.file_id + ) + file_url = await get_llama_cloud_client().files.read_file_content( + state.file_id + ) + + temp_dir = tempfile.gettempdir() + filename = file_metadata.name + file_path = os.path.join(temp_dir, filename) + client = httpx.AsyncClient() + # Report progress to the UI + logger.info(f"Downloading file {file_url.url} to {file_path}") + + async with client.stream("GET", file_url.url) as response: + with open(file_path, "wb") as f: + async for chunk in response.aiter_bytes(): + f.write(chunk) + logger.info(f"Downloaded file {file_url.url} to {file_path}") + async with ctx.store.edit_state() as state: + state.file_path = file_path + state.filename = filename + return ClassifyFileEvent() + + except Exception as e: + logger.error(f"Error downloading file {state.file_id}: {e}", exc_info=True) + ctx.write_event_to_stream( + UIToast( + level="error", + message=f"Error downloading file {state.file_id}: {e}", + ) + ) + raise e + + @step() + async def classify_file( + self, event: ClassifyFileEvent, ctx: Context[ExtractionState] + ) -> FileClassifiedEvent: + """Classify the SEC filing document type""" + state = await ctx.store.get_state() + if state.file_path is None or state.filename is None: + raise ValueError("File path or filename is not set") + + try: + logger.info(f"Classifying file {state.filename}") + ctx.write_event_to_stream( + 
UIToast(level="info", message=f"Classifying file {state.filename}") + ) + + # Initialize the classifier + + classifier = get_classifier_client() + + # Define classification rules for SEC filing types + rules = [ + ClassifierRule( + type="10-K", + description=( + "Form 10-K is an annual report filed by public companies with the SEC. " + "It provides a comprehensive summary of a company's financial performance for the year, " + "including audited financial statements, management's discussion and analysis (MD&A), " + "risk factors, business description, and executive compensation. " + "Look for: 'Form 10-K', 'Annual Report', fiscal year references, audited financials." + ), + ), + ClassifierRule( + type="10-Q", + description=( + "Form 10-Q is a quarterly report filed by public companies with the SEC. " + "It provides unaudited financial statements and management discussion for a specific quarter. " + "Contains quarterly financial data, updates on business operations, and material changes. " + "Look for: 'Form 10-Q', 'Quarterly Report', quarter references (Q1, Q2, Q3), unaudited statements." + ), + ), + ClassifierRule( + type="8-K", + description=( + "Form 8-K is a current report filed to announce material events or corporate changes. " + "Used to notify investors of significant events like mergers, acquisitions, leadership changes, " + "earnings releases, or other material corporate events that shareholders should know about. " + "Look for: 'Form 8-K', 'Current Report', Item numbers (e.g., Item 1.01, Item 5.02), event dates, " + "specific triggering events." + ), + ), + ClassifierRule( + type="other", + description=( + "Any other SEC filing type not covered by 10-K, 10-Q, or 8-K. " + "This includes forms such as S-1 (IPO registration), DEF 14A (proxy statement), " + "13F (institutional holdings), SC 13D (beneficial ownership), and other SEC forms." 
+ ), + ), + ] + + # Configure parsing - only parse first few pages for classification + parsing_config = ClassifyParsingConfiguration( + max_pages=5, # Only parse first 5 pages for faster classification + ) + + # Classify the file + results = await classifier.aclassify_file_paths( + rules=rules, + file_input_paths=[state.file_path], + parsing_configuration=parsing_config, + ) + + # Extract classification result + if results.items and len(results.items) > 0: + item = results.items[0] + result: ClassificationResult | None = item.result + if result: + filing_type = result.type + confidence = result.confidence + reasoning = result.reasoning + + logger.info( + f"Classified {state.filename} as {filing_type} " + f"(confidence: {confidence}, reasoning: {reasoning})" + ) + ctx.write_event_to_stream( + UIToast( + level="info", + message=f"Classified as {filing_type} SEC filing", + ) + ) + + async with ctx.store.edit_state() as state: + state.filing_type = filing_type + state.classification_confidence = confidence + state.classification_reasoning = reasoning + + return FileClassifiedEvent( + filing_type=filing_type, + confidence=confidence, + reasoning=reasoning, + ) + else: + # Classification failed, default to "other" + logger.warning( + f"Classification failed for {state.filename}, defaulting to 'other'" + ) + ctx.write_event_to_stream( + UIToast( + level="warning", + message="Classification uncertain, using default schema", + ) + ) + async with ctx.store.edit_state() as state: + state.filing_type = "other" + return FileClassifiedEvent(filing_type="other") + else: + # No results, default to "other" + logger.warning(f"No classification results for {state.filename}") + async with ctx.store.edit_state() as state: + state.filing_type = "other" + return FileClassifiedEvent(filing_type="other") + + except Exception as e: + logger.error(f"Error classifying file {state.filename}: {e}", exc_info=True) + ctx.write_event_to_stream( + UIToast( + level="warning", + 
message=f"Classification failed, using default schema: {e}", + ) + ) + # On error, default to "other" and continue + async with ctx.store.edit_state() as state: + state.filing_type = "other" + return FileClassifiedEvent(filing_type="other") + + @step() + async def process_file( + self, event: FileClassifiedEvent, ctx: Context[ExtractionState] + ) -> ExtractedEvent | ExtractedInvalidEvent: + """Runs the extraction against the file""" + state = await ctx.store.get_state() + if state.file_path is None or state.filename is None: + raise ValueError("File path or filename is not set") + try: + # Get the appropriate schema based on classification + filing_type = (state.filing_type or "other").upper() + schema = FILING_SCHEMAS.get(filing_type, FILING_SCHEMAS["other"]) + + logger.info(f"Using schema for filing type: {filing_type}") + ctx.write_event_to_stream( + UIToast( + level="info", + message=f"Extracting data using {filing_type} schema", + ) + ) + + agent = get_extract_agent() + # Update the agent's data schema for this specific filing type + agent.data_schema = schema + # track the content of the file, so as to be able to de-duplicate + file_content = Path(state.file_path).read_bytes() + file_hash = hashlib.sha256(file_content).hexdigest() + source_text = SourceText( + file=state.file_path, + filename=state.filename, + ) + logger.info(f"Extracting data from file {state.filename}") + ctx.write_event_to_stream( + UIToast( + level="info", message=f"Extracting data from file {state.filename}" + ) + ) + extracted_result: ExtractRun = await agent.aextract(source_text) + try: + logger.info(f"Extracted data: {extracted_result}") + data = ExtractedData.from_extraction_result( + result=extracted_result, + schema=schema, + file_hash=file_hash, + ) + # Add classification information to the extracted data + if data.metadata is None: + data.metadata = {} + data.metadata["classification"] = filing_type + data.metadata["classification_confidence"] = ( + 
state.classification_confidence + ) + data.metadata["classification_reasoning"] = ( + state.classification_reasoning + ) + return ExtractedEvent(data=data) + except InvalidExtractionData as e: + logger.error(f"Error validating extracted data: {e}", exc_info=True) + return ExtractedInvalidEvent(data=e.invalid_item) + except Exception as e: + logger.error( + f"Error extracting data from file {state.filename}: {e}", + exc_info=True, + ) + ctx.write_event_to_stream( + UIToast( + level="error", + message=f"Error extracting data from file {state.filename}: {e}", + ) + ) + raise e + + @step() + async def record_extracted_data( + self, event: ExtractedEvent | ExtractedInvalidEvent, ctx: Context + ) -> StopEvent: + """Records the extracted data to the agent data API""" + try: + logger.info(f"Recorded extracted data for file {event.data.file_name}") + ctx.write_event_to_stream( + UIToast( + level="info", + message=f"Recorded extracted data for file {event.data.file_name}", + ) + ) + # remove past data when reprocessing the same file + if event.data.file_hash: + existing_data = await get_data_client().untyped_search( + filter={ + "file_hash": { + "eq": event.data.file_hash, + }, + }, + ) + if existing_data.items: + logger.info( + f"Removing past data for file {event.data.file_name} with hash {event.data.file_hash}" + ) + await asyncio.gather( + *[ + get_data_client().delete_item(item.id) + for item in existing_data.items + ] + ) + # finally, save the new data + item_id = await get_data_client().create_item(event.data) + return StopEvent( + result=item_id.id, + ) + except Exception as e: + logger.error( + f"Error recording extracted data for file {event.data.file_name}: {e}", + exc_info=True, + ) + ctx.write_event_to_stream( + UIToast( + level="error", + message=f"Error recording extracted data for file {event.data.file_name}: {e}", + ) + ) + raise e + + +workflow = ProcessFileWorkflow(timeout=None) + +if __name__ == "__main__": + from dotenv import load_dotenv + + 
load_dotenv() + logging.basicConfig(level=logging.INFO) + + async def main(): + file = await get_llama_cloud_client().files.upload_file( + upload_file=Path("test.pdf").open("rb") + ) + await workflow.run(start_event=FileEvent(file_id=file.id)) + + asyncio.run(main()) diff --git a/src/extraction_review/schema.py b/src/extraction_review/schema.py new file mode 100644 index 0000000..c3b41fd --- /dev/null +++ b/src/extraction_review/schema.py @@ -0,0 +1,57 @@ +""" +Selects a locally defined schema, or queries the remote extraction agent for the schema. +""" + +import asyncio +import jsonref +from .clients import get_extract_agent +from .config import USE_REMOTE_EXTRACTION_SCHEMA, ExtractionSchema +from typing import Any, Type +from pydantic import BaseModel +from pydantic import create_model, Field + + +SCHEMA: Type[BaseModel] | None = ( + None if USE_REMOTE_EXTRACTION_SCHEMA else ExtractionSchema +) + + +_schema_lock = asyncio.Lock() + + +async def get_extraction_schema() -> Type[BaseModel]: + global SCHEMA + if SCHEMA is not None: + return SCHEMA + async with _schema_lock: + if SCHEMA is not None: + return SCHEMA + agent = get_extract_agent() + SCHEMA = model_from_schema(agent.data_schema) + return SCHEMA + + +async def get_extraction_schema_json() -> dict[str, Any]: + json_schema = (await get_extraction_schema()).model_json_schema() + json_schema = jsonref.replace_refs(json_schema, proxies=False) + return json_schema + + +def model_from_schema(schema: dict[str, Any]) -> Type[BaseModel]: + """ + Converts a JSON schema back to a Pydantic model. + """ + typemap = { + "string": str, + "integer": int, + "number": float, + "boolean": bool, + "array": list, + "object": dict, + } + fields = {} + for prop, meta in schema.get("properties", {}).items(): + py_type = typemap.get(meta.get("type"), Any) + default = ... 
if prop in schema.get("required", []) else None + fields[prop] = (py_type, Field(default, description=meta.get("description"))) + return create_model(schema.get("title", "DynamicModel"), **fields) diff --git a/tests/test_placeholder.py b/tests/test_placeholder.py new file mode 100644 index 0000000..201975f --- /dev/null +++ b/tests/test_placeholder.py @@ -0,0 +1,2 @@ +def test_placeholder(): + pass diff --git a/ui/.gitignore b/ui/.gitignore new file mode 100644 index 0000000..97856e8 --- /dev/null +++ b/ui/.gitignore @@ -0,0 +1,43 @@ +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. + +# dependencies +/node_modules +/.pnp +.pnp.* +.yarn/* +!.yarn/patches +!.yarn/plugins +!.yarn/releases +!.yarn/versions + +# testing +/coverage + +# next.js +/.next/ +/out/ +/dist/ + +# production +/build + +# misc +.DS_Store +*.pem + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# env files (can opt-in for committing if needed) +.env* + +# vercel +.vercel + +# typescript +*.tsbuildinfo +next-env.d.ts + diff --git a/ui/README.md b/ui/README.md new file mode 100644 index 0000000..45b37fc --- /dev/null +++ b/ui/README.md @@ -0,0 +1,7 @@ +# Data Extraction UI + +This is a simple Vite + React template that builds on the `@llamaindex/ui` component library +for displaying tables of extracted data. 
+ +Ideally, run this with `llamactl` from the parent directory (see [README.md](../README.md)). +You can also run it standalone with `npm run dev`, but workflow integrations will not work. \ No newline at end of file diff --git a/ui/components.json b/ui/components.json new file mode 100644 index 0000000..17bc984 --- /dev/null +++ b/ui/components.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://ui.shadcn.com/schema.json", + "style": "new-york", + "rsc": true, + "tsx": true, + "tailwind": { + "config": "", + "css": "src/index.css", + "baseColor": "zinc", + "cssVariables": true, + "prefix": "" + }, + "aliases": { + "components": "@/components", + "utils": "@/lib/utils", + "ui": "@/components/ui", + "lib": "@/lib", + "hooks": "@/hooks" + }, + "iconLibrary": "lucide" +} \ No newline at end of file diff --git a/ui/index.html b/ui/index.html new file mode 100644 index 0000000..b55a8d0 --- /dev/null +++ b/ui/index.html @@ -0,0 +1,12 @@ + + + + + + Review + + +
+ + + \ No newline at end of file diff --git a/ui/package.json b/ui/package.json new file mode 100644 index 0000000..9b56687 --- /dev/null +++ b/ui/package.json @@ -0,0 +1,45 @@ +{ + "name": "extraction-review-ui", + "version": "0.1.0", + "private": true, + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "preview": "vite preview", + "lint": "tsc --noEmit", + "format": "prettier --write src", + "format-check": "prettier --check src", + "all-check": "pnpm i && pnpm run lint && pnpm run format-check && pnpm run build", + "all-fix": "pnpm i && pnpm run lint && pnpm run format && pnpm run build" + }, + "dependencies": { + "@babel/runtime": "^7.27.6", + "@lezer/highlight": "^1.2.1", + "@llamaindex/ui": "^2.1.2", + "@radix-ui/themes": "^3.2.1", + "class-variance-authority": "^0.7.1", + "clsx": "^2.1.1", + "llama-cloud-services": "^0.3.4", + "lucide-react": "^0.514.0", + "react": "^18.3.0", + "react-dom": "^18.3.0", + "react-router-dom": "^6.30.0", + "sonner": "^2.0.5", + "tw-animate-css": "^1.3.5" + }, + "devDependencies": { + "@tailwindcss/postcss": "^4.1.10", + "@types/node": "^20", + "@types/react": "^19", + "@types/react-dom": "^19", + "@vitejs/plugin-react": "^4.3.4", + "postcss": "^8.5.5", + "prettier": "^3.6.2", + "tailwind-merge": "^3.3.1", + "tailwindcss": "^4.1.8", + "typescript": "^5", + "vite": "^6.0.5" + }, + "packageManager": "pnpm@10.11.1+sha512.e519b9f7639869dc8d5c3c5dfef73b3f091094b0a006d7317353c72b124e80e1afd429732e28705ad6bfa1ee879c1fce46c128ccebd3192101f43dd67c667912" +} diff --git a/ui/postcss.config.mjs b/ui/postcss.config.mjs new file mode 100644 index 0000000..4da3340 --- /dev/null +++ b/ui/postcss.config.mjs @@ -0,0 +1,7 @@ +const config = { + plugins: { + "@tailwindcss/postcss": {}, + }, +}; + +export default config; \ No newline at end of file diff --git a/ui/src/App.tsx b/ui/src/App.tsx new file mode 100644 index 0000000..949efb4 --- /dev/null +++ b/ui/src/App.tsx @@ -0,0 +1,70 @@ +import React from "react"; 
+import { Routes, Route } from "react-router-dom"; +import { Theme } from "@radix-ui/themes"; +import { + Breadcrumb, + BreadcrumbItem, + BreadcrumbList, + BreadcrumbSeparator, +} from "@llamaindex/ui"; +import { Link } from "react-router-dom"; +import { Toaster } from "@llamaindex/ui"; +import { useToolbar, ToolbarProvider } from "@/lib/ToolbarContext"; +import { MetadataProvider } from "@/lib/MetadataProvider"; + +// Import pages +import HomePage from "./pages/HomePage"; +import ItemPage from "./pages/ItemPage"; + +export default function App() { + return ( + + + +
+ +
+ + } /> + } /> + +
+
+ +
+
+
+ ); +} + +const Toolbar = () => { + const { buttons, breadcrumbs } = useToolbar(); + + return ( +
+ + + {breadcrumbs.map((item, index) => ( + + {index > 0 && } + + {item.href && !item.isCurrentPage ? ( + + {item.label} + + ) : ( + + {item.label} + + )} + + + ))} + + + {buttons} +
+ ); +}; diff --git a/ui/src/index.css b/ui/src/index.css new file mode 100644 index 0000000..d8e401f --- /dev/null +++ b/ui/src/index.css @@ -0,0 +1,120 @@ +@import "tailwindcss"; +@import "tw-animate-css"; + +@custom-variant dark (&:is(.dark *)); + +@theme inline { + --radius-sm: calc(var(--radius) - 4px); + --radius-md: calc(var(--radius) - 2px); + --radius-lg: var(--radius); + --radius-xl: calc(var(--radius) + 4px); + --color-background: var(--background); + --color-foreground: var(--foreground); + --color-card: var(--card); + --color-card-foreground: var(--card-foreground); + --color-popover: var(--popover); + --color-popover-foreground: var(--popover-foreground); + --color-primary: var(--primary); + --color-primary-foreground: var(--primary-foreground); + --color-secondary: var(--secondary); + --color-secondary-foreground: var(--secondary-foreground); + --color-muted: var(--muted); + --color-muted-foreground: var(--muted-foreground); + --color-accent: var(--accent); + --color-accent-foreground: var(--accent-foreground); + --color-destructive: var(--destructive); + --color-border: var(--border); + --color-input: var(--input); + --color-ring: var(--ring); + --color-chart-1: var(--chart-1); + --color-chart-2: var(--chart-2); + --color-chart-3: var(--chart-3); + --color-chart-4: var(--chart-4); + --color-chart-5: var(--chart-5); + --color-sidebar: var(--sidebar); + --color-sidebar-foreground: var(--sidebar-foreground); + --color-sidebar-primary: var(--sidebar-primary); + --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); + --color-sidebar-accent: var(--sidebar-accent); + --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); + --color-sidebar-border: var(--sidebar-border); + --color-sidebar-ring: var(--sidebar-ring); +} + +:root { + --radius: 0.625rem; + --card: oklch(1 0 0); + --card-foreground: oklch(0.141 0.005 285.823); + --popover: oklch(1 0 0); + --popover-foreground: oklch(0.141 0.005 285.823); + --primary: oklch(0.21 
0.006 285.885); + --primary-foreground: oklch(0.985 0 0); + --secondary: oklch(0.967 0.001 286.375); + --secondary-foreground: oklch(0.21 0.006 285.885); + --muted: oklch(0.967 0.001 286.375); + --muted-foreground: oklch(0.552 0.016 285.938); + --accent: oklch(0.967 0.001 286.375); + --accent-foreground: oklch(0.21 0.006 285.885); + --destructive: oklch(0.577 0.245 27.325); + --border: oklch(0.92 0.004 286.32); + --input: oklch(0.92 0.004 286.32); + --ring: oklch(0.705 0.015 286.067); + --chart-1: oklch(0.646 0.222 41.116); + --chart-2: oklch(0.6 0.118 184.704); + --chart-3: oklch(0.398 0.07 227.392); + --chart-4: oklch(0.828 0.189 84.429); + --chart-5: oklch(0.769 0.188 70.08); + --sidebar: oklch(0.985 0 0); + --sidebar-foreground: oklch(0.141 0.005 285.823); + --sidebar-primary: oklch(0.21 0.006 285.885); + --sidebar-primary-foreground: oklch(0.985 0 0); + --sidebar-accent: oklch(0.967 0.001 286.375); + --sidebar-accent-foreground: oklch(0.21 0.006 285.885); + --sidebar-border: oklch(0.92 0.004 286.32); + --sidebar-ring: oklch(0.705 0.015 286.067); + --background: oklch(1 0 0); + --foreground: oklch(0.141 0.005 285.823); +} + +.dark { + --background: oklch(0.141 0.005 285.823); + --foreground: oklch(0.985 0 0); + --card: oklch(0.21 0.006 285.885); + --card-foreground: oklch(0.985 0 0); + --popover: oklch(0.21 0.006 285.885); + --popover-foreground: oklch(0.985 0 0); + --primary: oklch(0.92 0.004 286.32); + --primary-foreground: oklch(0.21 0.006 285.885); + --secondary: oklch(0.274 0.006 286.033); + --secondary-foreground: oklch(0.985 0 0); + --muted: oklch(0.274 0.006 286.033); + --muted-foreground: oklch(0.705 0.015 286.067); + --accent: oklch(0.274 0.006 286.033); + --accent-foreground: oklch(0.985 0 0); + --destructive: oklch(0.704 0.191 22.216); + --border: oklch(1 0 0 / 10%); + --input: oklch(1 0 0 / 15%); + --ring: oklch(0.552 0.016 285.938); + --chart-1: oklch(0.488 0.243 264.376); + --chart-2: oklch(0.696 0.17 162.48); + --chart-3: oklch(0.769 0.188 
70.08); + --chart-4: oklch(0.627 0.265 303.9); + --chart-5: oklch(0.645 0.246 16.439); + --sidebar: oklch(0.21 0.006 285.885); + --sidebar-foreground: oklch(0.985 0 0); + --sidebar-primary: oklch(0.488 0.243 264.376); + --sidebar-primary-foreground: oklch(0.985 0 0); + --sidebar-accent: oklch(0.274 0.006 286.033); + --sidebar-accent-foreground: oklch(0.985 0 0); + --sidebar-border: oklch(1 0 0 / 10%); + --sidebar-ring: oklch(0.552 0.016 285.938); +} + +@layer base { + * { + @apply border-border outline-ring/50; + } + body { + @apply bg-background text-foreground; + } +} diff --git a/ui/src/lib/MetadataProvider.tsx b/ui/src/lib/MetadataProvider.tsx new file mode 100644 index 0000000..3cdee00 --- /dev/null +++ b/ui/src/lib/MetadataProvider.tsx @@ -0,0 +1,71 @@ +import { createContext, useContext, ReactNode, useMemo } from "react"; +import { ApiProvider, ApiClients } from "@llamaindex/ui"; +import { useMetadata, Metadata } from "./useMetadata"; +import { createBaseWorkflowClient, createClients } from "./client"; +import { Clock, XCircle } from "lucide-react"; + +interface MetadataContextValue { + metadata: Metadata; + clients: ApiClients; +} + +const MetadataContext = createContext(null); + +export function MetadataProvider({ children }: { children: ReactNode }) { + const baseClients: ApiClients = useMemo(() => { + return { + workflowsClient: createBaseWorkflowClient(), + } as ApiClients; + }, []); + return ( + + {children} + + ); +} + +function InnerMetadataProvider({ children }: { children: ReactNode }) { + const { metadata, loading, error } = useMetadata(); + const clients = useMemo( + () => (metadata ? createClients(metadata) : undefined), + [metadata], + ); + + if (loading) { + return ( +
+
+ +
Loading configuration...
+
+
+ ); + } + + if (error || !metadata || !clients) { + return ( +
+
+ +
+ Error loading configuration: {error || "Unknown error"} +
+
+
+ ); + } + + return ( + + {children} + + ); +} + +export function useMetadataContext() { + const context = useContext(MetadataContext); + if (!context) { + throw new Error("useMetadataContext must be used within MetadataProvider"); + } + return context; +} diff --git a/ui/src/lib/ToolbarContext.tsx b/ui/src/lib/ToolbarContext.tsx new file mode 100644 index 0000000..6fa1067 --- /dev/null +++ b/ui/src/lib/ToolbarContext.tsx @@ -0,0 +1,41 @@ +import React from "react"; +import { APP_TITLE } from "./config"; + +export interface BreadcrumbItem { + label: string; + href?: string; + isCurrentPage?: boolean; +} + +export const ToolbarCtx = React.createContext<{ + buttons: React.ReactNode[]; + setButtons: (fn: (prev: React.ReactNode[]) => React.ReactNode[]) => void; + breadcrumbs: BreadcrumbItem[]; + setBreadcrumbs: (items: BreadcrumbItem[]) => void; +}>({ + buttons: [], + setButtons: () => {}, + breadcrumbs: [], + setBreadcrumbs: () => {}, +}); + +export const ToolbarProvider = ({ + children, +}: { + children: React.ReactNode; +}) => { + const [buttons, setButtons] = React.useState([]); + const [breadcrumbs, setBreadcrumbs] = React.useState([ + { label: APP_TITLE, href: "/" }, + ]); + + return ( + + {children} + + ); +}; + +export const useToolbar = () => React.useContext(ToolbarCtx); diff --git a/ui/src/lib/client.ts b/ui/src/lib/client.ts new file mode 100644 index 0000000..566b956 --- /dev/null +++ b/ui/src/lib/client.ts @@ -0,0 +1,51 @@ +import { ExtractedData } from "llama-cloud-services/beta/agent"; +import { + ApiClients, + createWorkflowsClient, + createWorkflowsConfig, + createCloudAgentClient, + cloudApiClient, +} from "@llamaindex/ui"; +import { AGENT_NAME } from "./config"; +import type { Metadata } from "./useMetadata"; + +const platformToken = import.meta.env.VITE_LLAMA_CLOUD_API_KEY; +const apiBaseUrl = import.meta.env.VITE_LLAMA_CLOUD_BASE_URL; +const projectId = import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID; + +// Configure the platform client 
+cloudApiClient.setConfig({ + ...(apiBaseUrl && { baseUrl: apiBaseUrl }), + headers: { + // optionally use a backend API token scoped to a project. For local development, + ...(platformToken && { authorization: `Bearer ${platformToken}` }), + // This header is required for requests to correctly scope to the agent's project + // when authenticating with a user cookie + ...(projectId && { "Project-Id": projectId }), + }, +}); + +export function createBaseWorkflowClient(): ReturnType< + typeof createWorkflowsClient +> { + return createWorkflowsClient( + createWorkflowsConfig({ + baseUrl: `/deployments/${AGENT_NAME}/`, + }), + ); +} + +export function createClients(metadata: Metadata): ApiClients { + const workflowsClient = createBaseWorkflowClient(); + const agentClient = createCloudAgentClient>({ + client: cloudApiClient, + windowUrl: typeof window !== "undefined" ? window.location.href : undefined, + collection: metadata.extracted_data_collection, + }); + + return { + workflowsClient, + cloudApiClient, + agentDataClient: agentClient, + } as ApiClients; +} diff --git a/ui/src/lib/config.ts b/ui/src/lib/config.ts new file mode 100644 index 0000000..b1f3492 --- /dev/null +++ b/ui/src/lib/config.ts @@ -0,0 +1,2 @@ +export const APP_TITLE = "Extraction Review"; +export const AGENT_NAME = import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME; diff --git a/ui/src/lib/export.ts b/ui/src/lib/export.ts new file mode 100644 index 0000000..5a1da7f --- /dev/null +++ b/ui/src/lib/export.ts @@ -0,0 +1,39 @@ +import type { + ExtractedData, + TypedAgentData, +} from "llama-cloud-services/beta/agent"; + +/** + * Downloads data as a JSON file + */ +export function downloadJSON( + data: T, + filename: string = "extraction-results.json", +) { + const jsonString = JSON.stringify(data, null, 2); + const blob = new Blob([jsonString], { type: "application/json" }); + const url = URL.createObjectURL(blob); + + const link = document.createElement("a"); + link.href = url; + link.download = 
filename; + document.body.appendChild(link); + link.click(); + + // Cleanup + document.body.removeChild(link); + URL.revokeObjectURL(url); +} + +/** + * Downloads extracted data item as JSON + */ +export function downloadExtractedDataItem( + item: TypedAgentData>, +) { + const fileName = item.data.file_name || "item"; + const timestamp = item.createdAt.toISOString().split("T")[0]; + const filename = `${fileName}-${timestamp}.json`; + + downloadJSON(item, filename); +} diff --git a/ui/src/lib/useMetadata.ts b/ui/src/lib/useMetadata.ts new file mode 100644 index 0000000..eef286e --- /dev/null +++ b/ui/src/lib/useMetadata.ts @@ -0,0 +1,41 @@ +import { useWorkflowHandler, useWorkflowRun } from "@llamaindex/ui"; +import { useEffect, useState } from "react"; + +export interface Metadata { + schemas: Record; + extracted_data_collection: string; +} + +export interface UseMetadataResult { + metadata: Metadata; + loading: boolean; + error: string | undefined; +} + +export function useMetadata() { + const run = useWorkflowRun(); + const [handlerId, setHandlerId] = useState(undefined); + const handler = useWorkflowHandler(handlerId ?? 
""); + const [error, setError] = useState(undefined); + const [loading, setLoading] = useState(true); + + useEffect(() => { + setLoading(true); + run + .runWorkflow("metadata", {}) + .then((handlerSummary) => { + setHandlerId(handlerSummary.handler_id); + }) + .catch((error) => { + setError(error.message); + }) + .finally(() => { + setLoading(false); + }); + }, []); + const stopEvent = handler.events.find((event) => + event.type.endsWith("MetadataResponse"), + ); + const metadata = stopEvent?.data as Metadata | undefined; + return { metadata, loading, error }; +} diff --git a/ui/src/lib/utils.ts b/ui/src/lib/utils.ts new file mode 100644 index 0000000..a5ef193 --- /dev/null +++ b/ui/src/lib/utils.ts @@ -0,0 +1,6 @@ +import { clsx, type ClassValue } from "clsx"; +import { twMerge } from "tailwind-merge"; + +export function cn(...inputs: ClassValue[]) { + return twMerge(clsx(inputs)); +} diff --git a/ui/src/main.tsx b/ui/src/main.tsx new file mode 100644 index 0000000..b56bed7 --- /dev/null +++ b/ui/src/main.tsx @@ -0,0 +1,14 @@ +import { StrictMode } from "react"; +import { createRoot } from "react-dom/client"; +import { HashRouter } from "react-router-dom"; +import App from "./App"; +import "@llamaindex/ui/styles.css"; +import "./index.css"; + +createRoot(document.getElementById("root")!).render( + + + + + , +); diff --git a/ui/src/pages/HomePage.module.css b/ui/src/pages/HomePage.module.css new file mode 100644 index 0000000..b2e2b61 --- /dev/null +++ b/ui/src/pages/HomePage.module.css @@ -0,0 +1,23 @@ +.main { + padding: 1rem; +} + +.grid { + display: flex; + flex-direction: row; + gap: 1rem; + margin-bottom: 1rem; + & > * { + flex: 1; + } +} + +.commandBar { + display: flex; + justify-content: flex-end; + margin-bottom: 1rem; +} + +.progressBar { + margin-bottom: 1rem; +} diff --git a/ui/src/pages/HomePage.tsx b/ui/src/pages/HomePage.tsx new file mode 100644 index 0000000..97cd6f6 --- /dev/null +++ b/ui/src/pages/HomePage.tsx @@ -0,0 +1,88 @@ +import { + 
ItemCount, + WorkflowTrigger, + WorkflowProgressBar, + ExtractedDataItemGrid, + useWorkflowHandlerList, +} from "@llamaindex/ui"; +import type { TypedAgentData } from "llama-cloud-services/beta/agent"; +import styles from "./HomePage.module.css"; +import { useNavigate } from "react-router-dom"; +import { useEffect, useState } from "react"; + +export default function HomePage() { + const { taskKey } = taskCompletedState(); + return ; +} + +/** + * Returns a key that increments when a task is completed, can be used to force a re-render of the task list + */ +function taskCompletedState() { + const { handlers } = useWorkflowHandlerList("process-file"); + const runningTasks = handlers.filter( + (handler) => handler.status === "running", + ); + const [runningTaskCount, setRunningTaskCount] = useState(runningTasks.length); + const [taskKey, setTaskKey] = useState(0); + useEffect(() => { + if (runningTasks.length < runningTaskCount) { + // forcefully reload task list after a task is completed + setTaskKey(taskKey + 1); + } + setRunningTaskCount(runningTasks.length); + }, [runningTasks.length]); + return { runningTaskCount, taskKey }; +} + +function TaskList() { + const navigate = useNavigate(); + const goToItem = (item: TypedAgentData) => { + navigate(`/item/${item.id}`); + }; + return ( +
+
+
+ + + +
+
+ { + return { + file_id: files[0].fileId, + }; + }} + /> +
+ + +
+
+ ); +} diff --git a/ui/src/pages/ItemPage.tsx b/ui/src/pages/ItemPage.tsx new file mode 100644 index 0000000..7e437f8 --- /dev/null +++ b/ui/src/pages/ItemPage.tsx @@ -0,0 +1,189 @@ +import { useEffect, useState } from "react"; +import { + AcceptReject, + ExtractedDataDisplay, + FilePreview, + useItemData, + type Highlight, + Button, +} from "@llamaindex/ui"; +import { Clock, XCircle, Download } from "lucide-react"; +import { useParams } from "react-router-dom"; +import { useToolbar } from "@/lib/ToolbarContext"; +import { useNavigate } from "react-router-dom"; +import { modifyJsonSchema } from "@llamaindex/ui/lib"; +import { APP_TITLE } from "@/lib/config"; +import { downloadExtractedDataItem } from "@/lib/export"; +import { useMetadataContext } from "@/lib/MetadataProvider"; + +export default function ItemPage() { + const { itemId } = useParams<{ itemId: string }>(); + const { setButtons, setBreadcrumbs } = useToolbar(); + const [highlight, setHighlight] = useState(undefined); + const { metadata } = useMetadataContext(); + + // Use the hook to fetch item data (initially with a default schema) + const itemHookData = useItemData({ + // We'll update the schema based on classification once data loads + jsonSchema: modifyJsonSchema(metadata.schemas["10-K"] || {}, {}), + itemId: itemId as string, + isMock: false, + }); + + // Determine the correct schema based on classification + const classification = ( + (itemHookData.item?.data?.metadata?.classification as string | undefined) || + "10-K" + ).toUpperCase(); + const correctSchema = + metadata.schemas[classification] || metadata.schemas["10-K"]; + + // Track the schema applied for the detected classification; schemaKey is bumped each time a schema is re-applied (it is passed as the React key of the extracted-data view further down) + const [schemaKey, setSchemaKey] = useState(0); + const [appliedSchema, setAppliedSchema] = useState(correctSchema); + + useEffect(() => { + if (classification && metadata.schemas[classification]) { + setAppliedSchema(modifyJsonSchema(metadata.schemas[classification], {})); + setSchemaKey((currentKey) => currentKey + 1);
+ } + }, [classification, metadata.schemas]); + + const navigate = useNavigate(); + + // Update breadcrumb when item data loads + useEffect(() => { + const fileName = itemHookData.item?.data?.file_name; + if (fileName) { + setBreadcrumbs([ + { label: APP_TITLE, href: "/" }, + { + label: fileName, + isCurrentPage: true, + }, + ]); + } + + return () => { + // Reset to default breadcrumb when leaving the page + setBreadcrumbs([{ label: APP_TITLE, href: "/" }]); + }; + }, [itemHookData.item?.data?.file_name, setBreadcrumbs]); + + useEffect(() => { + setButtons(() => [ +
+ + + itemData={itemHookData} + onComplete={() => navigate("/")} + /> +
, + ]); + return () => { + setButtons(() => []); + }; + }, [itemHookData.data, setButtons]); + + const { + item: itemData, + updateData, + loading: isLoading, + error, + } = itemHookData; + + const classificationReasoning = itemData?.data?.metadata + ?.classification_reasoning as string | undefined; + + if (isLoading) { + return ( +
+
+ +
Loading item...
+
+
+ ); + } + + if (error || !itemData) { + return ( +
+
+ +
+ Error loading item: {error || "Item not found"} +
+
+
+ ); + } + + return ( +
+ {/* Left Side - File Preview */} +
+ {itemData.data.file_id && ( + { + console.log("Bounding box clicked:", box, "on page:", pageNumber); + }} + highlight={highlight} + /> + )} +
+ + {/* Right Side - Review Panel */} +
+
+ {/* Classification Info */} + {classification && ( +
+
+ Document Type: {classification} +
+ {classificationReasoning && ( +
+ {classificationReasoning} +
+ )} +
+ )} + {/* Extracted Data */} + + key={schemaKey} + extractedData={itemData.data} + title="Extracted Data" + onChange={(updatedData) => { + updateData(updatedData); + }} + onClickField={(args) => { + // TODO: set multiple highlights + setHighlight({ + page: args.metadata?.citation?.[0]?.page ?? 1, + x: 100, + y: 100, + width: 0, + height: 0, + }); + }} + jsonSchema={appliedSchema} + /> +
+
+
+ ); +} diff --git a/ui/src/vite-env.d.ts b/ui/src/vite-env.d.ts new file mode 100644 index 0000000..25ad5bc --- /dev/null +++ b/ui/src/vite-env.d.ts @@ -0,0 +1,15 @@ +/// + +interface ImportMetaEnv { + readonly VITE_LLAMA_CLOUD_API_KEY?: string; + readonly VITE_LLAMA_CLOUD_BASE_URL?: string; + + // injected from llama_deploy + readonly VITE_LLAMA_DEPLOY_BASE_PATH: string; + readonly VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME: string; + readonly VITE_LLAMA_DEPLOY_PROJECT_ID: string; +} + +interface ImportMeta { + readonly env: ImportMetaEnv; +} diff --git a/ui/tsconfig.json b/ui/tsconfig.json new file mode 100644 index 0000000..4fbd359 --- /dev/null +++ b/ui/tsconfig.json @@ -0,0 +1,31 @@ +{ + "compilerOptions": { + "target": "ES2020", + "useDefineForClassFields": true, + "lib": ["ES2020", "DOM", "DOM.Iterable"], + "module": "ESNext", + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "resolveJsonModule": true, + "isolatedModules": true, + "noEmit": true, + "jsx": "react-jsx", + + /* Linting */ + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true, + + /* Path mapping */ + "baseUrl": ".", + "paths": { + "@/*": ["./src/*"] + } + }, + "include": ["src", "vite.config.ts", "src/vite-env.d.ts"], + "exclude": ["node_modules"] +} diff --git a/ui/vite.config.ts b/ui/vite.config.ts new file mode 100644 index 0000000..cfc8178 --- /dev/null +++ b/ui/vite.config.ts @@ -0,0 +1,43 @@ +import { defineConfig } from "vite"; +import react from "@vitejs/plugin-react"; +import path from "path"; + +// https://vitejs.dev/config/ +export default defineConfig(({}) => { + const deploymentName = process.env.LLAMA_DEPLOY_DEPLOYMENT_NAME; + const basePath = process.env.LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH; + const projectId = process.env.LLAMA_DEPLOY_PROJECT_ID; + const port = process.env.PORT ? 
Number(process.env.PORT) : 3000; + const baseUrl = process.env.LLAMA_CLOUD_BASE_URL; + return { + plugins: [react()], + resolve: { + alias: { + "@": path.resolve(__dirname, "./src"), + }, + }, + server: { + port: port, + host: true, + }, + build: { + outDir: "dist", + sourcemap: true, + }, + base: basePath, + define: { + // Primary define uses NAME. NOTE(review): the base-path key defined below is VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH, while ui/src/vite-env.d.ts declares VITE_LLAMA_DEPLOY_BASE_PATH; confirm which name the client code reads. The project-id and cloud-base-url keys are only defined when their environment variables are set. + "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_NAME": JSON.stringify( + deploymentName + ), + "import.meta.env.VITE_LLAMA_DEPLOY_DEPLOYMENT_BASE_PATH": JSON.stringify(basePath), + ...(projectId && { + "import.meta.env.VITE_LLAMA_DEPLOY_PROJECT_ID": + JSON.stringify(projectId), + }), + ...(baseUrl && { + "import.meta.env.VITE_LLAMA_CLOUD_BASE_URL": JSON.stringify(baseUrl), + }), + }, + }; +}); diff --git a/{{ _copier_conf.answers_file }}.jinja b/{{ _copier_conf.answers_file }}.jinja new file mode 100644 index 0000000..88acac8 --- /dev/null +++ b/{{ _copier_conf.answers_file }}.jinja @@ -0,0 +1,2 @@ +# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY +{{ _copier_answers|to_nice_yaml -}} \ No newline at end of file