From c1b00b79201fce9f5c9fa809cf1fdf2b7a934405 Mon Sep 17 00:00:00 2001 From: Adrian Lyjak Date: Wed, 22 Apr 2026 21:34:28 -0400 Subject: [PATCH] Classify v2 fake, agent_data.create migration, downstream copier updates (#268) --- pyproject.toml | 2 +- src/extraction_review/clients.py | 7 ----- src/extraction_review/process_file.py | 2 +- tests/conftest.py | 21 +++++++------- tests/test_workflow.py | 41 ++------------------------- 5 files changed, 15 insertions(+), 58 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 77e5910..add1bfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Extracts data" readme = "README.md" requires-python = ">=3.12" dependencies = [ - "llama-cloud>=2.3.0,<3", + "llama-cloud>=2.4.1,<3", "json-schema-to-pydantic>=0.4.8", "llama-index-workflows>=2.16.0,<3.0.0", "python-dotenv>=1.1.0", diff --git a/src/extraction_review/clients.py b/src/extraction_review/clients.py index b65d270..8b8fbda 100644 --- a/src/extraction_review/clients.py +++ b/src/extraction_review/clients.py @@ -13,13 +13,6 @@ api_key = os.getenv("LLAMA_CLOUD_API_KEY") base_url = os.getenv("LLAMA_CLOUD_BASE_URL") project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID") -if os.getenv("FAKE_LLAMA_CLOUD"): - from llama_cloud_fake import FakeLlamaCloudServer - - fake = FakeLlamaCloudServer().install() -else: - fake = None - def get_llama_cloud_client() -> AsyncLlamaCloud: """Cloud services connection for file storage and processing.""" diff --git a/src/extraction_review/process_file.py b/src/extraction_review/process_file.py index a2bbdac..e5013e2 100644 --- a/src/extraction_review/process_file.py +++ b/src/extraction_review/process_file.py @@ -221,7 +221,7 @@ class ProcessFileWorkflow(Workflow): f"Removed {delete_result.deleted_count} existing record(s) " f"for file {extracted_data.file_name}" ) - item = await llama_cloud_client.beta.agent_data.agent_data( + item = await llama_cloud_client.beta.agent_data.create( data=data_dict, deployment_name=agent_name or "_public", collection=EXTRACTED_DATA_COLLECTION, diff --git a/tests/conftest.py b/tests/conftest.py index 00fdb43..881adaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,21 +1,20 @@ -""" -Pytest configuration for data-extraction tests. - -IMPORTANT: FAKE_LLAMA_CLOUD must be set before any test modules are imported, -as extraction_review.clients reads this at module load time to initialize -the mock server. -""" +"""Pytest configuration: install the LlamaCloud fake server for all tests.""" import logging -import os import sys -# Configure logging to stdout at INFO level +import pytest +from llama_cloud_fake import FakeLlamaCloudServer + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler(sys.stdout)], ) -# Enable the fake LlamaCloud server for all tests -os.environ["FAKE_LLAMA_CLOUD"] = "true" +_fake = FakeLlamaCloudServer().install() + + +@pytest.fixture +def fake() -> FakeLlamaCloudServer: + return _fake diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 8b954f6..1f5a81e 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -1,35 +1,15 @@ -""" - -This is the default test for the data-extraction template. -Adapt the test to your use case following the guidelines below. - - -You can read and modify this file. -Here are your editing permissions, which you **MUST ALWAYS** follow: - -- Lines and blocks tagged with `` should be ALWAYS modified to something different, based on the use case. -- Lines and blocks tagged with `` should be adapted to the specific use case you are dealing with, but only if needed. -- If something does not have tags, it **MUST NOT** be modified. - -""" - import json -import warnings from pathlib import Path import pytest -from extraction_review.clients import fake - -# from extraction_review.config import EXTRACTED_DATA_COLLECTION from extraction_review.metadata_workflow import MetadataResponse from extraction_review.metadata_workflow import workflow as metadata_workflow from extraction_review.process_file import FileEvent from extraction_review.process_file import workflow as process_file_workflow +from llama_cloud_fake import FakeLlamaCloudServer from workflows.events import StartEvent -# - def get_extraction_schema() -> dict: """Load the extraction schema from the unified config file.""" @@ -39,40 +19,25 @@ def get_extraction_schema() -> dict: @pytest.mark.asyncio -# async def test_process_file_workflow( monkeypatch: pytest.MonkeyPatch, + fake: FakeLlamaCloudServer, ) -> None: monkeypatch.setenv("LLAMA_CLOUD_API_KEY", "fake-api-key") - # load a file to the mock LlamaCloud server and retrieve its file id (modify if you don't have any files to load as input) - if fake is not None: - file_id = fake.files.preload(path="tests/files/test.pdf") - else: - warnings.warn( - "Skipping test because it cannot be mocked. Set `FAKE_LLAMA_CLOUD=true` in your environment to enable this test..." - ) - return + file_id = fake.files.preload(path="tests/files/test.pdf") try: result = await process_file_workflow.run(start_event=FileEvent(file_id=file_id)) except Exception: result = None assert result is not None # all generated agent data IDs are alphanumeric strings with 7 characters - # the following assert statements ensure that that is the case assert isinstance(result, str) assert len(result) == 7 -# - - -# @pytest.mark.asyncio async def test_metadata_workflow() -> None: result = await metadata_workflow.run(start_event=StartEvent()) assert isinstance(result, MetadataResponse) assert result.extracted_data_collection == EXTRACTED_DATA_COLLECTION assert result.json_schema == get_extraction_schema() - - -#