mirror of
https://github.com/run-llama/template-workflow-data-extraction.git
synced 2026-06-30 21:38:03 -04:00
Classify v2 fake, agent_data.create migration, downstream copier updates (#268)
This commit is contained in:
+1
-1
@@ -5,7 +5,7 @@ description = "Extracts data"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"llama-cloud>=2.3.0,<3",
|
||||
"llama-cloud>=2.4.1,<3",
|
||||
"json-schema-to-pydantic>=0.4.8",
|
||||
"llama-index-workflows>=2.16.0,<3.0.0",
|
||||
"python-dotenv>=1.1.0",
|
||||
|
||||
@@ -13,13 +13,6 @@ api_key = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
base_url = os.getenv("LLAMA_CLOUD_BASE_URL")
|
||||
project_id = os.getenv("LLAMA_DEPLOY_PROJECT_ID")
|
||||
|
||||
if os.getenv("FAKE_LLAMA_CLOUD"):
|
||||
from llama_cloud_fake import FakeLlamaCloudServer
|
||||
|
||||
fake = FakeLlamaCloudServer().install()
|
||||
else:
|
||||
fake = None
|
||||
|
||||
|
||||
def get_llama_cloud_client() -> AsyncLlamaCloud:
|
||||
"""Cloud services connection for file storage and processing."""
|
||||
|
||||
@@ -221,7 +221,7 @@ class ProcessFileWorkflow(Workflow):
|
||||
f"Removed {delete_result.deleted_count} existing record(s) "
|
||||
f"for file {extracted_data.file_name}"
|
||||
)
|
||||
item = await llama_cloud_client.beta.agent_data.agent_data(
|
||||
item = await llama_cloud_client.beta.agent_data.create(
|
||||
data=data_dict,
|
||||
deployment_name=agent_name or "_public",
|
||||
collection=EXTRACTED_DATA_COLLECTION,
|
||||
|
||||
+10
-11
@@ -1,21 +1,20 @@
|
||||
"""
|
||||
Pytest configuration for data-extraction tests.
|
||||
|
||||
IMPORTANT: FAKE_LLAMA_CLOUD must be set before any test modules are imported,
|
||||
as extraction_review.clients reads this at module load time to initialize
|
||||
the mock server.
|
||||
"""
|
||||
"""Pytest configuration: install the LlamaCloud fake server for all tests."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Configure logging to stdout at INFO level
|
||||
import pytest
|
||||
from llama_cloud_fake import FakeLlamaCloudServer
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
|
||||
# Enable the fake LlamaCloud server for all tests
|
||||
os.environ["FAKE_LLAMA_CLOUD"] = "true"
|
||||
_fake = FakeLlamaCloudServer().install()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def fake() -> FakeLlamaCloudServer:
|
||||
return _fake
|
||||
|
||||
+3
-38
@@ -1,35 +1,15 @@
|
||||
"""
|
||||
<system>
|
||||
This is the default test for the data-extraction template.
|
||||
Adapt the test to your use case following the guidelines below.
|
||||
</system>
|
||||
<guidelines>
|
||||
You can read and modify this file.
|
||||
Here are your editing permissions, which you **MUST ALWAYS** follow:
|
||||
|
||||
- Lines and blocks tagged with `<edit></edit>` should be ALWAYS modified to something different, based on the use case.
|
||||
- Lines and blocks tagged with `<adapt></adapt>` should be adapted to the specific use case you are dealing with, but only if needed.
|
||||
- If something does not have tags, it **MUST NOT** be modified.
|
||||
</guidelines>
|
||||
"""
|
||||
|
||||
import json
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from extraction_review.clients import fake
|
||||
|
||||
# <edit>
|
||||
from extraction_review.config import EXTRACTED_DATA_COLLECTION
|
||||
from extraction_review.metadata_workflow import MetadataResponse
|
||||
from extraction_review.metadata_workflow import workflow as metadata_workflow
|
||||
from extraction_review.process_file import FileEvent
|
||||
from extraction_review.process_file import workflow as process_file_workflow
|
||||
from llama_cloud_fake import FakeLlamaCloudServer
|
||||
from workflows.events import StartEvent
|
||||
|
||||
# </edit>
|
||||
|
||||
|
||||
def get_extraction_schema() -> dict:
|
||||
"""Load the extraction schema from the unified config file."""
|
||||
@@ -39,40 +19,25 @@ def get_extraction_schema() -> dict:
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
# <adapt>
|
||||
async def test_process_file_workflow(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
fake: FakeLlamaCloudServer,
|
||||
) -> None:
|
||||
monkeypatch.setenv("LLAMA_CLOUD_API_KEY", "fake-api-key")
|
||||
# load a file to the mock LlamaCloud server and retrieve its file id (modify if you don't have any files to load as input)
|
||||
if fake is not None:
|
||||
file_id = fake.files.preload(path="tests/files/test.pdf")
|
||||
else:
|
||||
warnings.warn(
|
||||
"Skipping test because it cannot be mocked. Set `FAKE_LLAMA_CLOUD=true` in your environment to enable this test..."
|
||||
)
|
||||
return
|
||||
file_id = fake.files.preload(path="tests/files/test.pdf")
|
||||
try:
|
||||
result = await process_file_workflow.run(start_event=FileEvent(file_id=file_id))
|
||||
except Exception:
|
||||
result = None
|
||||
assert result is not None
|
||||
# all generated agent data IDs are alphanumeric strings with 7 characters
|
||||
# the following assert statements ensure that that is the case
|
||||
assert isinstance(result, str)
|
||||
assert len(result) == 7
|
||||
|
||||
|
||||
# </adapt>
|
||||
|
||||
|
||||
# <adapt>
|
||||
@pytest.mark.asyncio
|
||||
async def test_metadata_workflow() -> None:
|
||||
result = await metadata_workflow.run(start_event=StartEvent())
|
||||
assert isinstance(result, MetadataResponse)
|
||||
assert result.extracted_data_collection == EXTRACTED_DATA_COLLECTION
|
||||
assert result.json_schema == get_extraction_schema()
|
||||
|
||||
|
||||
# </adapt>
|
||||
|
||||
Reference in New Issue
Block a user