Use error description in invalid extraction error (#1081)

* fix: display extraction job error in InvalidExtractionData exception

`ExtractedData.from_extraction_result` now reads the `error` field from the
ExtractRun and passes it to InvalidExtractionData, which uses it as the
exception message when present. The job-level error is stored in the
exception's `extraction_error` attribute and, when non-empty, is also
included in the invalid_item's metadata as `job_error`.

* Create three-yaks-beg.md

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Adrian Lyjak
2026-01-18 17:43:21 -05:00
committed by GitHub
parent 9239498945
commit 3040951cb8
3 changed files with 74 additions and 4 deletions
+5
View File
@@ -0,0 +1,5 @@
---
"llama-cloud-services-py": patch
---
Use the extraction job's error description in the `InvalidExtractionData` error message
@@ -475,26 +475,49 @@ class ExtractedData(BaseModel, Generic[ExtractedT]):
},
)
except ValidationError as e:
# Capture the job-level error from the extraction run if available
job_error = result.error
invalid_item = ExtractedData[Dict[str, Any]].create(
data=result.data or {},
status="error",
field_metadata=field_metadata,
metadata={"extraction_error": str(e), **(metadata or {})},
metadata={
"extraction_error": str(e),
**({"job_error": job_error} if job_error else {}),
**(metadata or {}),
},
file_id=file_id,
file_name=file_name,
file_hash=file_hash,
)
raise InvalidExtractionData(invalid_item) from e
raise InvalidExtractionData(invalid_item, extraction_error=job_error) from e
class InvalidExtractionData(Exception):
    """
    Exception raised when the extracted data does not conform to the schema.

    The exception message is ``"Extraction error: <extraction_error>"`` when a
    non-empty ``extraction_error`` is supplied; otherwise it falls back to the
    generic parse-failure message.

    Attributes:
        invalid_item: The ExtractedData instance containing the invalid data and metadata
        extraction_error: The error message from the extraction job, if available
    """

    def __init__(
        self,
        invalid_item: ExtractedData[Dict[str, Any]],
        extraction_error: Optional[str] = None,
    ):
        """
        Store the invalid item and job-level error, then build the message.

        Args:
            invalid_item: The item holding the data that failed validation.
            extraction_error: Optional error text reported by the extraction
                job. ``None`` or an empty string selects the generic message.
        """
        self.invalid_item = invalid_item
        self.extraction_error = extraction_error
        # Prefer the job-level error when one exists: it is the more specific
        # description, so it becomes the whole exception message.
        if extraction_error:
            message = f"Extraction error: {extraction_error}"
        else:
            message = "Not able to parse the extracted data, parsed invalid format"
        super().__init__(message)
def calculate_overall_confidence(
@@ -423,6 +423,7 @@ def create_extract_run(
},
data_schema: Dict[str, Any] = {},
file: File = create_file(),
error: Optional[str] = None,
) -> ExtractRun:
return ExtractRun.parse_obj(
{
@@ -439,6 +440,7 @@ def create_extract_run(
"status": "SUCCESS",
"project_id": str(uuid.uuid4()),
"from_ui": False,
"error": error,
}
)
@@ -544,6 +546,46 @@ def test_extracted_data_from_extraction_result_invalid_data():
assert invalid_data.field_metadata["name"].confidence == 0.9
assert invalid_data.overall_confidence == 0.9
# Verify default error message when no job error present
assert exc_info.value.extraction_error is None
assert "Not able to parse the extracted data" in str(exc_info.value)
def test_extracted_data_from_extraction_result_with_job_error():
    """Test ExtractedData.from_extraction_result with job-level error prominently displayed."""
    expected_job_error = "Failed to process document: unsupported file format"

    # The run carries data that fails validation against Person and also
    # reports a job-level error.
    bad_payload = {
        "missing_name": "Valid Name",
        "age": "not_a_number",
    }  # Invalid age, missing name
    failing_run = create_extract_run(
        data=bad_payload,
        extraction_metadata={"name": {"confidence": 0.9}},
        data_schema={},
        file=create_file(id="error-file", name="bad_data.pdf"),
        error=expected_job_error,
    )

    # Conversion must fail with InvalidExtractionData.
    with pytest.raises(InvalidExtractionData) as raised:
        ExtractedData.from_extraction_result(
            failing_run, Person, metadata={"test": "metadata"}
        )

    # The job error is exposed on the exception and forms its whole message.
    error = raised.value
    assert error.extraction_error == expected_job_error
    assert str(error) == f"Extraction error: {expected_job_error}"

    # The invalid item keeps the job error, the validation error, and the
    # caller-supplied metadata side by side.
    item_metadata = error.invalid_item.metadata
    assert item_metadata.get("job_error") == expected_job_error
    assert "extraction_error" in item_metadata  # Validation error still present
    assert "test" in item_metadata  # Original metadata preserved
class Dimensions(BaseModel):
length: Optional[str] = Field(