Use error description in invalid extraction error (#1081)

* fix: display extraction job error in InvalidExtractionData exception

  `ExtractedData.from_extraction_result` now reads the `error` field from the ExtractRun and passes it to InvalidExtractionData, which prominently displays it in the exception message. The job-level error is stored in the exception's `extraction_error` attribute and included in the invalid_item's metadata as `job_error`.

* Create three-yaks-beg.md

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-30 21:47:56 -04:00 · 2026-01-18 17:43:21 -05:00
parent 9239498945
commit 3040951cb8
3 changed files with 74 additions and 4 deletions
@@ -0,0 +1,5 @@
+---
+"llama-cloud-services-py": patch
+---
+
+Use the extraction job's error description in the `InvalidExtractionData` error raised by `ExtractedData`
@@ -475,26 +475,49 @@ class ExtractedData(BaseModel, Generic[ExtractedT]):
                },
            )
        except ValidationError as e:
+            # Capture the job-level error from the extraction run if available
+            job_error = result.error
+
            invalid_item = ExtractedData[Dict[str, Any]].create(
                data=result.data or {},
                status="error",
                field_metadata=field_metadata,
-                metadata={"extraction_error": str(e), **(metadata or {})},
+                metadata={
+                    "extraction_error": str(e),
+                    **({"job_error": job_error} if job_error else {}),
+                    **(metadata or {}),
+                },
                file_id=file_id,
                file_name=file_name,
                file_hash=file_hash,
            )
-            raise InvalidExtractionData(invalid_item) from e
+            raise InvalidExtractionData(invalid_item, extraction_error=job_error) from e


 class InvalidExtractionData(Exception):
    """
    Exception raised when the extracted data does not conform to the schema.
+
+    Attributes:
+        invalid_item: The ExtractedData instance containing the invalid data and metadata
+        extraction_error: The error message from the extraction job, if available
    """

-    def __init__(self, invalid_item: ExtractedData[Dict[str, Any]]):
+    def __init__(
+        self,
+        invalid_item: ExtractedData[Dict[str, Any]],
+        extraction_error: Optional[str] = None,
+    ):
        self.invalid_item = invalid_item
-        super().__init__("Not able to parse the extracted data, parsed invalid format")
+        self.extraction_error = extraction_error
+
+        # Build an informative error message
+        if extraction_error:
+            message = f"Extraction error: {extraction_error}"
+        else:
+            message = "Not able to parse the extracted data, parsed invalid format"
+
+        super().__init__(message)


 def calculate_overall_confidence(
@@ -423,6 +423,7 @@ def create_extract_run(
    },
    data_schema: Dict[str, Any] = {},
    file: File = create_file(),
+    error: Optional[str] = None,
 ) -> ExtractRun:
    return ExtractRun.parse_obj(
        {
@@ -439,6 +440,7 @@ def create_extract_run(
            "status": "SUCCESS",
            "project_id": str(uuid.uuid4()),
            "from_ui": False,
+            "error": error,
        }
    )

@@ -544,6 +546,46 @@ def test_extracted_data_from_extraction_result_invalid_data():
    assert invalid_data.field_metadata["name"].confidence == 0.9
    assert invalid_data.overall_confidence == 0.9

+    # Verify default error message when no job error present
+    assert exc_info.value.extraction_error is None
+    assert "Not able to parse the extracted data" in str(exc_info.value)
+
+
+def test_extracted_data_from_extraction_result_with_job_error():
+    """Test ExtractedData.from_extraction_result with job-level error prominently displayed."""
+    job_error_message = "Failed to process document: unsupported file format"
+
+    # Create ExtractRun with both invalid data AND a job-level error
+    extract_run = create_extract_run(
+        data={
+            "missing_name": "Valid Name",
+            "age": "not_a_number",
+        },  # Invalid age, missing name
+        extraction_metadata={
+            "name": {"confidence": 0.9},
+        },
+        data_schema={},
+        file=create_file(id="error-file", name="bad_data.pdf"),
+        error=job_error_message,
+    )
+
+    # Should raise InvalidExtractionData with the job error prominently displayed
+    with pytest.raises(InvalidExtractionData) as exc_info:
+        ExtractedData.from_extraction_result(
+            extract_run, Person, metadata={"test": "metadata"}
+        )
+
+    # Verify the exception message prominently shows the job error
+    exception = exc_info.value
+    assert exception.extraction_error == job_error_message
+    assert f"Extraction error: {job_error_message}" == str(exception)
+
+    # Verify the invalid_item contains both errors in metadata
+    invalid_data = exception.invalid_item
+    assert invalid_data.metadata.get("job_error") == job_error_message
+    assert "extraction_error" in invalid_data.metadata  # Validation error still present
+    assert "test" in invalid_data.metadata  # Original metadata preserved
+

 class Dimensions(BaseModel):
    length: Optional[str] = Field(