From 111d3486a69fa8a57ec22a166bc378508bcef5c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Ledwo=C5=84?= Date: Wed, 22 Oct 2025 19:05:02 +0200 Subject: [PATCH] feat: add a blackhole ai capture endpoint (#39567) --- common/ingestion/acceptance_tests/__init__.py | 1 + .../ingestion/acceptance_tests/api_client.py | 234 +++ common/ingestion/acceptance_tests/conftest.py | 79 + .../acceptance_tests/implementation-plan.md | 85 + .../acceptance_tests/requirements.txt | 8 + .../ingestion/acceptance_tests/run_tests.py | 93 ++ .../acceptance_tests/test_basic_capture.py | 112 ++ .../acceptance_tests/test_llm_analytics.py | 812 ++++++++++ common/ingestion/acceptance_tests/utils.py | 17 + docker-compose.dev.yml | 2 + rust/Cargo.lock | 19 + rust/capture/Cargo.toml | 4 +- .../docs/llma-acceptance-test-suite.md | 218 +++ .../docs/llma-capture-implementation-plan.md | 61 +- rust/capture/docs/llma-capture-overview.md | 285 ++-- .../docs/llma-integration-test-suite.md | 340 +--- rust/capture/src/ai_endpoint.rs | 476 ++++++ rust/capture/src/config.rs | 4 + rust/capture/src/lib.rs | 1 + rust/capture/src/router.rs | 22 +- rust/capture/src/server.rs | 1 + .../capture/tests/common/integration_utils.rs | 1 + rust/capture/tests/common/utils.rs | 1 + rust/capture/tests/integration_ai_endpoint.rs | 1404 +++++++++++++++++ rust/capture/tests/limiters.rs | 5 + 25 files changed, 3891 insertions(+), 394 deletions(-) create mode 100644 common/ingestion/acceptance_tests/__init__.py create mode 100644 common/ingestion/acceptance_tests/api_client.py create mode 100644 common/ingestion/acceptance_tests/conftest.py create mode 100644 common/ingestion/acceptance_tests/implementation-plan.md create mode 100644 common/ingestion/acceptance_tests/requirements.txt create mode 100755 common/ingestion/acceptance_tests/run_tests.py create mode 100644 common/ingestion/acceptance_tests/test_basic_capture.py create mode 100644 common/ingestion/acceptance_tests/test_llm_analytics.py create mode 100644 common/ingestion/acceptance_tests/utils.py create mode 100644 rust/capture/docs/llma-acceptance-test-suite.md create mode 100644 rust/capture/src/ai_endpoint.rs create mode 100644 rust/capture/tests/integration_ai_endpoint.rs diff --git a/common/ingestion/acceptance_tests/__init__.py b/common/ingestion/acceptance_tests/__init__.py new file mode 100644 index 0000000000..e502861442 --- /dev/null +++ b/common/ingestion/acceptance_tests/__init__.py @@ -0,0 +1 @@ +# PostHog Ingestion Acceptance Tests diff --git a/common/ingestion/acceptance_tests/api_client.py b/common/ingestion/acceptance_tests/api_client.py new file mode 100644 index 0000000000..9b949eb518 --- /dev/null +++ b/common/ingestion/acceptance_tests/api_client.py @@ -0,0 +1,234 @@ +"""PostHog API client for acceptance tests.""" + +import json +import time +import uuid +import logging +from typing import Any, Optional + +import requests +from posthoganalytics import Posthog + +from .utils import get_service_url + +logger = logging.getLogger(__name__) + + +class PostHogTestClient: + """Client for interacting with PostHog API during tests.""" + + def __init__(self, base_url: Optional[str] = None, personal_api_key: Optional[str] = None): + self.base_url = base_url or get_service_url() + self.session = requests.Session() + + # Set personal API key for private endpoints if provided + if personal_api_key: + self.session.headers.update({"Authorization": f"Bearer {personal_api_key}"}) + + def create_organization(self, name: Optional[str] = None) -> dict[str, Any]: + """Create a test 
organization using private API.""" + org_name = name or f"test_org_{uuid.uuid4().hex[:8]}" + + logger.info("Creating organization '%s'", org_name) + logger.debug("POST %s/api/organizations/", self.base_url) + + response = self.session.post(f"{self.base_url}/api/organizations/", json={"name": org_name}) + + logger.debug("Response status: %s", response.status_code) + response.raise_for_status() + + result = response.json() + logger.info("Organization created with ID: %s", result.get("id")) + return result + + def create_project(self, organization_id: str, name: Optional[str] = None) -> dict[str, Any]: + """Create a test project within an organization using private API.""" + project_name = name or f"test_project_{uuid.uuid4().hex[:8]}" + + logger.info("Creating project '%s' in org %s", project_name, organization_id) + logger.debug("POST %s/api/organizations/%s/projects/", self.base_url, organization_id) + + response = self.session.post( + f"{self.base_url}/api/organizations/{organization_id}/projects/", json={"name": project_name} + ) + + logger.debug("Response status: %s", response.status_code) + response.raise_for_status() + + result = response.json() + logger.info("Project created with ID: %s", result.get("id")) + + # Wait for project to be available in query API + self._wait_for_project_ready(result.get("id")) + + return result + + def _wait_for_project_ready(self, project_id: str, timeout: int = 30) -> None: + """Wait for project to be ready for queries.""" + logger.info("Waiting for project %s to be ready for queries...", project_id) + start_time = time.time() + + while time.time() - start_time < timeout: + try: + # First check if the project exists via the basic project API + project_response = self.session.get(f"{self.base_url}/api/projects/{project_id}/") + if project_response.status_code != 200: + logger.debug("Project %s not accessible via API, waiting...", project_id) + time.sleep(1) + continue + + # Then try a simple HogQL query to see if the project is ready + query_response = self.session.post( + f"{self.base_url}/api/environments/{project_id}/query/", + json={"query": {"kind": "HogQLQuery", "query": "SELECT 1 LIMIT 1"}}, + ) + if query_response.status_code == 200: + logger.info("Project %s is ready for queries", project_id) + return + elif query_response.status_code == 404: + logger.debug("Project %s query endpoint not yet available, waiting...", project_id) + else: + logger.debug( + "Project %s query returned status %s, waiting...", project_id, query_response.status_code + ) + except Exception as e: + logger.debug("Project %s readiness check failed: %s, waiting...", project_id, e) + + time.sleep(1) + + logger.warning("Project %s may not be fully ready after %s seconds", project_id, timeout) + + def send_capture_event(self, api_key: str, event_data: dict[str, Any]) -> None: + """Send an event using the PostHog Python client. + + Uses the official PostHog Python client which handles the capture endpoint. 
+ """ + # Extract event details + event_name = event_data.get("event", "test_event") + distinct_id = event_data.get("distinct_id", f"test_user_{uuid.uuid4().hex[:8]}") + properties = event_data.get("properties", {}) + timestamp = event_data.get("timestamp") + + logger.info("Creating PostHog client instance") + logger.debug("Host: %s", self.base_url) + + # Create PostHog client instance with the API key + posthog_client = Posthog(api_key, host=self.base_url, debug=True) + + logger.info("Sending capture event using PostHog client") + logger.debug("Event: %s", event_name) + logger.debug("Distinct ID: %s", distinct_id) + logger.debug("Properties: %s", properties) + if timestamp: + logger.debug("Timestamp: %s", timestamp) + + # Send event using PostHog client instance + if timestamp: + posthog_client.capture( + distinct_id=distinct_id, event=event_name, properties=properties, timestamp=timestamp + ) + else: + posthog_client.capture(distinct_id=distinct_id, event=event_name, properties=properties) + + logger.info("Event sent via PostHog client") + + # Flush to ensure the event is sent immediately + logger.debug("Flushing PostHog client") + posthog_client.flush() + logger.debug("PostHog client flushed") + + # Shutdown the client + posthog_client.shutdown() + + def query_events_hogql( + self, project_id: str, event_name: Optional[str] = None, distinct_id: Optional[str] = None, limit: int = 100 + ) -> list[dict[str, Any]]: + """Query events using the HogQL query API (recommended method).""" + # Build HogQL query + conditions = [] + if event_name: + conditions.append(f"event = '{event_name}'") + if distinct_id: + conditions.append(f"distinct_id = '{distinct_id}'") + + where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else "" + query = f"SELECT * FROM events {where_clause} ORDER BY timestamp DESC LIMIT {limit}" + + logger.debug("Executing HogQL query: %s", query) + response = self.session.post( + f"{self.base_url}/api/environments/{project_id}/query/", + json={"refresh": "force_blocking", "query": {"kind": "HogQLQuery", "query": query}}, + ) + + logger.debug("HogQL query response status: %s", response.status_code) + response.raise_for_status() + + data = response.json() + logger.debug("HogQL query returned %s results", len(data.get("results", []))) + + # Extract events from HogQL response + if data.get("results"): + # Convert HogQL results to event-like format + columns = data.get("columns", []) + results = [] + for row in data["results"]: + event = {} + for i, col in enumerate(columns): + if i < len(row): + value = row[i] + # Parse JSON columns (properties is returned as JSON string) + if col == "properties" and isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, TypeError): + pass + event[col] = value + results.append(event) + return results + return [] + + def wait_for_event( + self, + project_id: str, + event_name: str, + distinct_id: Optional[str] = None, + timeout: int = 30, + poll_interval: float = 5.0, + ) -> Optional[dict[str, Any]]: + """Poll for an event to appear in the query API.""" + start_time = time.time() + + while time.time() - start_time < timeout: + # Use HogQL query (recommended) + events = self.query_events_hogql(project_id, event_name, distinct_id, limit=10) + + for event in events: + if event.get("event") == event_name: + if distinct_id is None or event.get("distinct_id") == distinct_id: + return event + + time.sleep(poll_interval) + + return None + + def delete_project(self, project_id: str) -> None: + """Delete a project 
using private API.""" + logger.info("Deleting project %s", project_id) + logger.debug("DELETE %s/api/environments/%s/", self.base_url, project_id) + + response = self.session.delete(f"{self.base_url}/api/environments/{project_id}/") + + logger.debug("Response status: %s", response.status_code) + response.raise_for_status() + logger.info("Project deleted successfully") + + def delete_organization(self, organization_id: str) -> None: + """Delete an organization using private API.""" + logger.info("Deleting organization %s", organization_id) + logger.debug("DELETE %s/api/organizations/%s/", self.base_url, organization_id) + + response = self.session.delete(f"{self.base_url}/api/organizations/{organization_id}/") + + logger.debug("Response status: %s", response.status_code) + response.raise_for_status() + logger.info("Organization deleted successfully") diff --git a/common/ingestion/acceptance_tests/conftest.py b/common/ingestion/acceptance_tests/conftest.py new file mode 100644 index 0000000000..6ef3178f54 --- /dev/null +++ b/common/ingestion/acceptance_tests/conftest.py @@ -0,0 +1,79 @@ +"""Pytest fixtures for acceptance tests.""" + +import os +import logging + +import pytest + +from .api_client import PostHogTestClient + +logger = logging.getLogger(__name__) + + +@pytest.fixture(scope="class") +def test_client(): + """Create a PostHog test client instance for each test class.""" + # Get configuration from environment + base_url = os.environ.get("POSTHOG_TEST_BASE_URL", "http://localhost:8010") + personal_api_key = os.environ.get("POSTHOG_PERSONAL_API_KEY") + + if not personal_api_key: + pytest.skip("POSTHOG_PERSONAL_API_KEY not set - please set it to run acceptance tests") + + logger.info("Creating test client with base_url=%s", base_url) + client = PostHogTestClient(base_url=base_url, personal_api_key=personal_api_key) + + yield client + + # Cleanup happens in individual tests + + +@pytest.fixture(scope="function") +def function_test_client(): + """Create a PostHog test client instance for each individual test (for auth tests).""" + # Get configuration from environment + base_url = os.environ.get("POSTHOG_TEST_BASE_URL", "http://localhost:8010") + personal_api_key = os.environ.get("POSTHOG_PERSONAL_API_KEY") + + if not personal_api_key: + pytest.skip("POSTHOG_PERSONAL_API_KEY not set - please set it to run acceptance tests") + + logger.info("Creating function-scoped test client with base_url=%s", base_url) + client = PostHogTestClient(base_url=base_url, personal_api_key=personal_api_key) + + yield client + + +@pytest.fixture(scope="class") +def shared_org_project(test_client): + """Create a shared organization and project for an entire test class.""" + logger.info("Creating shared organization and project for test class") + + client = test_client + org = client.create_organization() + project = client.create_project(org["id"]) + + # Project creation now waits for readiness internally + + yield { + "org": org, + "project": project, + "client": client, + "org_id": org["id"], + "project_id": project["id"], + "api_key": project["api_token"], + } + + # Cleanup after all tests in class are done + logger.info("Cleaning up shared organization and project") + try: + client.delete_project(project["id"]) + logger.info("Successfully deleted project: %s", project["id"]) + except Exception as e: + logger.warning("Failed to delete project %s: %s", project["id"], e) + + try: + client.delete_organization(org["id"]) + logger.info("Successfully deleted organization: %s", org["id"]) + except Exception as e: + 
logger.warning("Failed to delete organization %s: %s", org["id"], e) diff --git a/common/ingestion/acceptance_tests/implementation-plan.md b/common/ingestion/acceptance_tests/implementation-plan.md new file mode 100644 index 0000000000..2c624a60da --- /dev/null +++ b/common/ingestion/acceptance_tests/implementation-plan.md @@ -0,0 +1,85 @@ +# LLMA Acceptance Test Implementation Plan + +## Overview + +Python-based acceptance tests for the LLM Analytics capture pipeline, testing against an existing PostHog instance. + +## Test Execution + +### Local Development + +- User starts PostHog stack manually (using docker-compose or other method) +- Set POSTHOG_TEST_BASE_URL environment variable to PostHog instance URL +- Run acceptance tests: `python run_tests.py` + +### GitHub Actions + +- Triggered on PRs affecting rust/capture/ or plugin-server/ +- Sets up PostHog stack in CI +- Runs acceptance tests against the stack +- Collects logs on failure + +## Implementation Steps + +### Commit 1: Basic acceptance test infrastructure ✓ + +1. Create common/ingestion/acceptance_tests/ directory structure +2. Add utils.py for service URL management +3. Add requirements.txt with test dependencies +4. Add run_tests.py test runner script + +### Commit 2: PostHog API client for test setup + +1. Add api_client.py with PostHogTestClient class +2. Implement organization creation via API +3. Implement project creation with API key retrieval +4. Add project deletion for cleanup +5. Add event querying through PostHog Query API +6. Implement polling mechanism for event arrival + +### Commit 3: Basic event capture test + +1. Add test_basic_capture.py +2. Test sending a regular PostHog event to /capture +3. Poll Query API until event appears +4. Verify event properties match what was sent +5. Confirm end-to-end pipeline works + +### Commit 4: Test orchestration + +1. Add conftest.py with pytest fixtures +2. Add per-test project isolation +3. Add environment variable handling +4. Handle cleanup of test data + +### Commit 5: GitHub Actions workflow + +1. Add .github/workflows/llma-acceptance-tests.yml +2. Configure triggers for relevant paths +3. Add PostHog stack startup steps +4. Add test execution with pytest +5. Implement log collection on failure +6. Add cleanup steps + +### Commit 6: Documentation + +1. Add README.md with setup instructions +2. Document local running procedures +3. Document environment variables +4. Add troubleshooting guide +5. Create .env.example file +6. 
Document how to extend the test suite + +## Usage + +```bash +# Start PostHog manually (e.g., with docker-compose) +docker-compose -f docker-compose.dev-full.yml up -d + +# Set environment variable +export POSTHOG_TEST_BASE_URL=http://localhost:8010 + +# Run tests +cd common/ingestion/acceptance_tests +python run_tests.py +``` diff --git a/common/ingestion/acceptance_tests/requirements.txt b/common/ingestion/acceptance_tests/requirements.txt new file mode 100644 index 0000000000..a91f25a952 --- /dev/null +++ b/common/ingestion/acceptance_tests/requirements.txt @@ -0,0 +1,8 @@ +pytest==7.4.3 +pytest-asyncio==0.21.1 +requests==2.31.0 +requests-toolbelt==1.0.0 +boto3==1.29.7 +docker==6.1.3 +python-multipart==0.0.6 +posthog==6.7.6 diff --git a/common/ingestion/acceptance_tests/run_tests.py b/common/ingestion/acceptance_tests/run_tests.py new file mode 100755 index 0000000000..1ef6679730 --- /dev/null +++ b/common/ingestion/acceptance_tests/run_tests.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +"""Main test runner for PostHog acceptance tests.""" + +import sys +import logging +import subprocess +from pathlib import Path + +# Configure logging for the test runner +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%H:%M:%S") +logger = logging.getLogger(__name__) + + +def run_tests(): + """Run the acceptance test suite.""" + test_dir = Path(__file__).parent + + # Use virtual environment if it exists + venv_python = test_dir / ".venv" / "bin" / "python" + if venv_python.exists(): + python_cmd = str(venv_python) + else: + python_cmd = sys.executable + + # Run pytest on all test files + # -v: verbose + # -s: no capture, show print statements + # --tb=short: short traceback format + # --log-cli-level=DEBUG: show debug logs in console + # --log-cli-format: format for console logs + result = subprocess.run( + [ + python_cmd, + "-m", + "pytest", + "-v", + "-s", + "--tb=short", + "--log-cli-level=DEBUG", + "--log-cli-format=%(asctime)s [%(levelname)s] %(name)s: %(message)s", + "--log-cli-date-format=%H:%M:%S", + "--numprocesses=auto", + ], + cwd=test_dir, + stdout=sys.stdout, + stderr=sys.stderr, + ) + + return result.returncode + + +def main(): + """Main entry point for the test runner.""" + logger.info("=" * 60) + logger.info("PostHog Acceptance Tests") + logger.info("=" * 60) + + # Log environment info + import os + + logger.info("Environment:") + logger.info( + " POSTHOG_TEST_BASE_URL: %s", os.environ.get("POSTHOG_TEST_BASE_URL", "http://localhost:8010 (default)") + ) + logger.info(" POSTHOG_PERSONAL_API_KEY: %s", "SET" if os.environ.get("POSTHOG_PERSONAL_API_KEY") else "NOT SET") + + exit_code = 1 + + try: + # Run the tests + logger.info("Running acceptance tests...") + logger.info("-" * 60) + exit_code = run_tests() + logger.info("-" * 60) + + if exit_code == 0: + logger.info("✓ All tests passed!") + else: + logger.error("✗ Tests failed with exit code %s", exit_code) + + except KeyboardInterrupt: + logger.info("Interrupted by user") + exit_code = 130 + except Exception as e: + logger.exception("✗ Error: %s", e) + exit_code = 1 + + logger.info("Done.") + return exit_code + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/common/ingestion/acceptance_tests/test_basic_capture.py b/common/ingestion/acceptance_tests/test_basic_capture.py new file mode 100644 index 0000000000..e713975cb0 --- /dev/null +++ b/common/ingestion/acceptance_tests/test_basic_capture.py @@ -0,0 +1,112 @@ +"""Basic capture test - creates project, sends event, and 
verifies it.""" + +import uuid +import logging + +logger = logging.getLogger(__name__) + + +class TestBasicCapture: + """Test basic event capture flow.""" + + def test_capture_and_query_event(self, test_client): + """Test that we can capture an event and query it back.""" + logger.info("\n" + "=" * 60) + logger.info("STARTING TEST: Basic Event Capture") + logger.info("=" * 60) + + client = test_client + + # Create organization and project + org = None + project = None + + try: + # Step 1: Create test organization + logger.info("Step 1: Creating test organization") + org = client.create_organization() + org_id = org["id"] + logger.info("Organization created: %s", org_id) + + # Step 2: Create test project + logger.info("Step 2: Creating test project") + project = client.create_project(org_id) + project_id = project["id"] + project_api_key = project["api_token"] + logger.info("Project created: %s", project_id) + + # Step 3: Prepare test event + logger.info("Step 3: Preparing test event") + event_name = f"test_event_{uuid.uuid4().hex[:8]}" + distinct_id = f"test_user_{uuid.uuid4().hex[:8]}" + test_properties = {"test_property": "test_value", "test_number": 42, "test_bool": True} + logger.debug("Event name: %s", event_name) + logger.debug("Distinct ID: %s", distinct_id) + logger.debug("Properties: %s", test_properties) + + # Step 4: Send capture event + logger.info("Step 4: Sending event to capture endpoint") + client.send_capture_event( + api_key=project_api_key, + event_data={"event": event_name, "distinct_id": distinct_id, "properties": test_properties}, + ) + + logger.info("Event sent successfully") + + # Step 5: Wait for event to appear in query API + logger.info("Step 5: Waiting for event to be processed") + event = client.wait_for_event( + project_id=project_id, event_name=event_name, distinct_id=distinct_id, timeout=30 + ) + + # Verify event was found + assert event is not None, f"Event {event_name} not found after 30 seconds" + logger.info("Event found in query API") + logger.debug("Retrieved event: %s", event) + + # Step 6: Verify event properties + logger.info("Step 6: Verifying event properties") + assert ( + event.get("event") == event_name + ), f"Event name mismatch: expected {event_name}, got {event.get('event')}" + logger.debug("Event name matches: %s", event_name) + + assert ( + event.get("distinct_id") == distinct_id + ), f"Distinct ID mismatch: expected {distinct_id}, got {event.get('distinct_id')}" + logger.debug("Distinct ID matches: %s", distinct_id) + + # Check if properties match + event_properties = event.get("properties", {}) + logger.debug("Event properties: %s", event_properties) + + for key, value in test_properties.items(): + assert key in event_properties, f"Property {key} not found in event" + assert ( + event_properties[key] == value + ), f"Property {key} value mismatch: expected {value}, got {event_properties[key]}" + logger.debug("Property %s = %s", key, value) + + logger.info("All event properties verified successfully") + logger.info("Test completed successfully") + logger.info("=" * 60) + + finally: + # Cleanup + logger.info("Step 7: Cleaning up test resources") + + if project: + try: + client.delete_project(project["id"]) + logger.info("Cleaned up project: %s", project["id"]) + except Exception as e: + logger.exception("Failed to delete project: %s", e) + + if org: + try: + client.delete_organization(org["id"]) + logger.info("Cleaned up organization: %s", org["id"]) + except Exception as e: + logger.exception("Failed to delete organization: %s", e) + + 
logger.info("\n" + "=" * 60) diff --git a/common/ingestion/acceptance_tests/test_llm_analytics.py b/common/ingestion/acceptance_tests/test_llm_analytics.py new file mode 100644 index 0000000000..3f97417bf7 --- /dev/null +++ b/common/ingestion/acceptance_tests/test_llm_analytics.py @@ -0,0 +1,812 @@ +"""LLM Analytics capture tests - tests multipart blob upload and S3 storage.""" + +import gzip +import json +import uuid +import logging + +import pytest + +import requests +from requests_toolbelt import MultipartEncoder + +logger = logging.getLogger(__name__) + + +def assert_part_details(part, expected_name, expected_length, expected_content_type, expected_content_encoding=None): + """Assert comprehensive details about a multipart part.""" + assert part["name"] == expected_name, f"Expected part name '{expected_name}', got '{part['name']}'" + assert part["length"] == expected_length, f"Expected part length {expected_length}, got {part['length']}" + assert ( + part["content-type"] == expected_content_type + ), f"Expected content-type '{expected_content_type}', got '{part['content-type']}'" + assert ( + part["content-encoding"] == expected_content_encoding + ), f"Expected content-encoding '{expected_content_encoding}', got '{part['content-encoding']}'" + + +def assert_parts_order_and_details(response_data, expected_parts): + """Assert that parts are in the correct order and have correct details.""" + assert "accepted_parts" in response_data, "Response should contain accepted_parts" + assert isinstance(response_data["accepted_parts"], list), "accepted_parts should be a list" + + actual_parts = response_data["accepted_parts"] + assert len(actual_parts) == len(expected_parts), f"Expected {len(expected_parts)} parts, got {len(actual_parts)}" + + for i, (actual_part, expected_part) in enumerate(zip(actual_parts, expected_parts)): + expected_name, expected_length, expected_content_type, expected_content_encoding = expected_part + assert_part_details( + actual_part, expected_name, expected_length, expected_content_type, expected_content_encoding + ) + logger.debug(f"Part {i}: {actual_part['name']} - {actual_part['length']} bytes - {actual_part['content-type']}") + + +@pytest.mark.usefixtures("shared_org_project") +class TestLLMAnalytics: + """Test LLM Analytics capture flow with multipart requests and S3 storage.""" + + # ============================================================================ + # PHASE 1: HTTP ENDPOINT + # ============================================================================ + + # ---------------------------------------------------------------------------- + # Scenario 1.1: Event Processing Verification + # ---------------------------------------------------------------------------- + + def test_basic_ai_generation_event(self, shared_org_project): + """Test that we can capture an $ai_generation event with blob data via multipart request.""" + logger.info("\n" + "=" * 60) + logger.info("STARTING TEST: Basic $ai_generation Event Capture") + logger.info("=" * 60) + + client = shared_org_project["client"] + project_id = shared_org_project["project_id"] + project_api_key = shared_org_project["api_key"] + + # Step 1: Using shared organization and project + logger.info("Step 1: Using shared organization and project") + + # Step 2: Prepare test event data + logger.info("Step 2: Preparing $ai_generation event") + distinct_id = f"test_user_{uuid.uuid4().hex[:8]}" + + event_data = { + "event": "$ai_generation", + "distinct_id": distinct_id, + "timestamp": "2024-01-15T10:30:00Z", + 
"properties": { + "$ai_model": "gpt-4", + "$ai_provider": "openai", + "$ai_completion_tokens": 150, + "$ai_prompt_tokens": 50, + "custom_property": "test_value", + }, + } + + # Prepare blob data + input_blob = { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + "temperature": 0.7, + "max_tokens": 200, + } + + output_blob = { + "choices": [ + { + "message": {"role": "assistant", "content": "The capital of France is Paris."}, + "finish_reason": "stop", + "index": 0, + } + ], + "model": "gpt-4", + "usage": {"prompt_tokens": 50, "completion_tokens": 150, "total_tokens": 200}, + } + + # Step 3: Create multipart request + logger.info("Step 3: Creating multipart request") + + # Create multipart encoder with proper boundary + boundary = f"----WebKitFormBoundary{uuid.uuid4().hex[:16]}" + + fields = { + "event": ("event", json.dumps(event_data), "application/json"), + "event.properties.$ai_input": ( + f"blob_{uuid.uuid4().hex[:8]}", + json.dumps(input_blob), + "application/json", + ), + "event.properties.$ai_output_choices": ( + f"blob_{uuid.uuid4().hex[:8]}", + json.dumps(output_blob), + "application/json", + ), + } + + multipart_data = MultipartEncoder(fields=fields, boundary=boundary) + + # Step 4: Send multipart request to /i/v0/ai endpoint + logger.info("Step 4: Sending multipart request to /i/v0/ai endpoint") + + capture_url = f"{client.base_url}/i/v0/ai" + headers = {"Content-Type": multipart_data.content_type, "Authorization": f"Bearer {project_api_key}"} + + logger.debug("POST %s", capture_url) + logger.debug("Content-Type: %s", headers["Content-Type"]) + + response = requests.post(capture_url, data=multipart_data, headers=headers) + + logger.debug("Response status: %s", response.status_code) + logger.debug("Response body: %s", response.text) + + # Check if request was successful + response.raise_for_status() + logger.info("Multipart request sent successfully") + + # Verify response contains accepted parts + response_data = response.json() + assert "accepted_parts" in response_data + accepted_parts = response_data["accepted_parts"] + assert len(accepted_parts) == 3, f"Expected 3 parts, got {len(accepted_parts)}" + + # Verify each part has correct details + event_json = json.dumps(event_data) + input_json = json.dumps(input_blob) + output_json = json.dumps(output_blob) + + assert accepted_parts[0]["name"] == "event" + assert accepted_parts[0]["length"] == len(event_json) + assert accepted_parts[0]["content-type"] == "application/json" + + assert accepted_parts[1]["name"] == "event.properties.$ai_input" + assert accepted_parts[1]["length"] == len(input_json) + assert accepted_parts[1]["content-type"] == "application/json" + + assert accepted_parts[2]["name"] == "event.properties.$ai_output_choices" + assert accepted_parts[2]["length"] == len(output_json) + assert accepted_parts[2]["content-type"] == "application/json" + + logger.info("Response validation successful: all parts accepted with correct lengths") + + # Step 5: Wait for event to appear in query API + logger.info("Step 5: Waiting for event to be processed") + event = client.wait_for_event( + project_id=project_id, event_name="$ai_generation", distinct_id=distinct_id, timeout=30 + ) + + # Verify event was found + assert event is not None, "$ai_generation event not found after 30 seconds" + logger.info("Event found in query API") + logger.debug("Retrieved event: %s", event) + + # Step 6: Verify event properties + logger.info("Step 6: 
Verifying event properties") + assert event.get("event") == "$ai_generation" + assert event.get("distinct_id") == distinct_id + + event_properties = event.get("properties", {}) + + # Verify standard properties + assert event_properties.get("$ai_model") == "gpt-4" + assert event_properties.get("$ai_provider") == "openai" + assert event_properties.get("$ai_completion_tokens") == 150 + assert event_properties.get("$ai_prompt_tokens") == 50 + assert event_properties.get("custom_property") == "test_value" + + # Verify blob properties were replaced with S3 URLs + assert "$ai_input" in event_properties, "$ai_input property not found" + assert "$ai_output_choices" in event_properties, "$ai_output_choices property not found" + + ai_input_url = event_properties["$ai_input"] + ai_output_url = event_properties["$ai_output_choices"] + + logger.debug("$ai_input URL: %s", ai_input_url) + logger.debug("$ai_output_choices URL: %s", ai_output_url) + + # Verify URLs are S3 URLs with range parameters + assert ai_input_url.startswith("s3://"), "$ai_input should be an S3 URL" + assert ai_output_url.startswith("s3://"), "$ai_output_choices should be an S3 URL" + assert "range=" in ai_input_url, "$ai_input URL should contain range parameter" + assert "range=" in ai_output_url, "$ai_output_choices URL should contain range parameter" + + # Verify URLs point to same multipart file but different ranges + input_base = ai_input_url.split("?")[0] + output_base = ai_output_url.split("?")[0] + assert input_base == output_base, "Both URLs should point to same multipart file" + + logger.info("All event properties verified successfully") + logger.info("S3 URLs generated correctly with byte ranges") + logger.info("Test completed successfully") + logger.info("=" * 60) + + def test_ai_generation_event_with_separate_properties(self, shared_org_project): + """Test $ai_generation event with properties in a separate multipart part.""" + logger.info("\n" + "=" * 60) + logger.info("STARTING TEST: $ai_generation Event with Separate Properties") + logger.info("=" * 60) + + client = shared_org_project["client"] + project_id = shared_org_project["project_id"] + project_api_key = shared_org_project["api_key"] + + logger.info("Step 1: Using shared organization and project") + + logger.info("Step 2: Preparing $ai_generation event with separate properties") + distinct_id = f"test_user_{uuid.uuid4().hex[:8]}" + + event_data = { + "event": "$ai_generation", + "distinct_id": distinct_id, + "timestamp": "2024-01-15T10:30:00Z", + } + + properties_data = { + "$ai_model": "gpt-3.5-turbo", + "$ai_provider": "openai", + "$ai_completion_tokens": 100, + "$ai_prompt_tokens": 25, + "custom_property": "separate_test", + } + + input_blob = { + "messages": [ + {"role": "user", "content": "Tell me a joke."}, + ], + "temperature": 0.9, + } + + output_blob = { + "choices": [ + { + "message": { + "role": "assistant", + "content": "Why did the chicken cross the road? 
To get to the other side!", + }, + "finish_reason": "stop", + } + ], + } + + logger.info("Step 3: Creating multipart request with separate properties part") + boundary = f"----WebKitFormBoundary{uuid.uuid4().hex[:16]}" + + fields = { + "event": ("event", json.dumps(event_data), "application/json"), + "event.properties": ("event.properties", json.dumps(properties_data), "application/json"), + "event.properties.$ai_input": ( + f"blob_{uuid.uuid4().hex[:8]}", + json.dumps(input_blob), + "application/json", + ), + "event.properties.$ai_output_choices": ( + f"blob_{uuid.uuid4().hex[:8]}", + json.dumps(output_blob), + "application/json", + ), + } + + multipart_data = MultipartEncoder(fields=fields, boundary=boundary) + + logger.info("Step 4: Sending multipart request to /i/v0/ai endpoint") + capture_url = f"{client.base_url}/i/v0/ai" + headers = {"Content-Type": multipart_data.content_type, "Authorization": f"Bearer {project_api_key}"} + + response = requests.post(capture_url, data=multipart_data, headers=headers) + response.raise_for_status() + logger.info("Multipart request sent successfully") + + # Verify response contains accepted parts + response_data = response.json() + assert "accepted_parts" in response_data + accepted_parts = response_data["accepted_parts"] + assert len(accepted_parts) == 4, f"Expected 4 parts, got {len(accepted_parts)}" + + # Verify each part has correct details + event_json = json.dumps(event_data) + properties_json = json.dumps(properties_data) + input_json = json.dumps(input_blob) + output_json = json.dumps(output_blob) + + assert accepted_parts[0]["name"] == "event" + assert accepted_parts[0]["length"] == len(event_json) + assert accepted_parts[0]["content-type"] == "application/json" + + assert accepted_parts[1]["name"] == "event.properties" + assert accepted_parts[1]["length"] == len(properties_json) + assert accepted_parts[1]["content-type"] == "application/json" + + assert accepted_parts[2]["name"] == "event.properties.$ai_input" + assert accepted_parts[2]["length"] == len(input_json) + assert accepted_parts[2]["content-type"] == "application/json" + + assert accepted_parts[3]["name"] == "event.properties.$ai_output_choices" + assert accepted_parts[3]["length"] == len(output_json) + assert accepted_parts[3]["content-type"] == "application/json" + + logger.info("Response validation successful: all parts accepted with correct lengths") + + logger.info("Step 5: Waiting for event to be processed") + event = client.wait_for_event( + project_id=project_id, event_name="$ai_generation", distinct_id=distinct_id, timeout=30 + ) + + assert event is not None, "$ai_generation event not found after 30 seconds" + logger.info("Event found in query API") + + logger.info("Step 6: Verifying event properties") + assert event.get("event") == "$ai_generation" + assert event.get("distinct_id") == distinct_id + + event_properties = event.get("properties", {}) + + assert event_properties.get("$ai_model") == "gpt-3.5-turbo" + assert event_properties.get("$ai_provider") == "openai" + assert event_properties.get("$ai_completion_tokens") == 100 + assert event_properties.get("$ai_prompt_tokens") == 25 + assert event_properties.get("custom_property") == "separate_test" + + assert "$ai_input" in event_properties + assert "$ai_output_choices" in event_properties + + ai_input_url = event_properties["$ai_input"] + ai_output_url = event_properties["$ai_output_choices"] + + assert ai_input_url.startswith("s3://") + assert ai_output_url.startswith("s3://") + assert "range=" in ai_input_url + assert 
"range=" in ai_output_url + + input_base = ai_input_url.split("?")[0] + output_base = ai_output_url.split("?")[0] + assert input_base == output_base + + logger.info("All event properties verified successfully") + logger.info("Separate properties part handled correctly") + logger.info("Test completed successfully") + logger.info("=" * 60) + + def test_all_accepted_ai_event_types(self, shared_org_project): + """Test that all six accepted AI event types are successfully captured and stored.""" + client = shared_org_project["client"] + project_id = shared_org_project["project_id"] + api_key = shared_org_project["api_key"] + + base_distinct_id = f"user_{uuid.uuid4()}" + + # Define all event types with their specific properties + events_to_test = [ + { + "event_type": "$ai_generation", + "distinct_id": f"{base_distinct_id}_generation", + "properties": { + "$ai_model": "test-model", + "$ai_provider": "test-provider", + "$ai_input_tokens": 100, + "$ai_output_tokens": 50, + }, + }, + { + "event_type": "$ai_trace", + "distinct_id": f"{base_distinct_id}_trace", + "properties": { + "$ai_model": "test-model", + "$ai_provider": "test-provider", + "$ai_trace_id": str(uuid.uuid4()), + }, + }, + { + "event_type": "$ai_span", + "distinct_id": f"{base_distinct_id}_span", + "properties": { + "$ai_model": "test-model", + "$ai_provider": "test-provider", + "$ai_trace_id": str(uuid.uuid4()), + "$ai_span_id": str(uuid.uuid4()), + }, + }, + { + "event_type": "$ai_embedding", + "distinct_id": f"{base_distinct_id}_embedding", + "properties": { + "$ai_model": "test-model", + "$ai_provider": "test-provider", + "$ai_input_tokens": 75, + }, + }, + { + "event_type": "$ai_metric", + "distinct_id": f"{base_distinct_id}_metric", + "properties": { + "$ai_model": "test-model", + "$ai_provider": "test-provider", + "$ai_metric_type": "latency", + "$ai_metric_value": 1.23, + }, + }, + { + "event_type": "$ai_feedback", + "distinct_id": f"{base_distinct_id}_feedback", + "properties": { + "$ai_model": "test-model", + "$ai_provider": "test-provider", + "$ai_feedback_score": 5, + "$ai_feedback_comment": "Great response", + }, + }, + ] + + # Send all events + for event_spec in events_to_test: + event_type = event_spec["event_type"] + distinct_id = event_spec["distinct_id"] + logger.info(f"Sending {event_type} event") + + event_data = { + "event": event_type, + "distinct_id": distinct_id, + "$set": {"test_user": True, "event_type_test": event_type}, + } + + fields = { + "event": ("event", json.dumps(event_data), "application/json"), + "event.properties": ("event.properties", json.dumps(event_spec["properties"]), "application/json"), + } + + multipart_data = MultipartEncoder(fields=fields) + response = requests.post( + f"{client.base_url}/i/v0/ai", + data=multipart_data, + headers={"Content-Type": multipart_data.content_type, "Authorization": f"Bearer {api_key}"}, + ) + + assert ( + response.status_code == 200 + ), f"Expected 200 for {event_type}, got {response.status_code}: {response.text}" + response_data = response.json() + assert len(response_data["accepted_parts"]) == 2 + logger.info(f"{event_type} event sent successfully") + + logger.info("All event types sent successfully, now querying to verify storage") + + # Query and verify all events + for event_spec in events_to_test: + event_type = event_spec["event_type"] + distinct_id = event_spec["distinct_id"] + logger.info(f"Querying {event_type} event with distinct_id {distinct_id}") + + event = client.wait_for_event(project_id, event_type, distinct_id) + assert event is not None, 
f"Event {event_type} not found" + assert event["event"] == event_type + assert event["distinct_id"] == distinct_id + assert event["properties"]["$ai_model"] == "test-model" + assert event["properties"]["$ai_provider"] == "test-provider" + logger.info(f"{event_type} event verified successfully") + + logger.info("All six AI event types verified successfully") + + # ============================================================================ + # PHASE 4: MULTIPART FILE PROCESSING + # ============================================================================ + + # ---------------------------------------------------------------------------- + # Scenario 4.3: Content Type Handling + # ---------------------------------------------------------------------------- + + def test_ai_generation_event_with_different_content_types(self, shared_org_project): + """Test sending events with blobs using different supported content types.""" + client = shared_org_project["client"] + project_id = shared_org_project["project_id"] + api_key = shared_org_project["api_key"] + + base_distinct_id = f"user_{uuid.uuid4()}" + + # Send Event 1: application/json blob + logger.info("Sending event with application/json blob") + distinct_id_json = f"{base_distinct_id}_json" + event_data_json = { + "event": "$ai_generation", + "distinct_id": distinct_id_json, + "$set": {"test_user": True, "content_type_test": "json"}, + } + properties_data_json = { + "$ai_model": "gpt-4", + "$ai_model_parameters": {"temperature": 0.7}, + } + json_blob_data = {"context": "This is JSON formatted LLM input", "tokens": 150} + + fields_json = { + "event": ("event", json.dumps(event_data_json), "application/json"), + "event.properties": ("event.properties", json.dumps(properties_data_json), "application/json"), + "event.properties.$ai_input": ("blob_json", json.dumps(json_blob_data), "application/json"), + } + + multipart_data_json = MultipartEncoder(fields=fields_json) + response_json = requests.post( + f"{client.base_url}/i/v0/ai", + data=multipart_data_json, + headers={"Content-Type": multipart_data_json.content_type, "Authorization": f"Bearer {api_key}"}, + ) + assert response_json.status_code == 200, f"Expected 200, got {response_json.status_code}: {response_json.text}" + response_data_json = response_json.json() + assert len(response_data_json["accepted_parts"]) == 3 + parts_by_name_json = {part["name"]: part for part in response_data_json["accepted_parts"]} + assert parts_by_name_json["event.properties.$ai_input"]["content-type"] == "application/json" + + # Send Event 2: text/plain blob + logger.info("Sending event with text/plain blob") + distinct_id_text = f"{base_distinct_id}_text" + event_data_text = { + "event": "$ai_generation", + "distinct_id": distinct_id_text, + "$set": {"test_user": True, "content_type_test": "text"}, + } + properties_data_text = { + "$ai_model": "gpt-4", + "$ai_model_parameters": {"temperature": 0.5}, + } + text_blob_data = "This is plain text LLM output with multiple lines.\nSecond line here.\nThird line." 
+ + fields_text = { + "event": ("event", json.dumps(event_data_text), "application/json"), + "event.properties": ("event.properties", json.dumps(properties_data_text), "application/json"), + "event.properties.$ai_output": ("blob_text", text_blob_data, "text/plain"), + } + + multipart_data_text = MultipartEncoder(fields=fields_text) + response_text = requests.post( + f"{client.base_url}/i/v0/ai", + data=multipart_data_text, + headers={"Content-Type": multipart_data_text.content_type, "Authorization": f"Bearer {api_key}"}, + ) + assert response_text.status_code == 200, f"Expected 200, got {response_text.status_code}: {response_text.text}" + response_data_text = response_text.json() + assert len(response_data_text["accepted_parts"]) == 3 + parts_by_name_text = {part["name"]: part for part in response_data_text["accepted_parts"]} + assert parts_by_name_text["event.properties.$ai_output"]["content-type"] == "text/plain" + + # Send Event 3: application/octet-stream blob + logger.info("Sending event with application/octet-stream blob") + distinct_id_binary = f"{base_distinct_id}_binary" + event_data_binary = { + "event": "$ai_generation", + "distinct_id": distinct_id_binary, + "$set": {"test_user": True, "content_type_test": "binary"}, + } + properties_data_binary = { + "$ai_model": "gpt-4", + "$ai_model_parameters": {"temperature": 0.9}, + } + binary_blob_data = bytes([0x00, 0x01, 0x02, 0x03, 0x04, 0xFF, 0xFE, 0xFD]) + + fields_binary = { + "event": ("event", json.dumps(event_data_binary), "application/json"), + "event.properties": ("event.properties", json.dumps(properties_data_binary), "application/json"), + "event.properties.$ai_embedding_vector": ("blob_binary", binary_blob_data, "application/octet-stream"), + } + + multipart_data_binary = MultipartEncoder(fields=fields_binary) + response_binary = requests.post( + f"{client.base_url}/i/v0/ai", + data=multipart_data_binary, + headers={"Content-Type": multipart_data_binary.content_type, "Authorization": f"Bearer {api_key}"}, + ) + assert ( + response_binary.status_code == 200 + ), f"Expected 200, got {response_binary.status_code}: {response_binary.text}" + response_data_binary = response_binary.json() + assert len(response_data_binary["accepted_parts"]) == 3 + parts_by_name_binary = {part["name"]: part for part in response_data_binary["accepted_parts"]} + assert ( + parts_by_name_binary["event.properties.$ai_embedding_vector"]["content-type"] == "application/octet-stream" + ) + + logger.info("All three events sent successfully, now querying to verify storage") + + # Query and verify Event 1 (JSON blob) + event_json = client.wait_for_event(project_id, "$ai_generation", distinct_id_json) + assert event_json is not None, "Event with JSON blob not found" + assert event_json["properties"]["$ai_model"] == "gpt-4" + logger.info(f"Event 1 (JSON blob) verified: {distinct_id_json}") + + # Query and verify Event 2 (text blob) + event_text = client.wait_for_event(project_id, "$ai_generation", distinct_id_text) + assert event_text is not None, "Event with text blob not found" + assert event_text["properties"]["$ai_model"] == "gpt-4" + logger.info(f"Event 2 (text blob) verified: {distinct_id_text}") + + # Query and verify Event 3 (binary blob) + event_binary = client.wait_for_event(project_id, "$ai_generation", distinct_id_binary) + assert event_binary is not None, "Event with binary blob not found" + assert event_binary["properties"]["$ai_model"] == "gpt-4" + logger.info(f"Event 3 (binary blob) verified: {distinct_id_binary}") + + logger.info("All three 
content type events verified successfully") + + # TODO: Verify blob properties have S3 URLs once S3 upload is implemented + + # ============================================================================ + # PHASE 5: AUTHORIZATION + # ============================================================================ + + # ---------------------------------------------------------------------------- + # Scenario 5.1: API Key Authentication + # ---------------------------------------------------------------------------- + + def test_ai_endpoint_invalid_auth_returns_401(self, function_test_client): + """Test that requests with invalid API key return 401 Unauthorized.""" + client = function_test_client + + event_data = { + "event": "$ai_generation", + "distinct_id": f"test_user_{uuid.uuid4().hex[:8]}", + } + + properties_data = {"$ai_model": "test"} + + fields = { + "event": ("event", json.dumps(event_data), "application/json"), + "event.properties": ("event.properties", json.dumps(properties_data), "application/json"), + } + + multipart_data = MultipartEncoder(fields=fields) + response = requests.post( + f"{client.base_url}/i/v0/ai", + data=multipart_data, + headers={"Content-Type": multipart_data.content_type, "Authorization": "Bearer invalid_key_123"}, + ) + assert response.status_code == 401, f"Expected 401, got {response.status_code}" + + # ============================================================================ + # PHASE 7: COMPRESSION + # ============================================================================ + + # ---------------------------------------------------------------------------- + # Scenario 7.1: Mixed Compression + # ---------------------------------------------------------------------------- + + def test_ai_generation_event_with_gzip_compression(self, shared_org_project): + """Test $ai_generation event with gzip compression for the entire request.""" + logger.info("\n" + "=" * 60) + logger.info("STARTING TEST: $ai_generation Event with Gzip Compression") + logger.info("=" * 60) + + client = shared_org_project["client"] + project_id = shared_org_project["project_id"] + project_api_key = shared_org_project["api_key"] + + logger.info("Step 1: Using shared organization and project") + + logger.info("Step 2: Preparing $ai_generation event") + distinct_id = f"test_user_{uuid.uuid4().hex[:8]}" + + event_data = { + "event": "$ai_generation", + "distinct_id": distinct_id, + "timestamp": "2024-01-15T10:30:00Z", + "properties": { + "$ai_model": "gpt-4-compressed", + "$ai_provider": "openai", + "$ai_completion_tokens": 75, + "$ai_prompt_tokens": 30, + "compression": "gzip", + }, + } + + input_blob = { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain compression in simple terms."}, + ], + } + + output_blob = { + "choices": [ + { + "message": { + "role": "assistant", + "content": "Compression reduces data size by encoding information more efficiently.", + }, + "finish_reason": "stop", + } + ], + } + + logger.info("Step 3: Creating multipart request") + boundary = f"----WebKitFormBoundary{uuid.uuid4().hex[:16]}" + + fields = { + "event": ("event", json.dumps(event_data), "application/json"), + "event.properties.$ai_input": ( + f"blob_{uuid.uuid4().hex[:8]}", + json.dumps(input_blob), + "application/json", + ), + "event.properties.$ai_output_choices": ( + f"blob_{uuid.uuid4().hex[:8]}", + json.dumps(output_blob), + "application/json", + ), + } + + multipart_data = MultipartEncoder(fields=fields, boundary=boundary) + 
+ logger.info("Step 4: Compressing request body with gzip") + uncompressed_body = multipart_data.to_string() + compressed_body = gzip.compress(uncompressed_body) + + logger.debug("Uncompressed size: %d bytes", len(uncompressed_body)) + logger.debug("Compressed size: %d bytes", len(compressed_body)) + logger.debug("Compression ratio: %.2f%%", (1 - len(compressed_body) / len(uncompressed_body)) * 100) + + logger.info("Step 5: Sending compressed multipart request to /i/v0/ai endpoint") + capture_url = f"{client.base_url}/i/v0/ai" + headers = { + "Content-Type": multipart_data.content_type, + "Content-Encoding": "gzip", + "Authorization": f"Bearer {project_api_key}", + } + + response = requests.post(capture_url, data=compressed_body, headers=headers) + response.raise_for_status() + logger.info("Compressed multipart request sent successfully") + + # Verify response contains accepted parts + response_data = response.json() + assert "accepted_parts" in response_data + accepted_parts = response_data["accepted_parts"] + assert len(accepted_parts) == 3, f"Expected 3 parts, got {len(accepted_parts)}" + + # Verify each part has correct details (lengths should match uncompressed data) + event_json = json.dumps(event_data) + input_json = json.dumps(input_blob) + output_json = json.dumps(output_blob) + + assert accepted_parts[0]["name"] == "event" + assert accepted_parts[0]["length"] == len(event_json) + assert accepted_parts[0]["content-type"] == "application/json" + + assert accepted_parts[1]["name"] == "event.properties.$ai_input" + assert accepted_parts[1]["length"] == len(input_json) + assert accepted_parts[1]["content-type"] == "application/json" + + assert accepted_parts[2]["name"] == "event.properties.$ai_output_choices" + assert accepted_parts[2]["length"] == len(output_json) + assert accepted_parts[2]["content-type"] == "application/json" + + logger.info("Response validation successful: decompressed parts have correct lengths") + + logger.info("Step 6: Waiting for event to be processed") + event = client.wait_for_event( + project_id=project_id, event_name="$ai_generation", distinct_id=distinct_id, timeout=30 + ) + + assert event is not None, "$ai_generation event not found after 30 seconds" + logger.info("Event found in query API") + + logger.info("Step 7: Verifying event properties") + assert event.get("event") == "$ai_generation" + assert event.get("distinct_id") == distinct_id + + event_properties = event.get("properties", {}) + + assert event_properties.get("$ai_model") == "gpt-4-compressed" + assert event_properties.get("$ai_provider") == "openai" + assert event_properties.get("compression") == "gzip" + + assert "$ai_input" in event_properties + assert "$ai_output_choices" in event_properties + + ai_input_url = event_properties["$ai_input"] + ai_output_url = event_properties["$ai_output_choices"] + + assert ai_input_url.startswith("s3://") + assert ai_output_url.startswith("s3://") + assert "range=" in ai_input_url + assert "range=" in ai_output_url + + logger.info("All event properties verified successfully") + logger.info("Gzip compression handled correctly") + logger.info("Test completed successfully") + logger.info("=" * 60) diff --git a/common/ingestion/acceptance_tests/utils.py b/common/ingestion/acceptance_tests/utils.py new file mode 100644 index 0000000000..2d6b416055 --- /dev/null +++ b/common/ingestion/acceptance_tests/utils.py @@ -0,0 +1,17 @@ +"""Utilities for acceptance tests.""" + +import os + + +def get_service_url(service: str = "proxy") -> str: + """Get the URL for a 
service.""" + if base_url := os.environ.get("POSTHOG_TEST_BASE_URL"): + return base_url + + service_urls = { + "proxy": "http://localhost:8010", + "s3": "http://localhost:19000", + "clickhouse": "http://localhost:8123", + } + + return service_urls.get(service, "http://localhost:8010") diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 4a219809ac..b295f06f22 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -36,6 +36,8 @@ services: path /batch* path /capture path /capture* + path /ai + path /ai/* } @flags { diff --git a/rust/Cargo.lock b/rust/Cargo.lock index db9f39c65f..a6307ff280 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1528,6 +1528,7 @@ dependencies = [ "lz-str", "metrics", "metrics-exporter-prometheus", + "multer", "once_cell", "opentelemetry 0.22.0", "opentelemetry-otlp", @@ -4850,6 +4851,23 @@ dependencies = [ "uuid", ] +[[package]] +name = "multer" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" +dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http 1.1.0", + "httparse", + "memchr", + "mime", + "spin 0.9.8", + "version_check", +] + [[package]] name = "native-tls" version = "0.2.11" @@ -6406,6 +6424,7 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "native-tls", "once_cell", "percent-encoding", diff --git a/rust/capture/Cargo.toml b/rust/capture/Cargo.toml index db36ab16ec..aea737d5ff 100644 --- a/rust/capture/Cargo.toml +++ b/rust/capture/Cargo.toml @@ -52,6 +52,8 @@ jiff = "0.1" dateparser = "0.2" lz-str = { workspace = true } regex = { workspace = true } +multer = "3.0" +futures = { workspace = true } [dev-dependencies] assert-json-diff = { workspace = true } @@ -61,5 +63,5 @@ futures = { workspace = true } once_cell = { workspace = true } rand = { workspace = true } rdkafka = { workspace = true } -reqwest = { workspace = true } +reqwest = { workspace = true, features = ["multipart"] } serde_json = { workspace = true } diff --git a/rust/capture/docs/llma-acceptance-test-suite.md b/rust/capture/docs/llma-acceptance-test-suite.md new file mode 100644 index 0000000000..e77924a9c9 --- /dev/null +++ b/rust/capture/docs/llma-acceptance-test-suite.md @@ -0,0 +1,218 @@ +# LLMA Acceptance Test Suite + +## Overview + +This document describes the Python acceptance test suite for the LLM Analytics pipeline. These tests validate **end-to-end functionality** that requires real PostHog infrastructure: database, S3 storage, Kafka, authentication services, and the full ingestion pipeline. + +**Implementation Requirement**: Each phase in the implementation plan must pass its corresponding acceptance tests before proceeding to the next phase. This ensures incremental validation and prevents regression as new features are added. 
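For orientation, the sketch below shows the request shape the scenarios in this document exercise, using the same `requests` / `requests_toolbelt` stack as the Python acceptance tests in this PR. The `/i/v0/ai` path, the Bearer project-API-key header, the part naming convention (`event`, `event.properties`, `event.properties.$ai_*`), and the `accepted_parts` response field all mirror those tests; the base URL and API key values are placeholders, not real credentials.

```python
# Minimal sketch of an LLMA capture request, assuming a locally running PostHog
# instance. Values marked "placeholder" are illustrative, not part of this PR.
import json

import requests
from requests_toolbelt import MultipartEncoder

BASE_URL = "http://localhost:8010"           # placeholder PostHog instance URL
PROJECT_API_KEY = "phc_example_project_key"  # placeholder project API token

event = {"event": "$ai_generation", "distinct_id": "user_123"}
properties = {"$ai_model": "gpt-4", "$ai_provider": "openai"}
input_blob = {"messages": [{"role": "user", "content": "Hello"}]}

multipart = MultipartEncoder(fields={
    # Event envelope and (optionally) a separate properties part, both JSON.
    "event": ("event", json.dumps(event), "application/json"),
    "event.properties": ("event.properties", json.dumps(properties), "application/json"),
    # Blob part: stored as a blob and referenced from the ingested event.
    "event.properties.$ai_input": ("blob_input", json.dumps(input_blob), "application/json"),
})

response = requests.post(
    f"{BASE_URL}/i/v0/ai",
    data=multipart,
    headers={
        "Content-Type": multipart.content_type,
        "Authorization": f"Bearer {PROJECT_API_KEY}",
    },
)
response.raise_for_status()

# The endpoint echoes each accepted part with its name, byte length, and content type.
for part in response.json()["accepted_parts"]:
    print(part["name"], part["length"], part["content-type"])
```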
+ +## Test Architecture + +### Test Environment + +- **Local PostHog Deployment**: Complete local PostHog setup including capture service with `/i/v0/ai` endpoint +- **S3-Compatible Storage**: Local S3-compatible storage for blob storage with direct access for verification +- **PostHog Query API**: Used to fetch processed events from the ingestion pipeline for validation +- **Direct S3 Access**: Test suite has direct S3 client access to verify blob storage and retrieval +- **Database**: PostgreSQL for team/token storage and event metadata +- **Kafka**: Event streaming for ingestion pipeline +- **ClickHouse**: Event storage and query API + +### Test Framework + +- **Acceptance Tests**: Full pipeline tests from HTTP request through ingestion to event storage +- **PostHog API Client**: Direct integration with PostHog query API to verify event processing +- **S3 Client**: Direct access to verify S3 blob storage, metadata, and retrieval +- **Parameterized Tests**: Test variations across different event types, blob sizes, and configurations +- **Async Testing**: Support for testing concurrent requests and large payload processing + +### Test Data + +- **AI Event Types**: `$ai_trace`, `$ai_span`, `$ai_generation`, `$ai_embedding`, `$ai_metric`, `$ai_feedback` +- **Blob Sizes**: Small (1KB), Medium (100KB), Large (1MB), Extra Large (10MB+) +- **Content Types**: `application/json`, `text/plain`, `application/octet-stream` +- **Compression**: Gzipped and uncompressed blobs +- **Encoding**: Raw binary, text, and base64-encoded blobs +- **Multipart Boundaries**: Various boundary strings to test collision handling + +## Test Scenarios + +**Phase Completion Requirement**: All acceptance tests for a phase must pass before implementation can proceed to the next phase. This gate ensures quality and prevents compound issues from multiple incomplete features. 
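+
+The scenarios below all exercise variations of the same basic request shape. As a rough sketch of how such a request can be built (the `requests` library, the local proxy URL, and the placeholder values are illustrative assumptions, not the actual test code):
+
+```python
+import json
+import requests
+
+# The event part must come first; blob parts are named after the property they fill.
+files = {
+    "event": (None, json.dumps({"event": "$ai_generation", "distinct_id": "user_123"}), "application/json"),
+    "event.properties": (None, json.dumps({"$ai_model": "gpt-4", "$ai_provider": "openai"}), "application/json"),
+    "event.properties.$ai_input": ("blob_1", json.dumps([{"role": "user", "content": "Hello"}]), "application/json"),
+}
+
+response = requests.post(
+    "http://localhost:8010/i/v0/ai",  # local proxy URL used elsewhere in this suite
+    files=files,
+    headers={"Authorization": "Bearer <project_api_key>"},
+)
+response.raise_for_status()
+print(response.json()["accepted_parts"])  # one entry per accepted multipart part
+```
+
+The phase scenarios then go further than the HTTP response, checking S3 contents and the processed event via the PostHog query API.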
+ +### Phase 1: HTTP Endpoint + +#### Scenario 1.1: Event Processing Verification + +- **Test**: Send multipart request and verify event reaches PostHog query API +- **Validation**: Use PostHog query API to fetch processed event, verify blob placeholders correctly inserted +- **Tests Implemented**: + - `test_basic_ai_generation_event`: Full end-to-end event processing + - `test_all_accepted_ai_event_types`: Verify all six supported AI event types are accepted and stored + +### Phase 2: Basic S3 Uploads + +#### Scenario 2.1: Individual Blob Upload + +- **Test**: Upload blobs of various sizes as separate S3 objects +- **Validation**: Verify each blob stored correctly, S3 URLs generated in event properties +- **Variations**: Small/medium/large blobs, different content types + +#### Scenario 2.2: S3 URL Generation and Access + +- **Test**: Verify generated S3 URLs in PostHog events point to accessible objects +- **Validation**: Query PostHog API for events, extract S3 URLs, verify blobs retrievable from S3 + +#### Scenario 2.3: Blob Metadata Storage + +- **Test**: Verify S3 object metadata is stored correctly +- **Validation**: Use S3 client to inspect object metadata - Content-Type, size, team_id present + +#### Scenario 2.4: Team Data Isolation + +- **Test**: Multiple teams uploading simultaneously +- **Validation**: Verify S3 key prefixes are team-scoped, no cross-team data access, proper S3 path isolation + +### Phase 4: Multipart File Processing + +#### Scenario 4.1: Multipart File Creation + +- **Test**: Upload events with multiple blobs, verify multipart/mixed format +- **Validation**: Use S3 client to verify single S3 file contains all blobs, proper MIME boundaries, metadata preserved +- **Variations**: 2-10 blobs per event, mixed content types, different blob sizes + +#### Scenario 4.2: Byte Range URLs and Access + +- **Test**: Verify S3 URLs in PostHog events include correct byte range parameters +- **Validation**: Query PostHog API for events, verify URLs contain range parameters, use S3 client to test range requests + +#### Scenario 4.3: Content Type Handling + +- **Test**: Mix of JSON, text, and binary blobs in single multipart file +- **Validation**: Content types preserved in multipart format, correctly parsed + +### Phase 5: Authorization + +#### Scenario 5.1: API Key Authentication + +- **Test**: Send requests with valid/invalid/missing API keys +- **Validation**: Valid keys accepted, invalid keys rejected with 401, proper error messages +- **Tests Implemented**: + - `test_ai_endpoint_invalid_auth_returns_401`: Invalid token validation + +### Phase 7: Compression + +#### Scenario 7.1: Mixed Compression + +- **Test**: Single request with both compressed and uncompressed blobs +- **Validation**: Each blob handled according to its compression state + +### Phase 9: Limits (Optional) - DO NOT IMPLEMENT, TBD + +### Phase 10: Data Deletion (Optional) - DO NOT IMPLEMENT, TBD + +### Cross-Team Isolation Testing + +## Error Recovery and Edge Cases + +### Edge Case Scenarios + +#### Scenario E.1: S3 Service Interruption + +- **Test**: Simulate S3 unavailability during uploads +- **Validation**: Proper error responses, retry logic works, no data loss + +#### Scenario E.2: Kafka Unavailability + +- **Test**: Simulate Kafka unavailability during event publishing +- **Validation**: Appropriate error handling, request failure communicated to client + +## Local Development Testing + +### Test Implementation + +The acceptance test suite is implemented in Python using pytest to test against full 
PostHog infrastructure. + +#### Test Structure + +- **Location**: `common/ingestion/acceptance_tests/test_llm_analytics.py` +- **Framework**: pytest with async support +- **Dependencies**: + - `requests` for HTTP client operations + - `boto3` for S3 client operations (when needed) + - PostHog SDK or API for event querying + - Django test utilities for setup + +### Local Test Environment Setup + +#### Prerequisites + +- **Local PostHog Instance**: Full PostHog deployment running locally with all services (Django, capture, Kafka, ClickHouse, PostgreSQL) +- **Capture Service**: Running with `/i/v0/ai` endpoint enabled +- **Personal API Key**: PostHog personal API key for creating test organizations/projects + +#### Environment Configuration + +```bash +# PostHog Instance (defaults to http://localhost:8010 if not set) +export POSTHOG_TEST_BASE_URL="http://localhost:8010" + +# Personal API Key (required - no default) +export POSTHOG_PERSONAL_API_KEY="your_personal_api_key_here" +``` + +**Creating a Personal API Key:** + +1. Navigate to your PostHog instance (e.g., `http://localhost:8010`) +2. Go to **Settings** (sidebar) → **Account** → **Personal API Keys** +3. Click **Create personal API key** +4. Configure the key: + - **Organization & project access**: Set to **All** (to avoid permission issues) + - **Scopes**: Set to **All access** (required for creating/deleting test organizations and projects) +5. Copy the generated key and set it as `POSTHOG_PERSONAL_API_KEY` + +**Note**: The test suite automatically creates temporary organizations and projects for each test class and cleans them up after tests complete. S3 configuration is handled by the PostHog instance itself. + +### Test Execution + +#### Running Tests + +```bash +# Run all acceptance tests using the test runner +cd common/ingestion/acceptance_tests +python run_tests.py + +# Or run pytest directly for specific tests +pytest test_llm_analytics.py::TestLLMAnalytics::test_basic_ai_generation_event -v + +# Run specific test class +pytest test_llm_analytics.py::TestLLMAnalytics -v +``` + +The `run_tests.py` script automatically: + +- Configures pytest with appropriate logging and verbosity settings +- Runs tests in parallel using `--numprocesses=auto` for faster execution +- Shows detailed debug logs during test execution + +#### Test Utilities + +Each test phase will include common utilities for: + +- **Multipart Request Builder**: Construct multipart/form-data requests with event JSON and blob parts +- **S3 Client Wrapper**: Direct S3 operations for validation and cleanup +- **PostHog API Client**: Query PostHog API to verify event processing +- **Test Data Generators**: Create various blob sizes, content types, and event payloads +- **Cleanup Helpers**: Remove test data from S3 and PostHog between test runs + +#### Test Data Management + +- **Isolated Test Teams**: Each test uses unique team IDs to prevent interference +- **Cleanup Between Tests**: Automatic cleanup of S3 objects and PostHog test data +- **Fixture Data**: Predefined multipart requests and blob data for consistent testing +- **Random Data Generation**: Configurable blob sizes and content for stress testing + +## Phase Gating + +- **Mandatory Testing**: All acceptance tests for a phase must pass before proceeding to implementation of the next phase +- **Regression Prevention**: Previous phase tests continue to run to ensure no regression +- **Incremental Validation**: Each phase builds upon validated functionality from previous phases diff --git 
a/rust/capture/docs/llma-capture-implementation-plan.md b/rust/capture/docs/llma-capture-implementation-plan.md index 790169d5e3..7d71be3797 100644 --- a/rust/capture/docs/llma-capture-implementation-plan.md +++ b/rust/capture/docs/llma-capture-implementation-plan.md @@ -10,13 +10,13 @@ This document outlines the implementation steps for the LLM Analytics capture pi #### 0.1 Routing Configuration -- [ ] Create new `/ai` endpoint in capture service -- [ ] Set up routing for `/ai` endpoint to capture service +- [x] Create new `/i/v0/ai` endpoint in capture service +- [ ] Set up routing for `/i/v0/ai` endpoint to capture service #### 0.2 End-to-End Integration Tests -- [ ] Implement end-to-end integration tests for the full LLM analytics pipeline -- [ ] Create test scenarios with multipart requests and blob data +- [x] Implement Rust integration tests for multipart parsing and validation +- [x] Create Python acceptance test scenarios with multipart requests and blob data - [ ] Test Kafka message output and S3 storage integration - [ ] Set up automated test suite for continuous validation @@ -24,20 +24,25 @@ This document outlines the implementation steps for the LLM Analytics capture pi #### 1.1 HTTP Endpoint Foundation -- [ ] Implement multipart/form-data request parsing -- [ ] Add server-side boundary validation +- [x] Implement multipart/form-data request parsing +- [x] Add server-side boundary validation +- [x] Support separate `event.properties` multipart part +- [x] Implement gzip decompression for compressed requests - [ ] Output events with blob placeholders to Kafka -- [ ] Implement error schema +- [x] Implement error schema #### 1.2 Basic Validation -- [ ] Implement `$ai_` event name prefix validation -- [ ] Validate blob part names against event properties -- [ ] Prevent blob overwriting of existing properties +- [x] Implement specific AI event type validation ($ai_generation, $ai_trace, $ai_span, $ai_embedding, $ai_metric, $ai_feedback) +- [x] Validate blob part names against event properties +- [x] Prevent blob overwriting of existing properties (reject if both embedded and separate properties) +- [x] Validate event part is first in multipart request +- [x] Validate required fields (event name, distinct_id, $ai_model) +- [x] Implement size limits (32KB event, 960KB combined, 25MB total, 27.5MB request body) #### 1.3 Initial Deployment -- [ ] Deploy capture-ai service to production with basic `/ai` endpoint +- [ ] Deploy capture-ai service to production with basic `/i/v0/ai` endpoint - [ ] Test basic multipart parsing and Kafka output functionality - [ ] Verify endpoint responds correctly to AI events @@ -81,6 +86,7 @@ This document outlines the implementation steps for the LLM Analytics capture pi #### 5.1 Request Signature Verification +- [x] Implement basic API key validation (Bearer token authentication) - [ ] Implement PostHog API key authentication - [ ] Add request signature verification - [ ] Validate API key before processing multipart data @@ -96,8 +102,8 @@ This document outlines the implementation steps for the LLM Analytics capture pi #### 6.2 Alerting - [ ] Configure alerts for S3 upload failures -- [ ] Set up alerts for high error rates on `/ai` endpoint -- [ ] Set up alerts for high latency on `/ai` endpoint +- [ ] Set up alerts for high error rates on `/i/v0/ai` endpoint +- [ ] Set up alerts for high latency on `/i/v0/ai` endpoint #### 6.3 Runbooks @@ -107,28 +113,37 @@ This document outlines the implementation steps for the LLM Analytics capture pi #### 7.1 
Compression Support -- [ ] Parse Content-Encoding headers from SDK requests -- [ ] Implement server-side compression for uncompressed text/JSON -- [ ] Add compression metadata to multipart files -- [ ] Handle mixed compressed/uncompressed blobs +- [x] Parse Content-Encoding: gzip header for request-level compression +- [x] Implement streaming gzip decompression for compressed requests +- [x] Test with gzip-compressed multipart requests +- [ ] Implement server-side compression for uncompressed blobs before S3 storage +- [ ] Add compression metadata to S3 objects - [ ] Track compression ratio effectiveness ### Phase 8: Schema Validation #### 8.1 Schema Validation -- [ ] Create strict schema definitions for each AI event type -- [ ] Add schema validation for event payloads -- [ ] Validate Content-Type headers on blob parts -- [ ] Add Content-Length validation +- [x] Validate Content-Type headers on blob parts (required: application/json, text/plain, application/octet-stream) +- [x] Validate event JSON structure (event, distinct_id, properties fields) +- [x] Validate required AI properties ($ai_model) +- [x] Test with different supported content types +- [ ] Create comprehensive schema definitions for each AI event type +- [ ] Add detailed schema validation for event-specific properties +- [ ] Add Content-Length validation beyond size limits ### Phase 9: Limits (Optional) #### 9.1 Request Validation & Limits -- [ ] Add request size limits and validation +- [x] Add request size limits and validation (configurable via `ai_max_sum_of_parts_bytes`) +- [x] Implement event part size limit (32KB) +- [x] Implement combined event+properties size limit (960KB) +- [x] Implement total parts size limit (25MB default, configurable) +- [x] Implement request body size limit (110% of total parts limit) +- [x] Return 413 Payload Too Large for size violations - [ ] Add request rate limiting per team -- [ ] Implement payload size limits per team +- [ ] Implement per-team payload size limits ### Phase 10: Data Deletion (Optional) diff --git a/rust/capture/docs/llma-capture-overview.md b/rust/capture/docs/llma-capture-overview.md index 100aae92b9..f3bdd7776a 100644 --- a/rust/capture/docs/llma-capture-overview.md +++ b/rust/capture/docs/llma-capture-overview.md @@ -13,43 +13,101 @@ This approach allows us to capture comprehensive LLM usage data without impactin ## Supported Events -### Events with Large Context Payloads +The LLM Analytics capture endpoint supports four primary AI event types. Events are sent to the `/i/v0/ai` endpoint with multipart payloads to handle large context data efficiently. -These events contain substantial LLM context that requires blob storage: +### `$ai_generation` -- **`$ai_trace`** - - `$ai_input_state` - - `$ai_output_state` +A generation represents a single call to an LLM (e.g., a chat completion request). 
-- **`$ai_span`** - - `$ai_input_state` - - `$ai_output_state` +**Core Properties:** -- **`$ai_generation`** - - `$ai_input` - Can contain 300,000+ LLM tokens, making blob storage essential - - `$ai_output_choices` +- `$ai_trace_id` (required) - UUID to group AI events (e.g., conversation_id) +- `$ai_model` (required) - The model used (e.g., "gpt-4o", "claude-3-opus") +- `$ai_provider` (required) - The LLM provider (e.g., "openai", "anthropic", "gemini") +- `$ai_input` - List of messages sent to the LLM (can be stored as blob) + - Can contain 300,000+ tokens, making blob storage essential + - Each message has a `role` ("user", "system", or "assistant") and `content` array + - Content types: text, image URLs, function calls +- `$ai_output_choices` - List of response choices from the LLM (can be stored as blob) + - Each choice has a `role` and `content` array +- `$ai_input_tokens` - Number of tokens in the input +- `$ai_output_tokens` - Number of tokens in the output +- `$ai_span_id` (optional) - Unique identifier for this generation +- `$ai_span_name` (optional) - Name given to this generation +- `$ai_parent_id` (optional) - Parent span ID for tree view grouping +- `$ai_latency` (optional) - LLM call latency in seconds +- `$ai_http_status` (optional) - HTTP status code of the response +- `$ai_base_url` (optional) - Base URL of the LLM provider +- `$ai_request_url` (optional) - Full URL of the request +- `$ai_is_error` (optional) - Boolean indicating if the request was an error +- `$ai_error` (optional) - Error message or object -- **`$ai_embedding`** - - `$ai_input` - - Note: Output data is not currently included in the event payload, though this may be added in future iterations +**Cost Properties** (optional, auto-calculated from model and token counts if not provided): -### Standard Events +- `$ai_input_cost_usd` - Cost in USD of input tokens +- `$ai_output_cost_usd` - Cost in USD of output tokens +- `$ai_total_cost_usd` - Total cost in USD -These events can be processed through the regular pipeline without blob storage: +**Cache Properties** (optional): -- **`$ai_metric`** - Lightweight metric data that doesn't require offloading -- **`$ai_feedback`** - User feedback events that remain small enough for standard processing +- `$ai_cache_read_input_tokens` - Number of tokens read from cache +- `$ai_cache_creation_input_tokens` - Number of tokens written to cache (Anthropic-specific) -### Future Considerations +**Model Parameters** (optional): -The event schema is designed to accommodate future multimodal content types, including: +- `$ai_temperature` - Temperature parameter used +- `$ai_stream` - Whether the response was streamed +- `$ai_max_tokens` - Maximum tokens setting +- `$ai_tools` - Tools/functions available to the LLM -- Images -- Audio -- Video -- Files +### `$ai_trace` -These additions will leverage the same blob storage infrastructure when implemented. +A trace represents a complete AI interaction flow (e.g., a full conversation or agent execution). + +**Key Properties:** + +- `$ai_trace_id` (required) - UUID identifying this trace +- `$ai_input_state` - Initial state of the trace (can be stored as blob) +- `$ai_output_state` - Final state of the trace (can be stored as blob) + +### `$ai_span` + +A span represents a logical unit of work within a trace (e.g., a tool call, a retrieval step). 
+ +**Key Properties:** + +- `$ai_trace_id` (required) - Parent trace UUID +- `$ai_span_id` (required) - Unique identifier for this span +- `$ai_parent_id` (optional) - Parent span ID for nesting +- `$ai_span_name` - Name describing this span +- `$ai_input_state` - Input state for this span (can be stored as blob) +- `$ai_output_state` - Output state for this span (can be stored as blob) + +### `$ai_embedding` + +An embedding event captures vector generation for semantic search or RAG systems. + +**Key Properties:** + +- `$ai_trace_id` (required) - Parent trace UUID +- `$ai_model` (required) - Embedding model used +- `$ai_provider` (required) - Provider (e.g., "openai", "cohere") +- `$ai_input` - Text or data being embedded (can be stored as blob) +- `$ai_input_tokens` - Number of tokens in the input +- Note: Output vectors are typically not captured in events + +### Standard Events (No Blob Storage Required) + +These events are lightweight and processed through the regular pipeline: + +- **`$ai_metric`** - Performance metrics, usage statistics +- **`$ai_feedback`** - User feedback on AI responses + +### Blob Storage Strategy + +Properties that can contain large payloads (marked as "can be stored as blob" above) should be sent as separate multipart parts with names like `event.properties.$ai_input` or `event.properties.$ai_output_choices`. This keeps the event JSON small while allowing arbitrarily large context data to be stored efficiently in S3. + +**Reference:** [PostHog LLM Analytics Manual Capture Documentation](https://posthog.com/docs/llm-analytics/manual-capture) ## General Architecture @@ -58,7 +116,7 @@ The LLM Analytics capture system implements a specialized data flow that efficie ### Data Flow 1. **Event Ingestion** - - Events are transmitted using server-side PostHog SDKs via HTTP to a dedicated `/ai` endpoint + - Events are transmitted using server-side PostHog SDKs via HTTP to a dedicated `/i/v0/ai` endpoint - Requests utilize multipart payloads containing: - Event payload (metadata and standard properties) - Binary blobs containing LLM context (e.g., input state, output state property values) @@ -85,7 +143,7 @@ The LLM Analytics capture system implements a specialized data flow that efficie ### HTTP Endpoint -The `/ai` endpoint accepts multipart POST requests with the following structure: +The `/i/v0/ai` endpoint accepts multipart POST requests with the following structure: #### Request Format @@ -97,22 +155,35 @@ The `/ai` endpoint accepts multipart POST requests with the following structure: **Multipart Parts:** 1. **Event Part** (required) - - `Content-Disposition: form-data; name="event"` - - `Content-Type: application/json` - - Body: Standard PostHog event JSON payload + - `Content-Disposition: form-data; name="event"` (required) + - `Content-Type: application/json` (required) + - Body: Standard PostHog event JSON payload (without properties, or with properties that will be rejected if `event.properties` part is also present) -2. **Blob Parts** (optional, multiple allowed) - - `Content-Disposition: form-data; name="event.properties."; filename=""` - - `Content-Type: application/octet-stream` (or `application/json`, `text/plain`, etc.) - - `Content-Encoding: gzip` (optional, for compressed data) - - `Content-Length: ` (size of the blob part in bytes) - - Body: Binary blob data (optionally gzip compressed) +2. 
**Event Properties Part** (optional) + - `Content-Disposition: form-data; name="event.properties"` (required) + - `Content-Type: application/json` (required) + - Body: JSON object containing event properties + - Cannot be used together with embedded properties in the event part (request will be rejected with 400 Bad Request) + - Properties from this part are merged into the event as the `properties` field + +3. **Blob Parts** (optional, multiple allowed) + - `Content-Disposition: form-data; name="event.properties."; filename=""` (required) + - `Content-Type: application/octet-stream | application/json | text/plain` (required) + - Body: Binary blob data - The part name follows the JSON path in the event object (e.g., `event.properties.$ai_input_state`) +**Allowed Part Headers:** + +- `Content-Disposition` (required for all parts) +- `Content-Type` (required for all parts) +- No other headers are supported on individual parts (e.g., `Content-Encoding` is not allowed on parts) + +**Note:** Individual parts cannot have their own compression. To compress the entire request payload, use the `Content-Encoding: gzip` header at the HTTP request level. + #### Example Request Structure ```http -POST /ai HTTP/1.1 +POST /i/v0/ai HTTP/1.1 Content-Type: multipart/form-data; boundary=----boundary123 ------boundary123 @@ -121,32 +192,34 @@ Content-Type: application/json { "event": "$ai_generation", - "properties": { - "model": "gpt-4", - "completion_tokens": 150 - }, + "distinct_id": "user_123", "timestamp": "2024-01-15T10:30:00Z" } +------boundary123 +Content-Disposition: form-data; name="event.properties" +Content-Type: application/json + +{ + "$ai_model": "gpt-4", + "completion_tokens": 150 +} + ------boundary123 Content-Disposition: form-data; name="event.properties.$ai_input"; filename="blob_abc123" Content-Type: application/json -Content-Encoding: gzip -Content-Length: 2048 -[Gzipped JSON LLM input data] +[JSON LLM input data] ------boundary123 Content-Disposition: form-data; name="event.properties.$ai_output_choices"; filename="blob_def456" Content-Type: application/json -Content-Length: 5120 -[Uncompressed JSON LLM output data] +[JSON LLM output data] ------boundary123 Content-Disposition: form-data; name="event.properties.$ai_embedding_vector"; filename="blob_ghi789" Content-Type: application/octet-stream -Content-Length: 16384 [Binary embedding vector data] ------boundary123-- @@ -162,18 +235,40 @@ To prevent LLM data from accidentally containing the multipart boundary sequence #### Processing Flow -1. Parse multipart request -2. Extract event JSON from the "event" part -3. Collect all blob parts: - - Extract property path from each part name - - Verify the property doesn't already exist in the event JSON +1. **Parse multipart request** + - Validate that the first part is the `event` part + - Extract event JSON from the "event" part + +2. **Handle event properties** + - If `event.properties` part exists: extract properties JSON from it + - If embedded properties exist in the event part AND `event.properties` part exists: reject with 400 Bad Request + - Merge properties into the event (from `event.properties` part if present, otherwise use embedded properties) + +3. **Validate event structure** + - Check event name starts with `$ai_` + - Verify required fields (distinct_id, properties, etc.) + - Validate required AI properties (e.g., `$ai_model`) + +4. 
**Collect all blob parts** + - Extract property path from each part name (e.g., `event.properties.$ai_input`) - Store blob data with metadata (property path, content type, size) -4. Create multipart file containing all blobs with index -5. Upload single multipart file to S3: + - Check for duplicate blob property names + +5. **Validate size limits** + - Event part ≤ 32KB + - Event + properties combined ≤ 960KB + - Sum of all parts ≤ 25MB (configurable) + +6. **Create multipart file containing all blobs with index** + +7. **Upload single multipart file to S3** - Generate S3 key using team_id, event_id, and random string - Include blob index in S3 object metadata -6. Add properties to event with S3 URLs including byte ranges -7. Send modified event to Kafka + +8. **Replace blob properties with S3 URLs** + - Add properties to event with S3 URLs including byte ranges + +9. **Send modified event to Kafka** ### S3 Storage @@ -269,52 +364,58 @@ s3://posthog-llm-analytics/llma/1y/789/2024-01-15/event_678_t1u3v.multipart ### Content Types -#### Supported Content Types +#### Supported Content Types for Blob Parts The following content types are accepted for blob parts: -- `application/octet-stream` - Default for binary data -- `application/json` - JSON formatted LLM context -- `text/plain` - Plain text LLM inputs/outputs +- `application/octet-stream` - For binary data +- `application/json` - For JSON formatted LLM context +- `text/plain` - For plain text LLM inputs/outputs + +The event and event.properties parts must use `application/json`. #### Content Type Handling -- Blob parts must include a Content-Type header +- All parts must include a Content-Type header +- Blob parts with unsupported content types are rejected with 400 Bad Request +- Blob parts missing Content-Type header are rejected with 400 Bad Request - The Content-Type is stored within the multipart file for each part - Content-Type is used by the evaluation service to determine how to parse each blob within the multipart file ### Compression -#### Client-side Compression +#### Request-Level Compression -SDKs should compress blob payloads before transmission to reduce bandwidth usage: +The endpoint supports request-level gzip compression to reduce bandwidth usage: -- Compression algorithm: gzip -- Compressed parts should include `Content-Encoding: gzip` header -- Original Content-Type should be preserved (e.g., `Content-Type: application/json` with `Content-Encoding: gzip`) +- **Compression algorithm**: gzip +- **How to compress**: Add `Content-Encoding: gzip` header to the HTTP request and compress the entire multipart request body +- **Server behavior**: The capture service will detect the `Content-Encoding: gzip` header and decompress the entire request before processing the multipart data +- **Recommendation**: SDKs should compress large requests (e.g., > 10KB) to minimize network transfer time -#### Server-side Compression +**Example Compressed Request:** -For uncompressed data received from SDKs: +```http +POST /i/v0/ai HTTP/1.1 +Content-Type: multipart/form-data; boundary=----boundary123 +Content-Encoding: gzip -- The capture service will automatically compress the following content types: +[Gzipped multipart request body] +``` + +The entire multipart body (including all parts) is compressed as a single gzip stream. 
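+
+As an illustration only, a client-side sketch of this flow in Python (the `requests_toolbelt` encoder, the local endpoint URL, and the field values are assumptions for the example, mirroring the acceptance tests rather than prescribing an SDK implementation):
+
+```python
+import gzip
+import json
+
+import requests
+from requests_toolbelt import MultipartEncoder
+
+# Build the multipart body with the event part first, then blob parts.
+multipart = MultipartEncoder(
+    fields={
+        "event": (None, json.dumps({"event": "$ai_generation", "distinct_id": "user_123"}), "application/json"),
+        "event.properties": (None, json.dumps({"$ai_model": "gpt-4", "$ai_provider": "openai"}), "application/json"),
+        "event.properties.$ai_input": ("blob_abc123", json.dumps([{"role": "user", "content": "Hello"}]), "application/json"),
+    }
+)
+
+# Compress the whole multipart body as one gzip stream and advertise it on the request.
+body = gzip.compress(multipart.to_string())
+
+response = requests.post(
+    "http://localhost:8010/i/v0/ai",
+    data=body,
+    headers={
+        "Content-Type": multipart.content_type,  # multipart/form-data; boundary=...
+        "Content-Encoding": "gzip",
+        "Authorization": "Bearer <project_api_key>",
+    },
+)
+response.raise_for_status()
+```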
+ +#### Server-side Compression (S3 Storage) + +For data received from SDKs (after request decompression, if any): + +- The capture service will automatically compress the following content types before storing in S3: - `application/json` - `text/*` (all text subtypes) - Binary formats (`application/octet-stream`) will not be automatically compressed - Compression is applied before storing in S3 - S3 object metadata will indicate if server-side compression was applied - -#### Example Headers - -Compressed blob part from SDK: - -```http -Content-Disposition: form-data; name="event.properties.$ai_input"; filename="blob_abc123" -Content-Type: application/json -Content-Encoding: gzip - -[Gzipped JSON data] -``` +- This compression is transparent to clients and reduces storage costs ## Reliability Concerns @@ -330,7 +431,7 @@ Content-Encoding: gzip ### Preventing Malicious Uploads -- All requests to the `/ai` endpoint must be authenticated using the project's private API key +- All requests to the `/i/v0/ai` endpoint must be authenticated using the project's private API key - The capture service validates the API key before processing any multipart data - This prevents unauthorized uploads and ensures blob storage is only used by legitimate PostHog projects @@ -340,7 +441,7 @@ The authentication process for LLM analytics events follows these steps: 1. **API Key Extraction** - Extract the API key from the request headers (e.g., `Authorization: Bearer `) - - API key must be present for all requests to `/ai` + - API key must be present for all requests to `/i/v0/ai` 2. **Early Validation** - Validate the API key format and existence before parsing multipart data @@ -402,7 +503,7 @@ Three approaches for handling data deletion requests: The capture service enforces strict validation on incoming events: 1. **Event Name Validation** - - All events sent to `/ai` must have an event name starting with `$ai_` + - All events sent to `/i/v0/ai` must have an event name starting with `$ai_` - Requests with non-AI events are rejected with 400 Bad Request - This ensures the endpoint is only used for its intended purpose @@ -420,10 +521,14 @@ The capture service enforces strict validation on incoming events: - Nested property paths are supported (e.g., `event.properties.nested.$ai_input`) 5. **Size Limits** - - Maximum total payload size enforced for security (e.g., 100MB default) - - Limits can be configured per team to accommodate different use cases - - Event JSON payload has a separate maximum size limit - - Individual blob parts have maximum size limits + All size limit violations return 413 Payload Too Large: + - **Request body**: Maximum 27.5MB (110% of sum of all parts limit, enforced by Axum) + - Computed as 110% of `AI_MAX_SUM_OF_PARTS_BYTES` to account for multipart overhead + - This is the first check, applied before any request processing + - **Event part**: Maximum 32KB (enforced by handler) + - **Event + properties combined**: Maximum 960KB (1MB - 64KB, enforced by handler) + - **Sum of all parts** (event, properties, and all blobs): Maximum 25MB (default, configurable via `AI_MAX_SUM_OF_PARTS_BYTES`, enforced by handler) + - These limits are configurable via environment variables and can be adjusted per deployment 6. 
**Strict Schema Validation** - Each `$ai_` event type has a strictly defined schema @@ -432,10 +537,6 @@ The capture service enforces strict validation on incoming events: - Blob properties must match expected blob fields for each event type - Non-conforming events are rejected with detailed validation errors -## Open Questions - -- Should the capture service validate Content-Types of blob parts against a whitelist, or accept any Content-Type provided by the client? - ## Rejected Solutions ### WarpStream-based Processing diff --git a/rust/capture/docs/llma-integration-test-suite.md b/rust/capture/docs/llma-integration-test-suite.md index 81f2f6a4e7..cb419bd23c 100644 --- a/rust/capture/docs/llma-integration-test-suite.md +++ b/rust/capture/docs/llma-integration-test-suite.md @@ -2,36 +2,33 @@ ## Overview -This document describes the high-level architecture and test scenarios for the LLM Analytics capture pipeline integration test suite. The tests validate the complete end-to-end flow from multipart request ingestion through S3 storage to event processing. +This document describes the Rust integration test suite for the LLM Analytics capture service. These tests validate HTTP endpoint behavior, multipart parsing, and validation logic without requiring external dependencies. **Implementation Requirement**: Each phase in the implementation plan must pass its corresponding integration tests before proceeding to the next phase. This ensures incremental validation and prevents regression as new features are added. +**See Also**: `llma-acceptance-test-suite.md` for end-to-end tests that require full PostHog infrastructure. + ## Test Architecture ### Test Environment -- **Local PostHog Deployment**: Complete local PostHog setup including capture service with `/ai` endpoint -- **S3-Compatible Storage**: Local S3-compatible storage for blob storage with direct access for verification -- **PostHog Query API**: Used to fetch processed events from the ingestion pipeline for validation -- **Direct S3 Access**: Test suite has direct S3 client access to verify blob storage and retrieval -- **Test Fixtures**: Predefined multipart requests with various blob sizes and types +- **In-Memory Router**: Axum router running in test process using `axum-test-helper::TestClient` +- **Multipart Construction**: Using `reqwest::multipart::Form` to build proper multipart bodies +- **Mock Dependencies**: Mock Redis, time source, and event sink for isolation +- **No External Services**: No database, S3, Kafka, or real authentication required ### Test Framework -- **Integration Tests**: Full pipeline tests from HTTP request through ingestion to event storage -- **PostHog API Client**: Direct integration with PostHog query API to verify event processing -- **S3 Client**: Direct access to verify S3 blob storage, metadata, and retrieval +- **Integration Tests**: Capture service tests from HTTP request through parsing and validation - **Parameterized Tests**: Test variations across different event types, blob sizes, and configurations - **Async Testing**: Support for testing concurrent requests and large payload processing ### Test Data -- **AI Event Types**: `$ai_trace`, `$ai_span`, `$ai_generation`, `$ai_embedding`, `$ai_metric`, `$ai_feedback` -- **Blob Sizes**: Small (1KB), Medium (100KB), Large (1MB), Extra Large (10MB+) +- **AI Event Types**: Primarily `$ai_generation` for validation testing +- **Blob Sizes**: Small (< 1KB), Medium (~100KB), Empty - **Content Types**: `application/json`, `text/plain`, 
`application/octet-stream` -- **Compression**: Gzipped and uncompressed blobs -- **Encoding**: Raw binary, text, and base64-encoded blobs -- **Multipart Boundaries**: Various boundary strings to test collision handling +- **Multipart Boundaries**: Various boundary strings including custom boundaries ## Test Scenarios @@ -41,91 +38,64 @@ This document describes the high-level architecture and test scenarios for the L #### Scenario 1.1: Basic Routing -- **Test**: Verify `/ai` endpoint is accessible and returns correct response codes +- **Test**: Verify `/i/v0/ai` endpoint is accessible and returns correct response codes - **Validation**: HTTP 200 for valid requests, proper error codes for invalid requests +- **Tests Implemented**: + - `test_ai_endpoint_get_returns_405`: GET requests return 405 + - `test_ai_endpoint_put_returns_405`: PUT requests return 405 + - `test_ai_endpoint_delete_returns_405`: DELETE requests return 405 + - `test_ai_endpoint_no_auth_returns_401`: No auth header returns 401 #### Scenario 1.2: Multipart Parsing - **Test**: Send multipart requests with various boundary strings and blob configurations -- **Validation**: All parts parsed correctly, blob data extracted without corruption§ +- **Validation**: All parts parsed correctly, blob data extracted without corruption - **Variations**: Different boundary formats, multiple blobs, mixed content types +- **Tests Implemented**: + - `test_multipart_parsing_with_multiple_blobs`: Parse 4 parts (event + 3 blobs) + - `test_multipart_parsing_with_mixed_content_types`: JSON, text, binary + - `test_multipart_parsing_with_large_blob`: Large blob (~100KB) + - `test_multipart_parsing_with_empty_blob`: Empty blob handling #### Scenario 1.3: Boundary Validation - **Test**: Send requests with malformed boundaries, missing boundaries, boundary collisions - **Validation**: Appropriate error responses, no server crashes, proper error logging +- **Tests Implemented**: + - `test_multipart_missing_boundary_returns_400`: Missing boundary parameter + - `test_multipart_corrupted_boundary_returns_400`: Mismatched boundary -#### Scenario 1.4: Event Processing Verification +#### Scenario 1.4: Basic Validation -- **Test**: Send multipart request and verify event reaches PostHog query API -- **Validation**: Use PostHog query API to fetch processed event, verify blob placeholders correctly inserted +- **Test**: Send events with valid/invalid event types, missing required properties, duplicate blob properties +- **Validation**: Only accepted AI event types (`$ai_generation`, `$ai_trace`, `$ai_span`, `$ai_embedding`, `$ai_metric`, `$ai_feedback`) are processed; invalid events rejected with proper error messages +- **Tests Implemented**: + - `test_all_allowed_ai_event_types_accepted`: All six accepted event types pass validation + - `test_invalid_ai_event_type_returns_400`: Invalid AI event types rejected (e.g., `$ai_unknown`, `$ai_custom`) + - `test_invalid_event_name_not_ai_prefix_returns_400`: Non-AI event names rejected + - `test_invalid_event_name_regular_event_returns_400`: Regular events rejected (e.g., `$pageview`) + - `test_invalid_event_name_custom_event_returns_400`: Custom events rejected + - `test_missing_required_ai_properties_returns_400`: Missing `$ai_model` + - `test_empty_event_name_returns_400`: Empty event names + - `test_missing_distinct_id_returns_400`: Missing distinct_id + - `test_multipart_event_not_first_returns_400`: Event part ordering -#### Scenario 1.5: Basic Validation +#### Scenario 1.5: Content Type Validation -- **Test**: 
Send events with invalid names (not starting with `$ai_`), duplicate blob properties -- **Validation**: Invalid events rejected, valid events processed, proper error messages - -### Phase 2: Basic S3 Uploads - -#### Scenario 2.1: Individual Blob Upload - -- **Test**: Upload blobs of various sizes as separate S3 objects -- **Validation**: Verify each blob stored correctly, S3 URLs generated in event properties -- **Variations**: Small/medium/large blobs, different content types - -#### Scenario 2.2: S3 URL Generation and Access - -- **Test**: Verify generated S3 URLs in PostHog events point to accessible objects -- **Validation**: Query PostHog API for events, extract S3 URLs, verify blobs retrievable from S3 - -#### Scenario 2.3: Blob Metadata Storage - -- **Test**: Verify S3 object metadata is stored correctly -- **Validation**: Use S3 client to inspect object metadata - Content-Type, size, team_id present - -#### Scenario 2.4: Team Data Isolation - -- **Test**: Multiple teams uploading simultaneously -- **Validation**: Verify S3 key prefixes are team-scoped, no cross-team data access, proper S3 path isolation - -### Phase 3: S3 Infrastructure & Deployment - -#### Scenario 3.1: S3 Bucket Configuration - -- **Test**: Verify S3 bucket structure and lifecycle policies -- **Validation**: Use S3 client to verify correct `llma/` prefix structure, retention policies configured - -### Phase 4: Multipart File Processing - -#### Scenario 4.1: Multipart File Creation - -- **Test**: Upload events with multiple blobs, verify multipart/mixed format -- **Validation**: Use S3 client to verify single S3 file contains all blobs, proper MIME boundaries, metadata preserved -- **Variations**: 2-10 blobs per event, mixed content types, different blob sizes - -#### Scenario 4.2: Byte Range URLs and Access - -- **Test**: Verify S3 URLs in PostHog events include correct byte range parameters -- **Validation**: Query PostHog API for events, verify URLs contain range parameters, use S3 client to test range requests - -#### Scenario 4.3: Content Type Handling - -- **Test**: Mix of JSON, text, and binary blobs in single multipart file -- **Validation**: Content types preserved in multipart format, correctly parsed +- **Test**: Send requests with wrong content type or empty body +- **Validation**: Only `multipart/form-data` accepted +- **Tests Implemented**: + - `test_ai_endpoint_wrong_content_type_returns_400`: Non-multipart type + - `test_ai_endpoint_empty_body_returns_400`: Empty body ### Phase 5: Authorization -#### Scenario 5.1: API Key Authentication - -- **Test**: Send requests with valid/invalid/missing API keys -- **Validation**: Valid keys accepted, invalid keys rejected with 401, proper error messages - -#### Scenario 5.2: Request Signature Verification +#### Scenario 5.1: Request Signature Verification - **Test**: Test signature validation for various request formats - **Validation**: Valid signatures accepted, invalid signatures rejected -#### Scenario 5.3: Pre-processing Authentication +#### Scenario 5.2: Pre-processing Authentication - **Test**: Verify authentication occurs before multipart parsing - **Validation**: Invalid auth rejected immediately, no resource consumption for unauthorized requests @@ -142,11 +112,6 @@ This document describes the high-level architecture and test scenarios for the L - **Test**: Send uncompressed JSON/text blobs - **Validation**: Server compresses before S3 storage, compression metadata preserved -#### Scenario 7.3: Mixed Compression - -- **Test**: Single request with 
both compressed and uncompressed blobs -- **Validation**: Each blob handled according to its compression state - ### Phase 8: Schema Validation #### Scenario 8.1: Event Schema Validation @@ -165,12 +130,6 @@ This document describes the high-level architecture and test scenarios for the L - **Test**: Mismatched Content-Length headers and actual blob sizes - **Validation**: Mismatches detected and handled appropriately -### Phase 9: Limits (Optional) - DO NOT IMPLEMENT, TBD - -### Phase 10: Data Deletion (Optional) - DO NOT IMPLEMENT, TBD - -### Cross-Team Isolation Testing - ## Error Recovery and Edge Cases ### Edge Case Scenarios @@ -180,213 +139,42 @@ This document describes the high-level architecture and test scenarios for the L - **Test**: Invalid JSON, corrupted multipart data, missing required headers - **Validation**: Graceful error handling, no server crashes, proper error responses -#### Scenario E.2: S3 Service Interruption - -- **Test**: Simulate S3 unavailability during uploads -- **Validation**: Proper error responses, retry logic works, no data loss - -#### Scenario E.3: Kafka Unavailability - -- **Test**: Simulate Kafka unavailability during event publishing -- **Validation**: Appropriate error handling, request failure communicated to client - ## Local Development Testing ### Test Implementation -The integration test suite will be implemented in Rust to align with the capture service's existing toolchain and avoid introducing additional dependencies. +**Location**: `rust/capture/tests/integration_ai_endpoint.rs` -#### Test Structure +**Framework**: Tokio async tests with `axum-test-helper` -- **Location**: `tests/integration/llma/` directory within the capture service codebase -- **Framework**: Standard Rust testing framework with `tokio-test` for async operations -- **Dependencies**: - - `reqwest` for HTTP client operations - - `aws-sdk-s3` for S3 client operations - - `serde_json` for JSON manipulation - - `multipart` for constructing test requests +**Dependencies**: -#### Test Organization +- `reqwest` with `multipart` feature for constructing test requests +- `axum-test-helper` for in-memory HTTP testing +- `serde_json` for JSON manipulation -```text -tests/ -└── integration/ - └── llma/ - ├── mod.rs # Common test utilities and setup - ├── phase_01_http.rs # Phase 1: HTTP Endpoint tests - ├── phase_02_s3.rs # Phase 2: Basic S3 Upload tests - ├── phase_03_infra.rs # Phase 3: S3 Infrastructure tests - ├── phase_04_multipart.rs # Phase 4: Multipart File tests - ├── phase_05_auth.rs # Phase 5: Authorization tests - ├── phase_07_compression.rs # Phase 7: Compression tests - ├── phase_08_validation.rs # Phase 8: Schema Validation tests - └── fixtures/ # Test data and multipart request templates -``` - -### Local Test Environment Setup - -#### Prerequisites - -- **Local PostHog Instance**: Full PostHog deployment running locally -- **Local S3 Storage**: S3-compatible storage (configured via PostHog local setup) -- **Capture Service**: Running with `/ai` endpoint enabled -- **Test Configuration**: Environment variables for service endpoints and credentials - -#### Environment Configuration +### Running Tests ```bash -# PostHog Local Instance -export POSTHOG_HOST="http://localhost:8000" -export POSTHOG_API_KEY="test_api_key_123" -export POSTHOG_PROJECT_ID="1" +# Run all AI endpoint integration tests +cargo test --test integration_ai_endpoint -# Local S3 Configuration -export AWS_ENDPOINT_URL="http://localhost:9000" # Local S3-compatible endpoint -export 
AWS_ACCESS_KEY_ID="minioadmin" -export AWS_SECRET_ACCESS_KEY="minioadmin" -export AWS_DEFAULT_REGION="us-east-1" -export LLMA_S3_BUCKET="posthog-llma-test" - -# Capture Service -export CAPTURE_ENDPOINT="http://localhost:3000" -export LLMA_TEST_MODE="local" -``` - -### Test Execution - -#### Running Tests - -```bash -# Run all LLMA integration tests -cargo test --test llma_integration - -# Run specific phase tests -cargo test --test llma_integration phase_01 -cargo test --test llma_integration phase_02 +# Run specific test +cargo test --test integration_ai_endpoint test_multipart_parsing_with_multiple_blobs # Run with detailed output -cargo test --test llma_integration -- --nocapture - -# Run tests in sequence (important for phase gating) -cargo test --test llma_integration -- --test-threads=1 +cargo test --test integration_ai_endpoint -- --nocapture ``` -#### Test Utilities +### Test Characteristics -Each test phase will include common utilities for: - -- **Multipart Request Builder**: Construct multipart/form-data requests with event JSON and blob parts -- **S3 Client Wrapper**: Direct S3 operations for validation and cleanup -- **PostHog API Client**: Query PostHog API to verify event processing -- **Test Data Generators**: Create various blob sizes, content types, and event payloads -- **Cleanup Helpers**: Remove test data from S3 and PostHog between test runs - -#### Test Data Management - -- **Isolated Test Teams**: Each test uses unique team IDs to prevent interference -- **Cleanup Between Tests**: Automatic cleanup of S3 objects and PostHog test data -- **Fixture Data**: Predefined multipart requests and blob data for consistent testing -- **Random Data Generation**: Configurable blob sizes and content for stress testing +- Run in-memory with no external dependencies +- Mock Redis, time source, and event sink +- Fast execution suitable for CI/CD +- Test HTTP request/response handling only ## Phase Gating - **Mandatory Testing**: All integration tests for a phase must pass before proceeding to implementation of the next phase - **Regression Prevention**: Previous phase tests continue to run to ensure no regression - **Incremental Validation**: Each phase builds upon validated functionality from previous phases - -## Production Testing - -### Overview - -For validating the LLM Analytics capture pipeline in production environments, the test suite can be configured to run against live PostHog instances with real AWS S3 storage. 
- -### Configuration Requirements - -#### PostHog Credentials - -- **Project API Key**: PostHog project private API key for authentication -- **PostHog URL**: PostHog instance URL (cloud or self-hosted) -- **Project ID**: PostHog project identifier for query API access - -#### AWS S3 Credentials - -- **AWS Access Key ID**: Limited IAM user with read-only S3 access -- **AWS Secret Access Key**: Corresponding secret key -- **S3 Bucket Name**: Production S3 bucket name -- **Region**: AWS region for S3 bucket - -### IAM Policy for S3 Read Access - -The following IAM policy provides minimal read-only access for a specific team prefix: - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:GetObjectMetadata", - "s3:ListBucket" - ], - "Resource": [ - "arn:aws:s3:::your-llma-bucket/llma/TEAM_ID/*", - "arn:aws:s3:::your-llma-bucket" - ], - "Condition": { - "StringLike": { - "s3:prefix": "llma/TEAM_ID/*" - } - } - } - ] -} -``` - -### AWS CLI Script for S3 Key Generation - -A separate script (`generate-s3-test-keys.sh`) will be implemented to generate limited S3 read-only credentials for LLMA testing. The script will create IAM users with team-specific permissions and output the necessary environment variables for testing. - -### Production Test Configuration - -#### Environment Variables - -```bash -# PostHog Configuration -export POSTHOG_PROJECT_API_KEY="your_posthog_api_key" -export POSTHOG_HOST="https://app.posthog.com" # or your self-hosted URL -export POSTHOG_PROJECT_ID="12345" - -# AWS S3 Configuration -export AWS_ACCESS_KEY_ID="your_limited_access_key" -export AWS_SECRET_ACCESS_KEY="your_limited_secret_key" -export AWS_DEFAULT_REGION="us-east-1" -export LLMA_S3_BUCKET="your-llma-bucket" -export LLMA_TEAM_ID="123" - -# Test Configuration -export LLMA_TEST_MODE="production" -``` - -### Production Test Execution - -#### Safety Measures - -- **Read-Only Operations**: Production tests only read data, never write or modify -- **Team Isolation**: Tests only access data for the specified team ID -- **Rate Limiting**: Production tests include delays to avoid overwhelming services -- **Data Validation**: Verify S3 objects exist and are accessible without downloading large payloads - -#### Usage Example - -```bash -# Generate S3 test credentials (script to be implemented) -./generate-s3-test-keys.sh 123 posthog-llm-analytics - -# Configure environment -source production-test.env - -# Run production validation tests -pytest tests/integration/production/ -v --tb=short -``` diff --git a/rust/capture/src/ai_endpoint.rs b/rust/capture/src/ai_endpoint.rs new file mode 100644 index 0000000000..273ed4a0ea --- /dev/null +++ b/rust/capture/src/ai_endpoint.rs @@ -0,0 +1,476 @@ +use axum::body::Bytes; +use axum::extract::State; +use axum::http::HeaderMap; +use axum::response::Json; +use flate2::read::GzDecoder; +use futures::stream; +use multer::{parse_boundary, Multipart}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::collections::HashSet; +use std::io::Read; +use tracing::{debug, warn}; + +use crate::api::{CaptureError, CaptureResponse, CaptureResponseCode}; +use crate::router::State as AppState; +use crate::token::validate_token; + +#[derive(Debug, Serialize, Deserialize)] +pub struct PartInfo { + pub name: String, + pub length: usize, + #[serde(rename = "content-type")] + pub content_type: Option, + #[serde(rename = "content-encoding")] + pub content_encoding: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +pub 
struct AIEndpointResponse {
+    pub accepted_parts: Vec<PartInfo>,
+}
+
+pub async fn ai_handler(
+    State(state): State<AppState>,
+    headers: HeaderMap,
+    body: Bytes,
+) -> Result<Json<AIEndpointResponse>, CaptureError> {
+    debug!("Received request to /i/v0/ai endpoint");
+
+    // Check for empty body
+    if body.is_empty() {
+        warn!("AI endpoint received empty body");
+        return Err(CaptureError::EmptyPayload);
+    }
+
+    // Note: Request body size limit is enforced by Axum's DefaultBodyLimit layer
+    // (110% of ai_max_sum_of_parts_bytes to account for multipart overhead)
+
+    // Check for Content-Encoding header and decompress if needed
+    let content_encoding = headers
+        .get("content-encoding")
+        .and_then(|v| v.to_str().ok())
+        .unwrap_or("");
+
+    let decompressed_body = if content_encoding.eq_ignore_ascii_case("gzip") {
+        debug!("Decompressing gzip-encoded request body");
+        decompress_gzip(&body)?
+    } else {
+        body
+    };
+
+    // Check content type - must be multipart/form-data
+    let content_type = headers
+        .get("content-type")
+        .and_then(|v| v.to_str().ok())
+        .unwrap_or("");
+
+    if !content_type.starts_with("multipart/form-data") {
+        warn!(
+            "AI endpoint received non-multipart content type: {}",
+            content_type
+        );
+        return Err(CaptureError::RequestDecodingError(
+            "Content-Type must be multipart/form-data".to_string(),
+        ));
+    }
+
+    // Extract boundary from Content-Type header using multer's built-in parser
+    let boundary = parse_boundary(content_type).map_err(|e| {
+        warn!("Failed to parse boundary from Content-Type: {}", e);
+        CaptureError::RequestDecodingError(format!("Invalid boundary in Content-Type: {e}"))
+    })?;
+
+    // Check for authentication
+    let auth_header = headers
+        .get("authorization")
+        .and_then(|v| v.to_str().ok())
+        .unwrap_or("");
+
+    if !auth_header.starts_with("Bearer ") {
+        warn!("AI endpoint missing or invalid Authorization header");
+        return Err(CaptureError::NoTokenError);
+    }
+
+    // Extract and validate token
+    let token = &auth_header[7..]; // Remove "Bearer " prefix
+    validate_token(token)?;
+
+    // Parse multipart data and collect part information
+    let accepted_parts = parse_multipart_data(
+        &decompressed_body,
+        &boundary,
+        state.ai_max_sum_of_parts_bytes,
+    )
+    .await?;
+
+    // Log request details for debugging
+    debug!("AI endpoint request validated and parsed successfully");
+    debug!("Body size: {} bytes", decompressed_body.len());
+    debug!("Content-Type: {}", content_type);
+    debug!("Boundary: {}", boundary);
+    debug!("Token: {}...", &token[..std::cmp::min(8, token.len())]);
+    debug!("Accepted parts: {}", accepted_parts.len());
+
+    // TODO: Process AI events and upload to S3
+    // For now, return the accepted parts information
+    let response = AIEndpointResponse { accepted_parts };
+
+    Ok(Json(response))
+}
+
+pub async fn options() -> Result<CaptureResponse, CaptureError> {
+    Ok(CaptureResponse {
+        status: CaptureResponseCode::Ok,
+        quota_limited: None,
+    })
+}
+
+/// Decompress gzip-encoded body using streaming decompression
+fn decompress_gzip(compressed: &Bytes) -> Result<Bytes, CaptureError> {
+    let mut decoder = GzDecoder::new(&compressed[..]);
+    let mut decompressed = Vec::new();
+
+    decoder.read_to_end(&mut decompressed).map_err(|e| {
+        warn!("Failed to decompress gzip body: {}", e);
+        CaptureError::RequestDecodingError(format!("Failed to decompress gzip body: {e}"))
+    })?;
+
+    debug!(
+        "Decompressed {} bytes to {} bytes",
+        compressed.len(),
+        decompressed.len()
+    );
+    Ok(Bytes::from(decompressed))
+}
+
+/// Validate blob part content type
+fn is_valid_blob_content_type(content_type: &str) -> bool {
+    // Supported content types for blob parts
+    content_type == "application/octet-stream"
+        || content_type == "application/json"
+        || content_type == "text/plain"
+        || content_type.starts_with("text/plain;") // Allow text/plain with charset
+}
+
+/// Parse multipart data and validate structure
+async fn parse_multipart_data(
+    body: &[u8],
+    boundary: &str,
+    max_sum_of_parts_bytes: usize,
+) -> Result<Vec<PartInfo>, CaptureError> {
+    // Size limits
+    const MAX_EVENT_SIZE: usize = 32 * 1024; // 32KB
+    const MAX_COMBINED_SIZE: usize = 1024 * 1024 - 64 * 1024; // 1MB - 64KB = 960KB
+
+    // Create a stream from the body data - need to own the data
+    let body_owned = body.to_vec();
+    let body_stream = stream::once(async move { Ok::<Vec<u8>, std::io::Error>(body_owned) });
+
+    // Create multipart parser
+    let mut multipart = Multipart::new(body_stream, boundary);
+
+    let mut part_count = 0;
+    let mut has_event_part = false;
+    let mut first_part_processed = false;
+    let mut accepted_parts = Vec::new();
+    let mut seen_property_names = HashSet::new();
+    let mut event_json: Option<Value> = None;
+    let mut properties_json: Option<Value> = None;
+    let mut event_size: usize = 0;
+    let mut properties_size: usize = 0;
+    let mut sum_of_parts_bytes: usize = 0;
+
+    // Parse each part
+    while let Some(field) = multipart.next_field().await.map_err(|e| {
+        warn!("Multipart parsing error: {}", e);
+        CaptureError::RequestDecodingError(format!("Multipart parsing failed: {e}"))
+    })? {
+        part_count += 1;
+
+        // Extract all field information before consuming the field
+        let field_name = field.name().unwrap_or("unknown").to_string();
+        let content_type = field.content_type().map(|ct| ct.to_string());
+        let content_encoding = field
+            .headers()
+            .get("content-encoding")
+            .and_then(|v| v.to_str().ok())
+            .map(|s| s.to_string());
+
+        debug!(
+            "Processing multipart field: {} (part #{})",
+            field_name, part_count
+        );
+
+        // Check if this is the first part
+        if !first_part_processed {
+            first_part_processed = true;
+
+            // Validate that the first part is the event part
+            if field_name != "event" {
+                return Err(CaptureError::RequestDecodingError(format!(
+                    "First part must be 'event', got '{field_name}'"
+                )));
+            }
+
+            debug!("First part is 'event' as expected");
+        }
+
+        // Read the field data to get the length (this consumes the field)
+        let field_data = field.bytes().await.map_err(|e| {
+            warn!("Failed to read field data for '{}': {}", field_name, e);
+            CaptureError::RequestDecodingError(format!("Failed to read field data: {e}"))
+        })?;
+
+        // Track sum of all part sizes
+        sum_of_parts_bytes += field_data.len();
+
+        // Check if this is the event JSON part
+        if field_name == "event" {
+            has_event_part = true;
+            event_size = field_data.len();
+
+            // Check event size limit
+            if event_size > MAX_EVENT_SIZE {
+                return Err(CaptureError::EventTooBig(format!(
+                    "Event part size ({event_size} bytes) exceeds maximum allowed size ({MAX_EVENT_SIZE} bytes)"
+                )));
+            }
+
+            // Parse the event JSON (without validating properties yet)
+            let event_json_str = std::str::from_utf8(&field_data).map_err(|e| {
+                warn!("Event part is not valid UTF-8: {}", e);
+                CaptureError::RequestDecodingError("Event part must be valid UTF-8".to_string())
+            })?;
+
+            event_json = Some(serde_json::from_str(event_json_str).map_err(|e| {
+                warn!("Event part is not valid JSON: {}", e);
+                CaptureError::RequestDecodingError("Event part must be valid JSON".to_string())
+            })?);
+
+            debug!("Event part parsed successfully");
+        } else if field_name == "event.properties" {
+            properties_size = field_data.len();
+
+            // Parse the properties JSON
+            let 
properties_json_str = std::str::from_utf8(&field_data).map_err(|e| { + warn!("Properties part is not valid UTF-8: {}", e); + CaptureError::RequestDecodingError( + "Properties part must be valid UTF-8".to_string(), + ) + })?; + + properties_json = Some(serde_json::from_str(properties_json_str).map_err(|e| { + warn!("Properties part is not valid JSON: {}", e); + CaptureError::RequestDecodingError("Properties part must be valid JSON".to_string()) + })?); + + debug!("Properties part parsed successfully"); + } else if field_name.starts_with("event.properties.") { + // This is a blob part - check for duplicates + if !seen_property_names.insert(field_name.clone()) { + return Err(CaptureError::RequestDecodingError(format!( + "Duplicate blob property: {field_name}" + ))); + } + + // Validate content type for blob parts - it's required + match &content_type { + Some(ref ct) => { + let ct_lower = ct.to_lowercase(); + if !is_valid_blob_content_type(&ct_lower) { + return Err(CaptureError::RequestDecodingError( + format!("Unsupported content type for blob part '{field_name}': '{ct}'. Supported types: application/octet-stream, application/json, text/plain"), + )); + } + } + None => { + return Err(CaptureError::RequestDecodingError(format!( + "Missing required Content-Type header for blob part '{field_name}'" + ))); + } + } + + debug!("Blob part '{}' processed successfully", field_name); + } else { + warn!("Unknown multipart field: {}", field_name); + } + + // Create and store part info after all validation + let part_info = PartInfo { + name: field_name.clone(), + length: field_data.len(), + content_type, + content_encoding, + }; + accepted_parts.push(part_info); + } + + // Validate that we have at least the event part + if !has_event_part { + return Err(CaptureError::RequestDecodingError( + "Missing required 'event' part in multipart data".to_string(), + )); + } + + // Check combined size limit + let combined_size = event_size + properties_size; + if combined_size > MAX_COMBINED_SIZE { + return Err(CaptureError::EventTooBig(format!( + "Combined event and properties size ({combined_size} bytes) exceeds maximum allowed size ({MAX_COMBINED_SIZE} bytes)" + ))); + } + + // Check sum of all parts limit + if sum_of_parts_bytes > max_sum_of_parts_bytes { + return Err(CaptureError::EventTooBig(format!( + "Sum of all parts ({sum_of_parts_bytes} bytes) exceeds maximum allowed size ({max_sum_of_parts_bytes} bytes)" + ))); + } + + // Merge properties into the event + let mut event = event_json.unwrap(); + + // Check for conflicting properties sources + let has_embedded_properties = event + .as_object() + .and_then(|obj| obj.get("properties")) + .is_some(); + + if has_embedded_properties && properties_json.is_some() { + return Err(CaptureError::RequestDecodingError( + "Event cannot have both embedded properties and a separate 'event.properties' part" + .to_string(), + )); + } + + // Determine which properties to use: + // - If there's a separate event.properties part, use it + // - If there's no separate part, use embedded properties from the event (if any) + // - If neither exists, use empty object + let properties = if let Some(props) = properties_json { + props + } else { + // No separate part - check for embedded properties + if let Some(event_obj) = event.as_object() { + event_obj + .get("properties") + .cloned() + .unwrap_or(serde_json::json!({})) + } else { + serde_json::json!({}) + } + }; + + // Insert/replace properties in the event object + if let Some(event_obj) = event.as_object_mut() { + 
event_obj.insert("properties".to_string(), properties); + } else { + return Err(CaptureError::RequestDecodingError( + "Event must be a JSON object".to_string(), + )); + } + + // Now validate the complete event structure + validate_event_structure(&event)?; + + debug!( + "Multipart parsing completed: {} parts processed", + part_count + ); + Ok(accepted_parts) +} + +/// Validate the structure and content of an AI event +fn validate_event_structure(event: &Value) -> Result<(), CaptureError> { + // Check if event is an object + let event_obj = event.as_object().ok_or_else(|| { + warn!("Event must be a JSON object"); + CaptureError::RequestDecodingError("Event must be a JSON object".to_string()) + })?; + + // Validate event name + let event_name = event_obj + .get("event") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + warn!("Event missing 'event' field"); + CaptureError::RequestDecodingError("Event missing 'event' field".to_string()) + })?; + + if event_name.is_empty() { + return Err(CaptureError::RequestDecodingError( + "Event name cannot be empty".to_string(), + )); + } + + // Only accept specific AI event types + const ALLOWED_AI_EVENTS: [&str; 6] = [ + "$ai_generation", + "$ai_trace", + "$ai_span", + "$ai_embedding", + "$ai_metric", + "$ai_feedback", + ]; + + if !ALLOWED_AI_EVENTS.contains(&event_name) { + return Err(CaptureError::RequestDecodingError(format!( + "Event name must be one of: {}, got '{}'", + ALLOWED_AI_EVENTS.join(", "), + event_name + ))); + } + + // Validate distinct_id + let distinct_id = event_obj + .get("distinct_id") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + warn!("Event missing 'distinct_id' field"); + CaptureError::RequestDecodingError("Event missing 'distinct_id' field".to_string()) + })?; + + if distinct_id.is_empty() { + return Err(CaptureError::RequestDecodingError( + "distinct_id cannot be empty".to_string(), + )); + } + + // Validate properties object + let properties = event_obj + .get("properties") + .and_then(|v| v.as_object()) + .ok_or_else(|| { + warn!("Event missing 'properties' field"); + CaptureError::RequestDecodingError("Event missing 'properties' field".to_string()) + })?; + + // Validate required AI properties + if !properties.contains_key("$ai_model") { + return Err(CaptureError::RequestDecodingError( + "Event properties must contain '$ai_model'".to_string(), + )); + } + + let ai_model = properties + .get("$ai_model") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + warn!("$ai_model must be a string"); + CaptureError::RequestDecodingError("$ai_model must be a string".to_string()) + })?; + + if ai_model.is_empty() { + return Err(CaptureError::RequestDecodingError( + "$ai_model cannot be empty".to_string(), + )); + } + + debug!( + "Event validation passed: event='{}', distinct_id='{}', ai_model='{}'", + event_name, distinct_id, ai_model + ); + + Ok(()) +} diff --git a/rust/capture/src/config.rs b/rust/capture/src/config.rs index ff6d22ec7a..1778cbab04 100644 --- a/rust/capture/src/config.rs +++ b/rust/capture/src/config.rs @@ -105,6 +105,10 @@ pub struct Config { // deploy var [0.0..100.0] to sample behavior of interest for verbose logging #[envconfig(default = "0.0")] pub verbose_sample_percent: f32, + + // AI endpoint size limits + #[envconfig(default = "26214400")] // 25MB in bytes + pub ai_max_sum_of_parts_bytes: usize, } #[derive(Envconfig, Clone)] diff --git a/rust/capture/src/lib.rs b/rust/capture/src/lib.rs index c826f0d54c..72adb7d015 100644 --- a/rust/capture/src/lib.rs +++ b/rust/capture/src/lib.rs @@ -1,3 +1,4 @@ +pub mod 
ai_endpoint; pub mod api; pub mod config; pub mod limiters; diff --git a/rust/capture/src/router.rs b/rust/capture/src/router.rs index 8e4fcd4aae..f831caf5af 100644 --- a/rust/capture/src/router.rs +++ b/rust/capture/src/router.rs @@ -15,7 +15,7 @@ use tower_http::trace::TraceLayer; use crate::metrics_middleware::track_metrics; use crate::test_endpoint; -use crate::{sinks, time::TimeSource, v0_endpoint}; +use crate::{ai_endpoint, sinks, time::TimeSource, v0_endpoint}; use common_redis::Client; use limiters::token_dropper::TokenDropper; @@ -39,6 +39,7 @@ pub struct State { pub capture_mode: CaptureMode, pub is_mirror_deploy: bool, pub verbose_sample_percent: f32, + pub ai_max_sum_of_parts_bytes: usize, } #[derive(Clone)] @@ -114,6 +115,7 @@ pub fn router< historical_tokens_keys: Option, is_mirror_deploy: bool, verbose_sample_percent: f32, + ai_max_sum_of_parts_bytes: usize, ) -> Router { let state = State { sink: Arc::new(sink), @@ -130,6 +132,7 @@ pub fn router< capture_mode: capture_mode.clone(), is_mirror_deploy, verbose_sample_percent, + ai_max_sum_of_parts_bytes, }; // Very permissive CORS policy, as old SDK versions @@ -253,11 +256,26 @@ pub fn router< ) .layer(DefaultBodyLimit::max(RECORDING_BODY_SIZE)); + // AI endpoint body limit is 110% of max sum of parts to account for multipart overhead + let ai_body_limit = (state.ai_max_sum_of_parts_bytes as f64 * 1.1) as usize; + + let ai_router = Router::new() + .route( + "/i/v0/ai", + post(ai_endpoint::ai_handler).options(ai_endpoint::options), + ) + .route( + "/i/v0/ai/", + post(ai_endpoint::ai_handler).options(ai_endpoint::options), + ) + .layer(DefaultBodyLimit::max(ai_body_limit)); + let mut router = match capture_mode { CaptureMode::Events => Router::new() .merge(batch_router) .merge(event_router) - .merge(test_router), + .merge(test_router) + .merge(ai_router), CaptureMode::Recordings => Router::new().merge(recordings_router), }; diff --git a/rust/capture/src/server.rs b/rust/capture/src/server.rs index 68895b3ede..d40ecd5d0d 100644 --- a/rust/capture/src/server.rs +++ b/rust/capture/src/server.rs @@ -185,6 +185,7 @@ where config.historical_tokens_keys, config.is_mirror_deploy, config.verbose_sample_percent, + config.ai_max_sum_of_parts_bytes, ); // run our app with hyper diff --git a/rust/capture/tests/common/integration_utils.rs b/rust/capture/tests/common/integration_utils.rs index d5df54f61a..9cfa2a7781 100644 --- a/rust/capture/tests/common/integration_utils.rs +++ b/rust/capture/tests/common/integration_utils.rs @@ -994,6 +994,7 @@ fn setup_capture_router(unit: &TestCase) -> (Router, MemorySink) { historical_tokens_keys, is_mirror_deploy, verbose_sample_percent, + 26_214_400, // 25MB default for AI endpoint ), sink, ) diff --git a/rust/capture/tests/common/utils.rs b/rust/capture/tests/common/utils.rs index 9ceee1c1bb..7166f204be 100644 --- a/rust/capture/tests/common/utils.rs +++ b/rust/capture/tests/common/utils.rs @@ -80,6 +80,7 @@ pub static DEFAULT_CONFIG: Lazy = Lazy::new(|| Config { s3_fallback_endpoint: None, s3_fallback_prefix: String::new(), healthcheck_strategy: HealthStrategy::All, + ai_max_sum_of_parts_bytes: 26_214_400, // 25MB default }); static TRACING_INIT: Once = Once::new(); diff --git a/rust/capture/tests/integration_ai_endpoint.rs b/rust/capture/tests/integration_ai_endpoint.rs new file mode 100644 index 0000000000..a3899bb34e --- /dev/null +++ b/rust/capture/tests/integration_ai_endpoint.rs @@ -0,0 +1,1404 @@ +#[path = "common/integration_utils.rs"] +mod integration_utils; + +use async_trait::async_trait; 
+use axum::http::StatusCode;
+use axum::Router;
+use axum_test_helper::TestClient;
+use capture::api::CaptureError;
+use capture::config::CaptureMode;
+use capture::limiters::CaptureQuotaLimiter;
+use capture::router::router;
+use capture::sinks::Event;
+use capture::time::TimeSource;
+use capture::v0_request::ProcessedEvent;
+use chrono::{DateTime, Utc};
+use common_redis::MockRedisClient;
+use futures::StreamExt;
+use health::HealthRegistry;
+use integration_utils::{DEFAULT_CONFIG, DEFAULT_TEST_TIME};
+use limiters::token_dropper::TokenDropper;
+use reqwest::multipart::{Form, Part};
+use serde_json::{json, Value};
+use std::io::Write;
+use std::sync::Arc;
+use std::time::Duration;
+
+// Fixed time source for tests
+struct FixedTime {
+    pub time: DateTime<Utc>,
+}
+
+impl TimeSource for FixedTime {
+    fn current_time(&self) -> DateTime<Utc> {
+        self.time
+    }
+}
+
+// Simple memory sink for tests
+#[derive(Clone, Default)]
+struct TestSink;
+
+#[async_trait]
+impl Event for TestSink {
+    async fn send(&self, _event: ProcessedEvent) -> Result<(), CaptureError> {
+        Ok(())
+    }
+
+    async fn send_batch(&self, _events: Vec<ProcessedEvent>) -> Result<(), CaptureError> {
+        Ok(())
+    }
+}
+
+// Helper to build multipart form and send request
+async fn send_multipart_request(
+    client: &TestClient,
+    form: Form,
+    auth_token: Option<&str>,
+) -> axum_test_helper::TestResponse {
+    // Get the boundary from the form
+    let boundary = form.boundary().to_string();
+    let content_type = format!("multipart/form-data; boundary={boundary}");
+
+    // Use into_stream() to get the body bytes
+    let mut stream = form.into_stream();
+    let mut body = Vec::new();
+
+    while let Some(chunk) = stream.next().await {
+        body.extend_from_slice(&chunk.unwrap());
+    }
+
+    let mut request = client
+        .post("/i/v0/ai")
+        .header("Content-Type", content_type)
+        .body(body);
+
+    if let Some(token) = auth_token {
+        request = request.header("Authorization", format!("Bearer {token}"));
+    }
+
+    request.send().await
+}
+
+/// Helper to create a basic AI event with properties in separate parts
+fn create_ai_event_form(event_name: &str, distinct_id: &str, properties: Value) -> Form {
+    let event_data = json!({
+        "event": event_name,
+        "distinct_id": distinct_id
+    });
+
+    Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+}
+
+// Helper to setup test router
+fn setup_ai_test_router() -> Router {
+    let liveness = HealthRegistry::new("ai_endpoint_tests");
+    let sink = TestSink;
+    let timesource = FixedTime {
+        time: DateTime::parse_from_rfc3339(DEFAULT_TEST_TIME)
+            .expect("Invalid fixed time format")
+            .with_timezone(&Utc),
+    };
+    let redis = Arc::new(MockRedisClient::new());
+
+    let mut cfg = DEFAULT_CONFIG.clone();
+    cfg.capture_mode = CaptureMode::Events;
+
+    let quota_limiter =
+        CaptureQuotaLimiter::new(&cfg, redis.clone(), Duration::from_secs(60 * 60 * 24 * 7));
+
+    router(
+        timesource,
+        liveness,
+        sink,
+        redis,
+        quota_limiter,
+        TokenDropper::default(),
+        false,
+        CaptureMode::Events,
+        None,
+        25 * 1024 * 1024,
+        false,
+        1_i64,
+        None,
+        false,
+        0.0_f32,
+        26_214_400, // 25MB default for AI endpoint
+    )
+}
+
+// ============================================================================
+// PHASE 1: HTTP ENDPOINT
+// ============================================================================
+
+// 
---------------------------------------------------------------------------- +// Scenario 1.1: HTTP Method Validation +// ---------------------------------------------------------------------------- + +#[tokio::test] +async fn test_ai_endpoint_get_returns_405() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let response = test_client + .get("/i/v0/ai") + .header("Authorization", "Bearer test_token") + .send() + .await; + + assert_eq!(response.status(), StatusCode::METHOD_NOT_ALLOWED); +} + +#[tokio::test] +async fn test_ai_endpoint_put_returns_405() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let response = test_client + .put("/i/v0/ai") + .header("Authorization", "Bearer test_token") + .body("test") + .send() + .await; + + assert_eq!(response.status(), StatusCode::METHOD_NOT_ALLOWED); +} + +#[tokio::test] +async fn test_ai_endpoint_delete_returns_405() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let response = test_client + .delete("/i/v0/ai") + .header("Authorization", "Bearer test_token") + .send() + .await; + + assert_eq!(response.status(), StatusCode::METHOD_NOT_ALLOWED); +} + +// ---------------------------------------------------------------------------- +// Scenario 1.1: Authentication Validation +// ---------------------------------------------------------------------------- + +#[tokio::test] +async fn test_ai_endpoint_no_auth_returns_401() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let properties = json!({ + "$ai_model": "test" + }); + + let form = create_ai_event_form("$ai_generation", "test_user", properties); + + let response = send_multipart_request(&test_client, form, None).await; + assert_eq!(response.status(), StatusCode::UNAUTHORIZED); +} + +// ---------------------------------------------------------------------------- +// Scenario 1.1: Content Type Validation +// ---------------------------------------------------------------------------- + +#[tokio::test] +async fn test_ai_endpoint_wrong_content_type_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "$ai_generation", + "distinct_id": "test_user" + }); + + let response = test_client + .post("/i/v0/ai") + .header("Content-Type", "application/json") + .header( + "Authorization", + "Bearer phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3", + ) + .json(&event_data) + .send() + .await; + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn test_ai_endpoint_empty_body_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let response = test_client + .post("/i/v0/ai") + .header("Content-Type", "multipart/form-data; boundary=test") + .header( + "Authorization", + "Bearer phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3", + ) + .send() + .await; + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// ---------------------------------------------------------------------------- +// Scenario 1.2: Multipart Parsing +// ---------------------------------------------------------------------------- + +#[tokio::test] +async fn test_multipart_parsing_with_multiple_blobs() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "$ai_generation", + "distinct_id": "test_user" + }); + + let properties = json!({ 
+        "$ai_model": "test-multi-blob"
+    });
+
+    let input_blob = json!({"messages": [{"role": "user", "content": "Hello"}]});
+    let output_blob = json!({"choices": [{"message": {"content": "Hi there"}}]});
+    let metadata_blob = json!({"model_version": "1.0", "temperature": 0.7});
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_input",
+            Part::bytes(serde_json::to_vec(&input_blob).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_output",
+            Part::bytes(serde_json::to_vec(&output_blob).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_metadata",
+            Part::bytes(serde_json::to_vec(&metadata_blob).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts.len(), 5);
+    assert_eq!(accepted_parts[0]["name"], "event");
+    assert_eq!(accepted_parts[0]["length"].as_u64().unwrap(), 52);
+    assert_eq!(accepted_parts[1]["name"], "event.properties");
+    assert_eq!(accepted_parts[1]["length"].as_u64().unwrap(), 31);
+    assert_eq!(accepted_parts[2]["name"], "event.properties.$ai_input");
+    assert_eq!(accepted_parts[2]["length"].as_u64().unwrap(), 48);
+    assert_eq!(accepted_parts[3]["name"], "event.properties.$ai_output");
+    assert_eq!(accepted_parts[3]["length"].as_u64().unwrap(), 48);
+    assert_eq!(accepted_parts[4]["name"], "event.properties.$ai_metadata");
+    assert_eq!(accepted_parts[4]["length"].as_u64().unwrap(), 41);
+}
+
+#[tokio::test]
+async fn test_multipart_parsing_with_mixed_content_types() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user"
+    });
+
+    let properties = json!({
+        "$ai_model": "test-mixed-types"
+    });
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_json_blob",
+            Part::bytes(serde_json::to_vec(&json!({"type": "json"})).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_text_blob",
+            Part::bytes(b"This is plain text content".to_vec())
+                .mime_str("text/plain")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_binary_blob",
+            Part::bytes(vec![0x00, 0x01, 0x02, 0x03, 0x04, 0x05])
+                .mime_str("application/octet-stream")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts.len(), 5);
+    assert_eq!(accepted_parts[0]["length"].as_u64().unwrap(), 
52);
+    assert_eq!(accepted_parts[1]["length"].as_u64().unwrap(), 32);
+    assert_eq!(accepted_parts[2]["content-type"], "application/json");
+    assert_eq!(accepted_parts[2]["length"].as_u64().unwrap(), 15);
+    assert_eq!(accepted_parts[3]["content-type"], "text/plain");
+    assert_eq!(accepted_parts[3]["length"].as_u64().unwrap(), 26);
+    assert_eq!(
+        accepted_parts[4]["content-type"],
+        "application/octet-stream"
+    );
+    assert_eq!(accepted_parts[4]["length"].as_u64().unwrap(), 6);
+}
+
+#[tokio::test]
+async fn test_multipart_parsing_with_large_blob() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user"
+    });
+
+    let properties = json!({
+        "$ai_model": "test-large"
+    });
+
+    // Create a large JSON blob (100KB)
+    let large_blob = json!({
+        "messages": (0..100).map(|_| json!({"role": "user", "content": "x".repeat(1000)})).collect::<Vec<_>>()
+    });
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_large_input",
+            Part::bytes(serde_json::to_vec(&large_blob).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts.len(), 3);
+    assert_eq!(accepted_parts[0]["length"].as_u64().unwrap(), 52);
+    assert_eq!(accepted_parts[1]["length"].as_u64().unwrap(), 26);
+    assert_eq!(accepted_parts[2]["length"].as_u64().unwrap(), 102914);
+}
+
+#[tokio::test]
+async fn test_multipart_parsing_with_empty_blob() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user"
+    });
+
+    let properties = json!({
+        "$ai_model": "test-empty"
+    });
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_empty",
+            Part::bytes(Vec::new())
+                .mime_str("application/json")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts[2]["length"], 0);
+}
+
+// ----------------------------------------------------------------------------
+// Scenario 1.3: Boundary Validation
+// ----------------------------------------------------------------------------
+
+#[tokio::test]
+async fn test_multipart_missing_boundary_returns_400() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user",
+        "properties": {
+            
"$ai_model": "test-missing-boundary" + } + }); + + let form = Form::new().part( + "event", + Part::bytes(serde_json::to_vec(&event_data).unwrap()) + .file_name("event.json") + .mime_str("application/json") + .unwrap(), + ); + + // Use into_stream() to get the body bytes + let mut stream = form.into_stream(); + let mut body = Vec::new(); + + while let Some(chunk) = stream.next().await { + body.extend_from_slice(&chunk.unwrap()); + } + + // Send with missing boundary parameter in Content-Type + let response = test_client + .post("/i/v0/ai") + .header("Content-Type", "multipart/form-data") + .header( + "Authorization", + "Bearer phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3", + ) + .body(body) + .send() + .await; + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn test_multipart_corrupted_boundary_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "$ai_generation", + "distinct_id": "test_user", + "properties": { + "$ai_model": "test-corrupted-boundary" + } + }); + + // Manually construct a request with corrupted boundary + let body = format!( + "------WebKitFormBoundary1234567890abcdef\r\nContent-Disposition: form-data; name=\"event\"; filename=\"event.json\"\r\nContent-Type: application/json\r\n\r\n{}\r\n------WebKitFormBoundary1234567890abcdef--\r\n", + serde_json::to_string(&event_data).unwrap() + ); + + let response = test_client + .post("/i/v0/ai") + .header( + "Content-Type", + "multipart/form-data; boundary=corrupted------WebKitFormBoundary1234567890abcdef", + ) + .header( + "Authorization", + "Bearer phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3", + ) + .body(body) + .send() + .await; + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// ---------------------------------------------------------------------------- +// Scenario 1.4: Event Processing Verification +// ---------------------------------------------------------------------------- + +#[tokio::test] +async fn test_multipart_event_not_first_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "$ai_generation", + "distinct_id": "test_user", + "properties": { + "$ai_model": "test-event-not-first" + } + }); + + // Create multipart data with blob part first, then event part + let form = Form::new() + .part( + "event.properties.$ai_input", + Part::bytes( + serde_json::to_vec(&json!({"messages": [{"role": "user", "content": "test"}]})) + .unwrap(), + ) + .file_name("input.json") + .mime_str("application/json") + .unwrap(), + ) + .part( + "event", + Part::bytes(serde_json::to_vec(&event_data).unwrap()) + .file_name("event.json") + .mime_str("application/json") + .unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// ---------------------------------------------------------------------------- +// Scenario 1.5: Basic Validation +// ---------------------------------------------------------------------------- + +#[tokio::test] +async fn test_invalid_event_name_not_ai_prefix_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "invalid_event_name", + "distinct_id": "test_user", + "properties": { + "$ai_model": "test-invalid-name" + } + }); + + let form = 
Form::new().part( + "event", + Part::bytes(serde_json::to_vec(&event_data).unwrap()) + .file_name("event.json") + .mime_str("application/json") + .unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn test_invalid_event_name_regular_event_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "$pageview", + "distinct_id": "test_user", + "properties": { + "page": "/test" + } + }); + + let form = Form::new().part( + "event", + Part::bytes(serde_json::to_vec(&event_data).unwrap()) + .file_name("event.json") + .mime_str("application/json") + .unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn test_invalid_event_name_custom_event_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "button_clicked", + "distinct_id": "test_user", + "properties": { + "button": "submit" + } + }); + + let form = Form::new().part( + "event", + Part::bytes(serde_json::to_vec(&event_data).unwrap()) + .file_name("event.json") + .mime_str("application/json") + .unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn test_all_allowed_ai_event_types_accepted() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let allowed_events = vec![ + "$ai_generation", + "$ai_trace", + "$ai_span", + "$ai_embedding", + "$ai_metric", + "$ai_feedback", + ]; + + for event_name in allowed_events { + let properties = json!({ + "$ai_model": "test-model" + }); + + let form = create_ai_event_form(event_name, "test_user", properties); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!( + response.status(), + StatusCode::OK, + "Event type {event_name} should be accepted" + ); + } +} + +#[tokio::test] +async fn test_invalid_ai_event_type_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let invalid_events = vec!["$ai_unknown", "$ai_custom", "$ai_"]; + + for event_name in invalid_events { + let properties = json!({ + "$ai_model": "test-model" + }); + + let form = create_ai_event_form(event_name, "test_user", properties); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!( + response.status(), + StatusCode::BAD_REQUEST, + "Event type {event_name} should be rejected" + ); + } +} + +#[tokio::test] +async fn test_missing_required_ai_properties_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let event_data = json!({ + "event": "$ai_generation", + "distinct_id": "test_user", + "properties": { + "custom_property": "test_value" + } + }); + + let form = Form::new().part( + "event", + Part::bytes(serde_json::to_vec(&event_data).unwrap()) + .file_name("event.json") + 
.mime_str("application/json")
+            .unwrap(),
+    );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+}
+
+#[tokio::test]
+async fn test_empty_event_name_returns_400() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "",
+        "distinct_id": "test_user",
+        "properties": {
+            "$ai_model": "test-empty-name"
+        }
+    });
+
+    let form = Form::new().part(
+        "event",
+        Part::bytes(serde_json::to_vec(&event_data).unwrap())
+            .file_name("event.json")
+            .mime_str("application/json")
+            .unwrap(),
+    );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+}
+
+#[tokio::test]
+async fn test_missing_distinct_id_returns_400() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "properties": {
+            "$ai_model": "test-missing-distinct-id"
+        }
+    });
+
+    let form = Form::new().part(
+        "event",
+        Part::bytes(serde_json::to_vec(&event_data).unwrap())
+            .file_name("event.json")
+            .mime_str("application/json")
+            .unwrap(),
+    );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+}
+
+#[tokio::test]
+async fn test_ai_endpoint_returns_200_for_valid_request() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let properties = json!({
+        "$ai_model": "test"
+    });
+
+    let form = create_ai_event_form("$ai_generation", "test_user", properties);
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    assert!(response_json["accepted_parts"].is_array());
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts.len(), 2);
+    assert_eq!(accepted_parts[0]["name"], "event");
+    assert_eq!(accepted_parts[0]["content-type"], "application/json");
+    assert_eq!(accepted_parts[0]["length"].as_u64().unwrap(), 52);
+    assert_eq!(accepted_parts[1]["name"], "event.properties");
+    assert_eq!(accepted_parts[1]["content-type"], "application/json");
+    assert_eq!(accepted_parts[1]["length"].as_u64().unwrap(), 20);
+}
+
+// ----------------------------------------------------------------------------
+// Properties Handling Tests
+// ----------------------------------------------------------------------------
+
+#[tokio::test]
+async fn test_properties_in_event_part_only() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user",
+        "properties": {
+            "$ai_model": "embedded-model",
+            "custom_field": "embedded-value"
+        }
+    });
+
+    let form = Form::new().part(
+        "event",
+        Part::bytes(serde_json::to_vec(&event_data).unwrap())
+            .mime_str("application/json")
+            .unwrap(),
+    );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    
let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts.len(), 1);
+    assert_eq!(accepted_parts[0]["name"], "event");
+    assert_eq!(accepted_parts[0]["length"].as_u64().unwrap(), 128);
+}
+
+#[tokio::test]
+async fn test_properties_in_separate_part_only() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user"
+    });
+
+    let properties = json!({
+        "$ai_model": "separate-model",
+        "custom_field": "separate-value"
+    });
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts.len(), 2);
+    assert_eq!(accepted_parts[0]["name"], "event");
+    assert_eq!(accepted_parts[0]["length"].as_u64().unwrap(), 52);
+    assert_eq!(accepted_parts[1]["name"], "event.properties");
+    assert_eq!(accepted_parts[1]["length"].as_u64().unwrap(), 62);
+}
+
+#[tokio::test]
+async fn test_properties_both_embedded_and_separate_returns_400() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user",
+        "properties": {
+            "$ai_model": "embedded-model",
+            "custom_field": "embedded-value"
+        }
+    });
+
+    let properties = json!({
+        "$ai_model": "override-model",
+        "custom_field": "override-value"
+    });
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+}
+
+// ----------------------------------------------------------------------------
+// Size Limit Tests
+// ----------------------------------------------------------------------------
+
+#[tokio::test]
+async fn test_event_exceeds_32kb_returns_413() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    // Create event with large properties that exceed 32KB
+    let large_value = "x".repeat(33 * 1024);
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user",
+        "properties": {
+            "$ai_model": "test",
+            "large_field": large_value
+        }
+    });
+
+    let form = Form::new().part(
+        "event",
+        Part::bytes(serde_json::to_vec(&event_data).unwrap())
+            .mime_str("application/json")
+            .unwrap(),
+    );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
+}
+
+#[tokio::test]
+async fn test_combined_event_properties_exceeds_960kb_returns_413() {
+    
let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    // Create event and properties that together exceed 960KB
+    // 961KB to ensure we exceed the limit
+    let large_value = "x".repeat(961 * 1024);
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user"
+    });
+
+    let properties = json!({
+        "$ai_model": "test",
+        "large_field": large_value
+    });
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
+}
+
+#[tokio::test]
+async fn test_sum_of_all_parts_exceeds_25mb_returns_413() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user"
+    });
+
+    let properties = json!({
+        "$ai_model": "test"
+    });
+
+    // Create a blob that, combined with event and properties, exceeds 25MB
+    // Event ~52 bytes + properties ~24 bytes + blob ~25MB = exceeds limit
+    let large_blob = vec![0u8; 25 * 1024 * 1024];
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_input",
+            Part::bytes(large_blob)
+                .mime_str("application/octet-stream")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+    assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
+}
+
+#[tokio::test]
+async fn test_request_body_exceeds_110_percent_limit_returns_413() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let event_data = json!({
+        "event": "$ai_generation",
+        "distinct_id": "test_user"
+    });
+
+    let properties = json!({
+        "$ai_model": "test"
+    });
+
+    // Create a 28MB blob so the raw request body exceeds the 27.5MB
+    // (110% of 25MB) DefaultBodyLimit even before multipart overhead
+    let large_blob = vec![0u8; (25 * 1024 * 1024) + (3 * 1024 * 1024)]; // 28MB
+
+    let form = Form::new()
+        .part(
+            "event",
+            Part::bytes(serde_json::to_vec(&event_data).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties",
+            Part::bytes(serde_json::to_vec(&properties).unwrap())
+                .mime_str("application/json")
+                .unwrap(),
+        )
+        .part(
+            "event.properties.$ai_input",
+            Part::bytes(large_blob)
+                .mime_str("application/octet-stream")
+                .unwrap(),
+        );
+
+    let response = send_multipart_request(
+        &test_client,
+        form,
+        Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"),
+    )
+    .await;
+
+    // Axum's DefaultBodyLimit returns 413 Payload Too Large when the body exceeds the limit
+    assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
+}
+
+// ----------------------------------------------------------------------------
+// Content Type Validation Tests
+// ----------------------------------------------------------------------------
+
+#[tokio::test]
+async fn 
test_blob_with_application_octet_stream_content_type() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let properties = json!({ + "$ai_model": "test" + }); + + let blob_data = vec![0u8, 1u8, 2u8, 3u8]; + + let form = create_ai_event_form("$ai_generation", "test_user", properties).part( + "event.properties.$ai_binary_data", + Part::bytes(blob_data) + .mime_str("application/octet-stream") + .unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test] +async fn test_blob_with_application_json_content_type() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let properties = json!({ + "$ai_model": "test" + }); + + let blob_data = json!({"key": "value"}).to_string(); + + let form = create_ai_event_form("$ai_generation", "test_user", properties).part( + "event.properties.$ai_input", + Part::bytes(blob_data.into_bytes()) + .mime_str("application/json") + .unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test] +async fn test_blob_with_text_plain_content_type() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let properties = json!({ + "$ai_model": "test" + }); + + let blob_data = "This is plain text data"; + + let form = create_ai_event_form("$ai_generation", "test_user", properties).part( + "event.properties.$ai_output", + Part::bytes(blob_data.as_bytes().to_vec()) + .mime_str("text/plain") + .unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test] +async fn test_blob_with_invalid_content_type_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let properties = json!({ + "$ai_model": "test" + }); + + let blob_data = vec![0u8, 1u8, 2u8, 3u8]; + + let form = create_ai_event_form("$ai_generation", "test_user", properties).part( + "event.properties.$ai_data", + Part::bytes(blob_data).mime_str("application/xml").unwrap(), + ); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn test_blob_without_content_type_returns_400() { + let router = setup_ai_test_router(); + let test_client = TestClient::new(router); + + let properties = json!({ + "$ai_model": "test" + }); + + let blob_data = vec![0u8, 1u8, 2u8, 3u8]; + + // Create a part without Content-Type (reqwest doesn't set it if we don't call mime_str) + let form = create_ai_event_form("$ai_generation", "test_user", properties) + .part("event.properties.$ai_data", Part::bytes(blob_data)); + + let response = send_multipart_request( + &test_client, + form, + Some("phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3"), + ) + .await; + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// ---------------------------------------------------------------------------- +// Compression Tests +// ---------------------------------------------------------------------------- + +#[tokio::test] +async fn 
test_gzip_compressed_request() {
+    let router = setup_ai_test_router();
+    let test_client = TestClient::new(router);
+
+    let properties = json!({
+        "$ai_model": "test-gzip"
+    });
+
+    let form = create_ai_event_form("$ai_generation", "test_user", properties);
+
+    // Get the multipart body
+    let boundary = form.boundary().to_string();
+    let content_type = format!("multipart/form-data; boundary={boundary}");
+
+    let mut stream = form.into_stream();
+    let mut body = Vec::new();
+
+    while let Some(chunk) = stream.next().await {
+        body.extend_from_slice(&chunk.unwrap());
+    }
+
+    // Compress the body with gzip
+    let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
+    encoder.write_all(&body).unwrap();
+    let compressed_body = encoder.finish().unwrap();
+
+    // Send compressed request
+    let response = test_client
+        .post("/i/v0/ai")
+        .header("Content-Type", content_type)
+        .header("Content-Encoding", "gzip")
+        .header(
+            "Authorization",
+            "Bearer phc_VXRzc3poSG9GZm1JenRianJ6TTJFZGh4OWY2QXzx9f3",
+        )
+        .body(compressed_body)
+        .send()
+        .await;
+
+    assert_eq!(response.status(), StatusCode::OK);
+
+    // Verify response (lengths should match uncompressed data)
+    let response_json: serde_json::Value = response.json::<serde_json::Value>().await;
+    let accepted_parts = response_json["accepted_parts"].as_array().unwrap();
+    assert_eq!(accepted_parts.len(), 2);
+    assert_eq!(accepted_parts[0]["name"], "event");
+    assert_eq!(accepted_parts[0]["length"].as_u64().unwrap(), 52);
+    assert_eq!(accepted_parts[1]["name"], "event.properties");
+    assert_eq!(accepted_parts[1]["length"].as_u64().unwrap(), 25);
+}
diff --git a/rust/capture/tests/limiters.rs b/rust/capture/tests/limiters.rs
index 5901baea30..59b9748c0b 100644
--- a/rust/capture/tests/limiters.rs
+++ b/rust/capture/tests/limiters.rs
@@ -139,6 +139,7 @@ async fn setup_router_with_limits(
         None, // historical_tokens_keys
         false, // is_mirror_deploy
         0.0, // verbose_sample_percent
+        26_214_400, // ai_max_sum_of_parts_bytes (25MB)
     );
 
     (app, sink)
@@ -1182,6 +1183,7 @@ async fn test_survey_quota_cross_batch_first_submission_allowed() {
         None,
         false,
         0.0,
+        26_214_400,
     );
 
     let client = TestClient::new(app);
@@ -1257,6 +1259,7 @@ async fn test_survey_quota_cross_batch_duplicate_submission_dropped() {
         None,
         false,
         0.0,
+        26_214_400,
     );
 
     let client = TestClient::new(app);
@@ -1336,6 +1339,7 @@ async fn test_survey_quota_cross_batch_redis_error_fail_open() {
         None,
         false,
         0.0,
+        26_214_400,
     );
 
     let client = TestClient::new(app);
@@ -1752,6 +1756,7 @@ async fn test_ai_quota_cross_batch_redis_error_fail_open() {
         None,
         false,
         0.0,
+        26_214_400,
     );
 
     let client = TestClient::new(app);