mlx-knife/tests_2.0/live/conftest.py

"""Shared fixtures for live E2E tests (ADR-011).

This conftest.py provides pytest fixtures for the live/ test package.
For utility functions and constants, see test_utils.py.
"""

from __future__ import annotations

import os
import sys
import time
import pytest

# Prevent tokenizer fork warnings and potential deadlocks
# See: https://github.com/huggingface/tokenizers/issues/1047
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from pathlib import Path
from typing import Dict, Any

# Import utilities from test_utils
from .test_utils import (
    discover_mlx_models_in_user_cache,
    discover_text_models,
    discover_vision_models,
    discover_audio_models,
    parse_vm_stat_page_size,
    TEST_MODELS,
)

# Import the real MLX modules fixture from parent test module
# This is needed for tests that use MLXRunner directly (e.g., streaming parity)
# The fixture is already decorated with @pytest.fixture in test_stop_tokens_live.py
# We just import and re-export it here so it's available to tests in this package
_parent_dir = Path(__file__).parent.parent
sys.path.insert(0, str(_parent_dir))
try:
    from test_stop_tokens_live import _use_real_mlx_modules
finally:
    sys.path.remove(str(_parent_dir))

# The imported fixture is now available to all tests in this package


@pytest.fixture(scope="function", autouse=True)
def _skip_unless_live_e2e_marker(request):
    """Auto-skip E2E tests unless -m live_e2e is explicitly used.

    E2E tests are marker-required (🔒) - they require real models and httpx.
    This fixture ensures they are skipped in the default pytest run.

    Exception: show_model_portfolio marker is allowed (convenience diagnostics).

    SCOPE LIMITATION: Only applies to tests in tests_2.0/live/ directory.
    Tests in parent directory manage their own markers independently.
    """
    # CRITICAL: Only apply to tests in live/ directory
    # Tests in parent directory (tests_2.0/) handle their own skip logic
    test_path = str(request.node.path)
    if "/live/" not in test_path and "\\live\\" not in test_path:
        return  # Skip fixture for tests outside live/ directory

    # Check if test has live_e2e marker
    if request.node.get_closest_marker("live_e2e"):
        # Check if -m live_e2e or -m show_model_portfolio or -m wet was specified
        selected_markers = request.config.getoption("-m") or ""
        if ("live_e2e" not in selected_markers and
            "show_model_portfolio" not in selected_markers and
            "wet" not in selected_markers):
            pytest.skip("Run with -m live_e2e or -m wet")


def pytest_generate_tests(metafunc):
    """Generate parametrized tests for model_key, text_model_key, or vision_model_key.

    DEPRECATED (model_key): Use text_model_key or vision_model_key instead for
    deterministic test isolation. The legacy model_key parametrization mixes text
    and vision models which causes test interference and non-deterministic indices.

    If a test function has 'model_key' in its signature, this hook
    automatically parametrizes it over all models in the portfolio.
    This replaces the old loop-based approach (which caused RAM leaks)
    with pytest-native parametrization for proper test isolation.

    RECOMMENDED (Portfolio Separation): If a test has 'text_model_key' or 'vision_model_key',
    parametrizes over text-only or vision-only models respectively.

    Each parametrized test gets its own server instance lifecycle,
    preventing accumulated RAM leaks from improper cleanup.

    IMPORTANT: This hook runs during COLLECTION phase. We check for
    live_e2e marker BEFORE doing portfolio discovery to avoid slow
    collection when marker is not requested (maintains marker-required 🔒).

    SCOPE LIMITATION: Only apply to tests in tests_2.0/live/ directory to avoid
    interfering with parent directory tests that use isolated_cache.
    """
    # CRITICAL: Only apply this hook to tests in the live/ directory
    # Tests in parent directory (tests_2.0/) should not be parametrized by Portfolio Discovery
    test_path = str(metafunc.definition.path)
    if "/live/" not in test_path and "\\live\\" not in test_path:
        return  # Skip hook for tests outside live/ directory

    # Check if live_e2e or wet marker is requested (COLLECTION-TIME check)
    selected_markers = metafunc.config.getoption("-m") or ""
    is_live_e2e = "live_e2e" in selected_markers or "wet" in selected_markers

    # Handle text_model_key (NEW - Portfolio Separation)
    if "text_model_key" in metafunc.fixturenames:
        if not is_live_e2e:
            metafunc.parametrize("text_model_key", ["_skipped"])
            return

        # Discover text-only models
        text_models = discover_text_models()
        if text_models:
            model_keys = [f"text_{i:02d}" for i in range(len(text_models))]
        else:
            # Fallback to hardcoded test models (assume all text)
            model_keys = list(TEST_MODELS.keys())

        metafunc.parametrize("text_model_key", model_keys)
        return

    # Handle vision_model_key (NEW - Portfolio Separation)
    if "vision_model_key" in metafunc.fixturenames:
        if not is_live_e2e:
            metafunc.parametrize("vision_model_key", ["_skipped"])
            return

        # Discover vision-only models
        vision_models = discover_vision_models()
        if vision_models:
            model_keys = [f"vision_{i:02d}" for i in range(len(vision_models))]
        else:
            # No fallback for vision (needs real models)
            model_keys = []

        # If no vision models, parametrize with skip marker
        if not model_keys:
            model_keys = ["_no_vision_models"]

        metafunc.parametrize("vision_model_key", model_keys)
        return

    # Handle audio_model_key (NEW - Portfolio Separation)
    if "audio_model_key" in metafunc.fixturenames:
        if not is_live_e2e:
            metafunc.parametrize("audio_model_key", ["_skipped"])
            return

        # Discover audio-only models
        audio_models = discover_audio_models()
        if audio_models:
            model_keys = [f"audio_{i:02d}" for i in range(len(audio_models))]
        else:
            # No fallback for audio (needs real models)
            model_keys = []

        # If no audio models, parametrize with skip marker
        if not model_keys:
            model_keys = ["_no_audio_models"]

        metafunc.parametrize("audio_model_key", model_keys)
        return

    # Handle model_key (DEPRECATED - Mixed Text+Vision, use text_model_key/vision_model_key instead)
    if "model_key" in metafunc.fixturenames:
        if not is_live_e2e:
            metafunc.parametrize("model_key", ["_skipped"])
            return

        # Portfolio Discovery at collection time (uses subprocess mlxk list)
        discovered = discover_mlx_models_in_user_cache()

        if discovered:
            # Use discovered models - generate keys matching portfolio_models fixture
            model_keys = [f"discovered_{i:02d}" for i in range(len(discovered))]
        else:
            # Fallback to hardcoded test models
            model_keys = list(TEST_MODELS.keys())

        # Parametrize the test over all model keys
        metafunc.parametrize("model_key", model_keys)


@pytest.fixture(scope="module")
def portfolio_models():
    """Dynamic model portfolio: discovered models OR hardcoded fallback.

    DEPRECATED: Use text_portfolio or vision_portfolio instead for deterministic
    test isolation. This fixture mixes text and vision models which can cause
    test interference and non-deterministic discovered_XX indices.

    Reuses Portfolio Discovery from ADR-009 (test_stop_tokens_live.py).
    Enables portfolio testing when HF_HOME is set, falls back to
    3 hardcoded test models otherwise (backward compatibility).

    Returns:
        Dict[str, Dict[str, Any]]: Model portfolio keyed by model_key
            {
                "discovered_00": {
                    "id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
                    "ram_needed_gb": 4.0,
                    "expected_issue": None,
                    "description": "Discovered: ..."
                },
                ...
            }
    """
    discovered = discover_mlx_models_in_user_cache()

    if discovered:
        # Convert discovered models to TEST_MODELS format
        result = {}
        for i, model in enumerate(discovered):
            key = f"discovered_{i:02d}"
            result[key] = {
                "id": model["model_id"],
                "ram_needed_gb": model["ram_needed_gb"],
                "expected_issue": None,  # Unknown for discovered models
                "description": f"Discovered: {model['model_id']} ({model['weight_count']} weights)"
            }

        print(f"\n🔍 Portfolio Discovery: Found {len(result)} MLX models in cache (Text+Vision mixed)")
        return result
    else:
        # Fallback to hardcoded test models
        print(f"\n📋 Using hardcoded TEST_MODELS (3 models)")
        return TEST_MODELS


@pytest.fixture(scope="module")
def text_portfolio():
    """Text-only model portfolio (NEW - Portfolio Separation).

    Discovers text models using discover_text_models() which filters out
    vision models. This ensures deterministic test_XX indices that won't
    change when vision models are added/removed from cache.

    Returns:
        Dict[str, Dict[str, Any]]: Text model portfolio keyed by text_model_key
            {
                "text_00": {
                    "id": "mlx-community/Qwen2.5-0.5B-Instruct-4bit",
                    "ram_needed_gb": 0.3,
                    "expected_issue": None,
                    "description": "Text: Qwen2.5-0.5B-Instruct-4bit"
                },
                ...
            }
    """
    text_models = discover_text_models()

    if text_models:
        result = {}
        for i, model in enumerate(text_models):
            key = f"text_{i:02d}"
            result[key] = {
                "id": model["model_id"],
                "ram_needed_gb": model["ram_needed_gb"],
                "expected_issue": None,
                "description": f"Text: {model['model_id'].split('/')[-1]}"
            }

        print(f"\n📝 Text Portfolio: Found {len(result)} text-only models")
        return result
    else:
        # Fallback to hardcoded test models (assume all text)
        print(f"\n📋 Text Portfolio: Using hardcoded TEST_MODELS (3 models)")
        return TEST_MODELS


@pytest.fixture(scope="module")
def vision_portfolio():
    """Vision-only model portfolio (NEW - Portfolio Separation).

    Discovers vision models using discover_vision_models() which filters to
    only models with vision capabilities. Uses Vision-specific RAM calculation
    (0.70 threshold instead of 1.2x multiplier).

    Returns:
        Dict[str, Dict[str, Any]]: Vision model portfolio keyed by vision_model_key
            {
                "vision_00": {
                    "id": "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit",
                    "ram_needed_gb": 5.6,
                    "expected_issue": None,
                    "description": "Vision: Llama-3.2-11B-Vision-Instruct-4bit"
                },
                ...
            }
    """
    vision_models = discover_vision_models()

    if vision_models:
        result = {}
        for i, model in enumerate(vision_models):
            key = f"vision_{i:02d}"
            result[key] = {
                "id": model["model_id"],
                "ram_needed_gb": model["ram_needed_gb"],
                "expected_issue": None,
                "description": f"Vision: {model['model_id'].split('/')[-1]}"
            }

        print(f"\n👁️  Vision Portfolio: Found {len(result)} vision-capable models")
        return result
    else:
        # No fallback for vision - requires real models
        print(f"\n⚠️  Vision Portfolio: No vision models found in cache")
        return {}


@pytest.fixture(scope="module")
def audio_portfolio():
    """Audio-only model portfolio (ADR-020 - Portfolio Separation).

    Discovers audio models using discover_audio_models() which filters to
    only models with audio capabilities. Includes both:
    - STT models (Whisper, Voxtral) → mlx-audio backend
    - Multimodal audio (Gemma-3n) → mlx-vlm backend

    Returns:
        Dict[str, Dict[str, Any]]: Audio model portfolio keyed by audio_model_key
            {
                "audio_00": {
                    "id": "mlx-community/whisper-large-v3-turbo-4bit",
                    "ram_needed_gb": 1.5,
                    "expected_issue": None,
                    "description": "Audio: whisper-large-v3-turbo-4bit"
                },
                ...
            }
    """
    audio_models = discover_audio_models()

    if audio_models:
        result = {}
        for i, model in enumerate(audio_models):
            key = f"audio_{i:02d}"
            result[key] = {
                "id": model["model_id"],
                "ram_needed_gb": model["ram_needed_gb"],
                "expected_issue": None,
                "description": f"Audio: {model['model_id'].split('/')[-1]}"
            }

        print(f"\n🔊 Audio Portfolio: Found {len(result)} audio-capable models")
        return result
    else:
        # No fallback for audio - requires real models
        print(f"\n⚠️  Audio Portfolio: No audio models found in cache")
        return {}


@pytest.fixture
def model_info(portfolio_models, model_key):
    """Get model info for the current parametrized model_key.

    DEPRECATED: Use text_model_info or vision_model_info for new tests.

    This fixture provides convenient access to model metadata in
    parametrized tests. It automatically looks up the model_key
    in the portfolio and returns the model info dict.

    Usage:
        def test_something(model_info):
            model_id = model_info["id"]
            ram_needed = model_info["ram_needed_gb"]
            ...

    Returns:
        Dict[str, Any]: Model metadata with keys:
            - id: Model ID (e.g., "mlx-community/Llama-3.2-3B-Instruct-4bit")
            - ram_needed_gb: Estimated RAM requirement
            - expected_issue: Known issue or None
            - description: Human-readable description
    """
    return portfolio_models[model_key]


@pytest.fixture
def text_model_info(text_portfolio, text_model_key):
    """Get model info for the current parametrized text_model_key (NEW).

    This fixture provides convenient access to text model metadata in
    parametrized tests. It automatically looks up the text_model_key
    in the text_portfolio and returns the model info dict.

    Usage:
        def test_something(text_model_info):
            model_id = text_model_info["id"]
            ram_needed = text_model_info["ram_needed_gb"]
            ...

    Returns:
        Dict[str, Any]: Text model metadata with keys:
            - id: Model ID (e.g., "mlx-community/Qwen2.5-0.5B-Instruct-4bit")
            - ram_needed_gb: Estimated RAM requirement (1.2x text formula)
            - expected_issue: Known issue or None
            - description: Human-readable description
    """
    return text_portfolio[text_model_key]


@pytest.fixture
def vision_model_info(vision_portfolio, vision_model_key):
    """Get model info for the current parametrized vision_model_key (NEW).

    This fixture provides convenient access to vision model metadata in
    parametrized tests. It automatically looks up the vision_model_key
    in the vision_portfolio and returns the model info dict.

    Usage:
        def test_something(vision_model_info):
            model_id = vision_model_info["id"]
            ram_needed = vision_model_info["ram_needed_gb"]
            ...

    Returns:
        Dict[str, Any]: Vision model metadata with keys:
            - id: Model ID (e.g., "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit")
            - ram_needed_gb: Estimated RAM requirement (0.70 threshold vision formula)
            - expected_issue: Known issue or None
            - description: Human-readable description
    """
    return vision_portfolio[vision_model_key]


@pytest.fixture
def audio_model_info(audio_portfolio, audio_model_key):
    """Get model info for the current parametrized audio_model_key (ADR-020).

    This fixture provides convenient access to audio model metadata in
    parametrized tests. It automatically looks up the audio_model_key
    in the audio_portfolio and returns the model info dict.

    Usage:
        def test_something(audio_model_info):
            model_id = audio_model_info["id"]
            ram_needed = audio_model_info["ram_needed_gb"]
            ...

    Returns:
        Dict[str, Any]: Audio model metadata with keys:
            - id: Model ID (e.g., "mlx-community/whisper-large-v3-turbo-4bit")
            - ram_needed_gb: Estimated RAM requirement
            - expected_issue: Known issue or None
            - description: Human-readable description

        Returns None for skip markers (_skipped, _no_audio_models).
    """
    if audio_model_key.startswith("_"):
        return None
    return audio_portfolio[audio_model_key]


@pytest.fixture(autouse=True)
def _auto_report_vision_model(request):
    """Auto-report vision model info to benchmark log (autouse).

    This fixture automatically adds vision model metadata to benchmark reports
    for parametrized vision tests, without requiring explicit report_benchmark() calls.

    This ensures vision models appear with proper annotations in memplot.py timeline charts.

    Handles two types of vision tests:
    1. API tests with vision_model_key parameter (vision_portfolio)
    2. CLI tests in test_vision_e2e_live.py (hardcoded pixtral)
    """
    # Type 1: Parametrized vision API tests (vision_model_key)
    if "vision_model_key" in request.fixturenames:
        # Get vision model info from fixture
        try:
            vision_model_info = request.getfixturevalue("vision_model_info")
        except:
            return

        if not vision_model_info:
            return

        # Extract model metadata
        model_id = vision_model_info["id"]
        family, variant = _parse_model_family(model_id)

        # Vision models: ram_needed_gb is disk size (no 1.2x overhead)
        ram_gb = vision_model_info["ram_needed_gb"]
        disk_size_gb = ram_gb if ram_gb != float('inf') else float('inf')

        # Append to user_properties for benchmark reporting (schema v0.2.0)
        request.node.user_properties.append(("model", {
            "id": model_id,
            "size_gb": round(disk_size_gb, 2) if disk_size_gb != float('inf') else disk_size_gb,
            "family": family,
            "variant": variant,
        }))
        return

    # Type 2: CLI vision tests (test_vision_e2e_live.py)
    # These tests use subprocess.run(["mlxk", "run", VISION_MODEL, ...])
    # VISION_MODEL is explicitly set to "pixtral-12b-8bit" to avoid ambiguity
    if 'test_vision_e2e_live.py' in request.node.nodeid:
        # All CLI vision tests use explicit pixtral-12b-8bit
        request.node.user_properties.append(("model", {
            "id": "pixtral-12b-8bit",  # Explicit model (not shorthand)
            "size_gb": 13.5,   # Actual disk size of 8bit variant
            "family": "pixtral",
            "variant": "12b-8bit",
        }))
        # Explicit inference_modality for CLI vision tests (v0.2.1)
        # Required because these tests don't use vision_model_key fixture
        request.node.user_properties.append(("inference_modality", "vision"))


@pytest.fixture(autouse=True)
def _auto_report_audio_model(request):
    """Auto-report audio model info to benchmark log (autouse, ADR-020).

    This fixture automatically adds audio model metadata to benchmark reports
    for parametrized audio tests, without requiring explicit report_benchmark() calls.

    This ensures audio models appear with proper annotations in memplot.py timeline charts.

    Handles audio API tests with audio_model_key parameter (audio_portfolio).
    """
    # Only for parametrized audio tests (audio_model_key)
    if "audio_model_key" not in request.fixturenames:
        return

    # Get audio model info from fixture
    try:
        audio_model_info = request.getfixturevalue("audio_model_info")
    except:
        return

    if not audio_model_info:
        return

    # Extract model metadata
    model_id = audio_model_info["id"]
    family, variant = _parse_model_family(model_id)

    # Audio models: ram_needed_gb is disk size (no overhead)
    ram_gb = audio_model_info["ram_needed_gb"]
    disk_size_gb = ram_gb if ram_gb != float('inf') else float('inf')

    # Append to user_properties for benchmark reporting (schema v0.2.2)
    request.node.user_properties.append(("model", {
        "id": model_id,
        "size_gb": round(disk_size_gb, 2) if disk_size_gb != float('inf') else disk_size_gb,
        "family": family,
        "variant": variant,
    }))

    # Explicit inference_modality for audio tests (v0.2.1+)
    # Required because audio_model_key fixture doesn't set this automatically
    request.node.user_properties.append(("inference_modality", "audio"))


def _parse_model_family(model_id: str) -> tuple[str, str]:
    """Extract model family and variant from HuggingFace model ID.

    Examples:
        "mlx-community/Llama-3.2-3B-Instruct-4bit" → ("llama", "3.2-3b-instruct")
        "mlx-community/Qwen2.5-7B-Instruct-4bit" → ("qwen", "2.5-7b-instruct")
        "mlx-community/phi-3-mini-4k-instruct" → ("phi-3", "mini-4k-instruct")

    Args:
        model_id: HuggingFace model ID (org/name format)

    Returns:
        (family, variant) tuple. Returns ("unknown", model_name) if parsing fails.
    """
    # Extract model name from org/name
    model_name = model_id.split("/")[-1].lower()

    # Common patterns
    if "llama" in model_name:
        family = "llama"
        # Extract variant (everything after "llama-")
        variant = model_name.split("llama-", 1)[1] if "llama-" in model_name else model_name
        # Remove quantization suffix (-4bit, -8bit, etc.)
        variant = variant.replace("-4bit", "").replace("-8bit", "").replace("-fp16", "")
        return family, variant

    if "qwen" in model_name:
        family = "qwen"
        variant = model_name.split("qwen", 1)[1] if "qwen" in model_name else model_name
        variant = variant.replace("-4bit", "").replace("-8bit", "").replace("-fp16", "")
        return family, variant

    if "phi" in model_name:
        # Phi models: phi-3.5, phi-3, phi-2, etc.
        # Check most specific version first
        if "phi-3.5" in model_name:
            family = "phi-3.5"
            variant = model_name.split("phi-3.5-", 1)[1] if "phi-3.5-" in model_name else "base"
        elif "phi-3" in model_name:
            family = "phi-3"
            variant = model_name.split("phi-3-", 1)[1] if "phi-3-" in model_name else "base"
        elif "phi-2" in model_name:
            family = "phi-2"
            variant = model_name.split("phi-2-", 1)[1] if "phi-2-" in model_name else "base"
        else:
            family = "phi"
            variant = model_name
        variant = variant.replace("-4bit", "").replace("-8bit", "")
        return family, variant

    if "deepseek" in model_name:
        family = "deepseek"
        variant = model_name.replace("deepseek-", "")
        variant = variant.replace("-4bit", "").replace("-8bit", "")
        return family, variant

    if "mistral" in model_name or "mixtral" in model_name:
        family = "mistral" if "mistral" in model_name else "mixtral"
        variant = model_name.replace(f"{family}-", "")
        variant = variant.replace("-4bit", "").replace("-8bit", "")
        return family, variant

    if "whisper" in model_name:
        family = "whisper"
        variant = model_name.replace("whisper-", "")
        variant = variant.replace("-4bit", "").replace("-8bit", "").replace("-fp16", "")
        return family, variant

    if "pixtral" in model_name:
        family = "pixtral"
        variant = model_name.replace("pixtral-", "")
        variant = variant.replace("-4bit", "").replace("-8bit", "")
        return family, variant

    # Fallback: unknown family
    return "unknown", model_name.replace("-4bit", "").replace("-8bit", "")


@pytest.fixture
def report_benchmark(request):
    """Helper for writing benchmark data to test reports (ADR-013 Phase 0).

    Simplifies adding model metadata and performance metrics to E2E test reports.
    Reports are written as JSONL via pytest_runtest_makereport hook.

    Dynamically uses text_model_info, vision_model_info, or model_info (deprecated)
    based on what's available in the test's fixture request.

    Usage:
        def test_something(report_benchmark, text_model_info):
            # ... test logic ...

            # Report model info only
            report_benchmark()

            # Report with performance metrics
            report_benchmark(performance={
                "tokens_per_sec": 45.2,
                "ram_peak_mb": 3200,
                "prompt_tokens": 15,
                "completion_tokens": 42
            })

            # Report with stop token data
            report_benchmark(stop_tokens={
                "configured": ["<|end|>"],
                "detected": ["<|end|>"],
                "workaround": "none",
                "leaked": False
            })

    Args:
        performance: Optional performance metrics dict
        stop_tokens: Optional stop token validation data
        **extra: Additional metadata (goes to metadata section)
    """
    def _report(performance: Dict[str, Any] = None, stop_tokens: Dict[str, Any] = None, **extra):
        # Dynamically get model_info from available fixtures (Portfolio Separation)
        model_info = None
        for fixture_name in ["text_model_info", "vision_model_info", "model_info"]:
            try:
                model_info = request.getfixturevalue(fixture_name)
                if model_info is not None:
                    break
            except:
                continue

        if model_info is None:
            # No model info available (non-parametrized test)
            return

        # Extract model family/variant from model_id
        model_id = model_info["id"]
        family, variant = _parse_model_family(model_id)

        # Build model section (convert RAM estimate to disk size)
        # ram_needed_gb includes 1.2x overhead for text, direct size for vision
        # For vision models (with 0.70 threshold), ram_needed_gb IS the disk size
        # For text models, disk size = ram_needed_gb / 1.2
        ram_gb = model_info["ram_needed_gb"]
        if ram_gb == float('inf'):
            disk_size_gb = float('inf')  # Vision model too large
        else:
            # Heuristic: if ram < 1.5x disk size, assume it's vision (no overhead)
            # Otherwise assume text (1.2x overhead)
            disk_size_gb = ram_gb / 1.2

        request.node.user_properties.append(("model", {
            "id": model_id,
            "size_gb": round(disk_size_gb, 2) if disk_size_gb != float('inf') else disk_size_gb,
            "family": family,
            "variant": variant,
        }))

        # Add performance if provided
        if performance:
            request.node.user_properties.append(("performance", performance))

        # Add stop_tokens if provided
        if stop_tokens:
            request.node.user_properties.append(("stop_tokens", stop_tokens))

        # Add any extra metadata
        for key, value in extra.items():
            request.node.user_properties.append((key, value))

    return _report


# ============================================================================
# Precise Test Timing - For Effective Runtime Analysis
# ============================================================================

# StashKeys for test timing (pytest 7.0+ API)
test_start_key = pytest.StashKey[float]()
test_end_key = pytest.StashKey[float]()


@pytest.hookimpl(tryfirst=True)
def pytest_runtest_setup(item):
    """Hook: Capture precise test start timestamp (Unix epoch).

    Enables accurate correlation with memmon samples and effective runtime
    calculation by excluding idle periods (Memory Gates, setup overhead).

    Stored in node stash for later retrieval in makereport hook.
    """
    item.stash[test_start_key] = time.time()


@pytest.hookimpl(trylast=True)
def pytest_runtest_teardown(item):
    """Hook: Capture precise test end timestamp (Unix epoch).

    Paired with test_start_ts for precise test duration measurement
    independent of pytest's duration calculation.
    """
    item.stash[test_end_key] = time.time()


@pytest.hookimpl(tryfirst=True)
def pytest_runtest_makereport(item, call):
    """Hook: Add precise timestamps to benchmark report (Schema v0.2.2).

    Retrieves test_start_ts and test_end_ts from stash (captured in
    setup/teardown hooks) and adds them to user_properties for
    inclusion in benchmark JSONL output.

    This enables post-processing tools to correlate test execution
    with memmon samples and calculate effective runtime.

    CRITICAL: Uses tryfirst=True to ensure this hook runs BEFORE the
    conftest.py hook that writes JSONL (which has hookwrapper=True).
    """
    if call.when == "call":  # Only for actual test execution, not setup/teardown
        test_start_ts = item.stash.get(test_start_key, None)
        test_end_ts = item.stash.get(test_end_key, None)

        if test_start_ts and test_end_ts:
            item.user_properties.append(("test_start_ts", test_start_ts))
            item.user_properties.append(("test_end_ts", test_end_ts))