mirror of
https://github.com/cloudstack-llc/mlx-knife.git
synced 2026-07-01 20:44:14 -04:00
bf7480d042
Major Features: - Audio transcription via mlx-audio backend (Whisper, >10min duration) - OpenAI /v1/audio/transcriptions endpoint - Memory Gate System (Vision: 8GB, Audio: 4GB) - Config-based backend routing (ADR-020) - Benchmark toolchain (memmon/memplot, Schema v0.2.2) Key Fixes: - EuroLLM tokenizer decoding - Vision-model text-only routing regression - Multimodal model context length detection - Memory cleanup bug (mx.metal.clear_cache) - Orphan process bug Test Results: - Unit tests: 647 passed, 11 skipped (Python 3.10-3.12) - wet-umbrella: 171 passed total See CHANGELOG.md for complete details and known issues.
780 lines
29 KiB
Python
780 lines
29 KiB
Python
"""Shared fixtures for live E2E tests (ADR-011).
|
|
|
|
This conftest.py provides pytest fixtures for the live/ test package.
|
|
For utility functions and constants, see test_utils.py.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import pytest
|
|
|
|
# Prevent tokenizer fork warnings and potential deadlocks
|
|
# See: https://github.com/huggingface/tokenizers/issues/1047
|
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
from pathlib import Path
|
|
from typing import Dict, Any
|
|
|
|
# Import utilities from test_utils
|
|
from .test_utils import (
|
|
discover_mlx_models_in_user_cache,
|
|
discover_text_models,
|
|
discover_vision_models,
|
|
discover_audio_models,
|
|
parse_vm_stat_page_size,
|
|
TEST_MODELS,
|
|
)
|
|
|
|
# Import the real MLX modules fixture from parent test module
|
|
# This is needed for tests that use MLXRunner directly (e.g., streaming parity)
|
|
# The fixture is already decorated with @pytest.fixture in test_stop_tokens_live.py
|
|
# We just import and re-export it here so it's available to tests in this package
|
|
# Directory one level above this package; it holds test_stop_tokens_live.py.
_parent_dir = Path(__file__).parent.parent
# Temporarily put that directory first on sys.path so the bare module name
# used in the import below can be resolved.
sys.path.insert(0, str(_parent_dir))
try:
    # Imported only for its side effect of binding the fixture name in this
    # conftest's namespace (the name is not used directly in this module).
    from test_stop_tokens_live import _use_real_mlx_modules
finally:
    # Always undo the sys.path change, even when the import fails.
    sys.path.remove(str(_parent_dir))
|
|
|
|
# The imported fixture is now available to all tests in this package
|
|
|
|
|
|
@pytest.fixture(scope="function", autouse=True)
def _skip_unless_live_e2e_marker(request):
    """Skip ``live_e2e``-marked tests unless the ``-m`` expression opts in.

    The fixture only acts on test files whose path contains a ``live``
    directory component; everything else is left alone. A test carrying the
    ``live_e2e`` marker is skipped unless the ``-m`` option text contains
    ``live_e2e``, ``show_model_portfolio`` or ``wet``.

    NOTE: the opt-in check is a plain substring test on the ``-m``
    expression, not a parsed marker expression.
    """
    node_location = str(request.node.path)
    in_live_package = "/live/" in node_location or "\\live\\" in node_location
    if not in_live_package:
        # Tests outside live/ manage their own skip logic.
        return

    if not request.node.get_closest_marker("live_e2e"):
        # Unmarked tests are never auto-skipped by this fixture.
        return

    marker_expression = request.config.getoption("-m") or ""
    opt_in_names = ("live_e2e", "show_model_portfolio", "wet")
    if not any(name in marker_expression for name in opt_in_names):
        pytest.skip("Run with -m live_e2e or -m wet")
|
|
|
|
|
|
def pytest_generate_tests(metafunc):
|
|
"""Generate parametrized tests for model_key, text_model_key, or vision_model_key.
|
|
|
|
DEPRECATED (model_key): Use text_model_key or vision_model_key instead for
|
|
deterministic test isolation. The legacy model_key parametrization mixes text
|
|
and vision models which causes test interference and non-deterministic indices.
|
|
|
|
If a test function has 'model_key' in its signature, this hook
|
|
automatically parametrizes it over all models in the portfolio.
|
|
This replaces the old loop-based approach (which caused RAM leaks)
|
|
with pytest-native parametrization for proper test isolation.
|
|
|
|
RECOMMENDED (Portfolio Separation): If a test has 'text_model_key' or 'vision_model_key',
|
|
parametrizes over text-only or vision-only models respectively.
|
|
|
|
Each parametrized test gets its own server instance lifecycle,
|
|
preventing accumulated RAM leaks from improper cleanup.
|
|
|
|
IMPORTANT: This hook runs during COLLECTION phase. We check for
|
|
live_e2e marker BEFORE doing portfolio discovery to avoid slow
|
|
collection when marker is not requested (maintains marker-required 🔒).
|
|
|
|
SCOPE LIMITATION: Only apply to tests in tests_2.0/live/ directory to avoid
|
|
interfering with parent directory tests that use isolated_cache.
|
|
"""
|
|
# CRITICAL: Only apply this hook to tests in the live/ directory
|
|
# Tests in parent directory (tests_2.0/) should not be parametrized by Portfolio Discovery
|
|
test_path = str(metafunc.definition.path)
|
|
if "/live/" not in test_path and "\\live\\" not in test_path:
|
|
return # Skip hook for tests outside live/ directory
|
|
|
|
# Check if live_e2e or wet marker is requested (COLLECTION-TIME check)
|
|
selected_markers = metafunc.config.getoption("-m") or ""
|
|
is_live_e2e = "live_e2e" in selected_markers or "wet" in selected_markers
|
|
|
|
# Handle text_model_key (NEW - Portfolio Separation)
|
|
if "text_model_key" in metafunc.fixturenames:
|
|
if not is_live_e2e:
|
|
metafunc.parametrize("text_model_key", ["_skipped"])
|
|
return
|
|
|
|
# Discover text-only models
|
|
text_models = discover_text_models()
|
|
if text_models:
|
|
model_keys = [f"text_{i:02d}" for i in range(len(text_models))]
|
|
else:
|
|
# Fallback to hardcoded test models (assume all text)
|
|
model_keys = list(TEST_MODELS.keys())
|
|
|
|
metafunc.parametrize("text_model_key", model_keys)
|
|
return
|
|
|
|
# Handle vision_model_key (NEW - Portfolio Separation)
|
|
if "vision_model_key" in metafunc.fixturenames:
|
|
if not is_live_e2e:
|
|
metafunc.parametrize("vision_model_key", ["_skipped"])
|
|
return
|
|
|
|
# Discover vision-only models
|
|
vision_models = discover_vision_models()
|
|
if vision_models:
|
|
model_keys = [f"vision_{i:02d}" for i in range(len(vision_models))]
|
|
else:
|
|
# No fallback for vision (needs real models)
|
|
model_keys = []
|
|
|
|
# If no vision models, parametrize with skip marker
|
|
if not model_keys:
|
|
model_keys = ["_no_vision_models"]
|
|
|
|
metafunc.parametrize("vision_model_key", model_keys)
|
|
return
|
|
|
|
# Handle audio_model_key (NEW - Portfolio Separation)
|
|
if "audio_model_key" in metafunc.fixturenames:
|
|
if not is_live_e2e:
|
|
metafunc.parametrize("audio_model_key", ["_skipped"])
|
|
return
|
|
|
|
# Discover audio-only models
|
|
audio_models = discover_audio_models()
|
|
if audio_models:
|
|
model_keys = [f"audio_{i:02d}" for i in range(len(audio_models))]
|
|
else:
|
|
# No fallback for audio (needs real models)
|
|
model_keys = []
|
|
|
|
# If no audio models, parametrize with skip marker
|
|
if not model_keys:
|
|
model_keys = ["_no_audio_models"]
|
|
|
|
metafunc.parametrize("audio_model_key", model_keys)
|
|
return
|
|
|
|
# Handle model_key (DEPRECATED - Mixed Text+Vision, use text_model_key/vision_model_key instead)
|
|
if "model_key" in metafunc.fixturenames:
|
|
if not is_live_e2e:
|
|
metafunc.parametrize("model_key", ["_skipped"])
|
|
return
|
|
|
|
# Portfolio Discovery at collection time (uses subprocess mlxk list)
|
|
discovered = discover_mlx_models_in_user_cache()
|
|
|
|
if discovered:
|
|
# Use discovered models - generate keys matching portfolio_models fixture
|
|
model_keys = [f"discovered_{i:02d}" for i in range(len(discovered))]
|
|
else:
|
|
# Fallback to hardcoded test models
|
|
model_keys = list(TEST_MODELS.keys())
|
|
|
|
# Parametrize the test over all model keys
|
|
metafunc.parametrize("model_key", model_keys)
|
|
|
|
|
|
@pytest.fixture(scope="module")
def portfolio_models():
    """Mixed (text + vision) model portfolio, module-scoped.

    DEPRECATED: prefer ``text_portfolio`` / ``vision_portfolio``; this mixed
    portfolio produces ``discovered_NN`` indices that shift whenever the set
    of cached models changes.

    Returns:
        Dict[str, Dict[str, Any]]: when ``discover_mlx_models_in_user_cache()``
        finds models, one entry per model keyed ``discovered_NN`` with the
        fields ``id``, ``ram_needed_gb``, ``expected_issue`` (always ``None``,
        unknown for discovered models) and ``description``. Otherwise the
        hardcoded ``TEST_MODELS`` mapping is returned unchanged.
    """
    found = discover_mlx_models_in_user_cache()

    if not found:
        # Nothing discovered: fall back to the hardcoded portfolio.
        print("\n📋 Using hardcoded TEST_MODELS (3 models)")
        return TEST_MODELS

    # Re-shape discovery records into the TEST_MODELS entry layout.
    portfolio = {
        f"discovered_{position:02d}": {
            "id": entry["model_id"],
            "ram_needed_gb": entry["ram_needed_gb"],
            "expected_issue": None,
            "description": f"Discovered: {entry['model_id']} ({entry['weight_count']} weights)",
        }
        for position, entry in enumerate(found)
    }

    print(f"\n🔍 Portfolio Discovery: Found {len(portfolio)} MLX models in cache (Text+Vision mixed)")
    return portfolio
|
|
|
|
|
|
@pytest.fixture(scope="module")
def text_portfolio():
    """Text-only model portfolio, module-scoped (Portfolio Separation).

    Built from ``discover_text_models()``, so ``text_NN`` indices do not
    depend on the mixed discovery list.

    Returns:
        Dict[str, Dict[str, Any]]: one entry per discovered text model keyed
        ``text_NN`` with the fields ``id``, ``ram_needed_gb``,
        ``expected_issue`` (always ``None``) and ``description``
        (``"Text: <name after the last '/'>"``). When nothing is discovered
        the hardcoded ``TEST_MODELS`` mapping is returned unchanged.
    """
    found = discover_text_models()

    if not found:
        # Fallback assumes every hardcoded test model is a text model.
        print("\n📋 Text Portfolio: Using hardcoded TEST_MODELS (3 models)")
        return TEST_MODELS

    portfolio = {
        f"text_{position:02d}": {
            "id": entry["model_id"],
            "ram_needed_gb": entry["ram_needed_gb"],
            "expected_issue": None,
            "description": f"Text: {entry['model_id'].split('/')[-1]}",
        }
        for position, entry in enumerate(found)
    }

    print(f"\n📝 Text Portfolio: Found {len(portfolio)} text-only models")
    return portfolio
|
|
|
|
|
|
@pytest.fixture(scope="module")
def vision_portfolio():
    """Vision-only model portfolio, module-scoped (Portfolio Separation).

    Built from ``discover_vision_models()``. There is no hardcoded fallback:
    vision tests need real models.

    Returns:
        Dict[str, Dict[str, Any]]: one entry per discovered vision model keyed
        ``vision_NN`` with the fields ``id``, ``ram_needed_gb``,
        ``expected_issue`` (always ``None``) and ``description``
        (``"Vision: <name after the last '/'>"``). An empty dict when no
        vision model is discovered.
    """
    found = discover_vision_models()

    if not found:
        print("\n⚠️  Vision Portfolio: No vision models found in cache")
        return {}

    portfolio = {
        f"vision_{position:02d}": {
            "id": entry["model_id"],
            "ram_needed_gb": entry["ram_needed_gb"],
            "expected_issue": None,
            "description": f"Vision: {entry['model_id'].split('/')[-1]}",
        }
        for position, entry in enumerate(found)
    }

    print(f"\n👁️ Vision Portfolio: Found {len(portfolio)} vision-capable models")
    return portfolio
|
|
|
|
|
|
@pytest.fixture(scope="module")
def audio_portfolio():
    """Audio-only model portfolio, module-scoped (ADR-020, Portfolio Separation).

    Built from ``discover_audio_models()``. There is no hardcoded fallback:
    audio tests need real models.

    Returns:
        Dict[str, Dict[str, Any]]: one entry per discovered audio model keyed
        ``audio_NN`` with the fields ``id``, ``ram_needed_gb``,
        ``expected_issue`` (always ``None``) and ``description``
        (``"Audio: <name after the last '/'>"``). An empty dict when no
        audio model is discovered.
    """
    found = discover_audio_models()

    if not found:
        print("\n⚠️  Audio Portfolio: No audio models found in cache")
        return {}

    portfolio = {
        f"audio_{position:02d}": {
            "id": entry["model_id"],
            "ram_needed_gb": entry["ram_needed_gb"],
            "expected_issue": None,
            "description": f"Audio: {entry['model_id'].split('/')[-1]}",
        }
        for position, entry in enumerate(found)
    }

    print(f"\n🔊 Audio Portfolio: Found {len(portfolio)} audio-capable models")
    return portfolio
|
|
|
|
|
|
@pytest.fixture
def model_info(portfolio_models, model_key):
    """Look up the portfolio entry for the current ``model_key`` parameter.

    DEPRECATED: new tests should use ``text_model_info`` or
    ``vision_model_info``.

    Usage:
        def test_something(model_info):
            model_id = model_info["id"]
            ram_needed = model_info["ram_needed_gb"]

    Returns:
        Dict[str, Any]: the ``portfolio_models`` entry stored under
        ``model_key`` (fields ``id``, ``ram_needed_gb``, ``expected_issue``,
        ``description``).

    Raises:
        KeyError: if ``model_key`` is not a key of ``portfolio_models``.
    """
    selected_entry = portfolio_models[model_key]
    return selected_entry
|
|
|
|
|
|
@pytest.fixture
def text_model_info(text_portfolio, text_model_key):
    """Look up the text portfolio entry for the current ``text_model_key``.

    Usage:
        def test_something(text_model_info):
            model_id = text_model_info["id"]
            ram_needed = text_model_info["ram_needed_gb"]

    Returns:
        Dict[str, Any]: the ``text_portfolio`` entry stored under
        ``text_model_key`` (fields ``id``, ``ram_needed_gb``,
        ``expected_issue``, ``description``).

    Raises:
        KeyError: if ``text_model_key`` is not a key of ``text_portfolio``.
    """
    selected_entry = text_portfolio[text_model_key]
    return selected_entry
|
|
|
|
|
|
@pytest.fixture
def vision_model_info(vision_portfolio, vision_model_key):
    """Look up the vision portfolio entry for the current ``vision_model_key``.

    Usage:
        def test_something(vision_model_info):
            model_id = vision_model_info["id"]
            ram_needed = vision_model_info["ram_needed_gb"]

    Returns:
        Dict[str, Any] | None: the ``vision_portfolio`` entry stored under
        ``vision_model_key`` (fields ``id``, ``ram_needed_gb``,
        ``expected_issue``, ``description``), or ``None`` for placeholder
        keys (``_skipped``, ``_no_vision_models``).

    Raises:
        KeyError: if a non-placeholder key is missing from ``vision_portfolio``.
    """
    # Placeholder keys are generated at collection time when discovery is
    # skipped or finds nothing; they never exist in the portfolio. Return
    # None for them (same convention as audio_model_info) instead of raising.
    if vision_model_key.startswith("_"):
        return None
    return vision_portfolio[vision_model_key]
|
|
|
|
|
|
@pytest.fixture
def audio_model_info(audio_portfolio, audio_model_key):
    """Look up the audio portfolio entry for the current ``audio_model_key`` (ADR-020).

    Usage:
        def test_something(audio_model_info):
            model_id = audio_model_info["id"]
            ram_needed = audio_model_info["ram_needed_gb"]

    Returns:
        Dict[str, Any] | None: the ``audio_portfolio`` entry stored under
        ``audio_model_key`` (fields ``id``, ``ram_needed_gb``,
        ``expected_issue``, ``description``), or ``None`` when the key starts
        with ``"_"`` (placeholders such as ``_skipped`` / ``_no_audio_models``).
    """
    is_placeholder = audio_model_key.startswith("_")
    return None if is_placeholder else audio_portfolio[audio_model_key]
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _auto_report_vision_model(request):
    """Attach vision model metadata to the test's ``user_properties`` (autouse).

    Lets vision tests show up with model annotations in benchmark reports
    without an explicit ``report_benchmark()`` call.

    Two kinds of tests are handled:

    1. Tests parametrized with ``vision_model_key``: the entry is taken from
       the ``vision_model_info`` fixture and a ``("model", {...})`` property
       is appended. ``ram_needed_gb`` is reported as ``size_gb`` unchanged
       (rounded to two decimals unless it is infinite).
    2. Tests whose node id contains ``test_vision_e2e_live.py``: a fixed
       ``pixtral-12b-8bit`` model record plus
       ``("inference_modality", "vision")`` are appended.

    Reporting is best effort: if the model info cannot be resolved, nothing
    is appended and the test proceeds.
    """
    # Type 1: parametrized vision API tests (vision_model_key)
    if "vision_model_key" in request.fixturenames:
        try:
            vision_model_info = request.getfixturevalue("vision_model_info")
        except Exception:
            # Placeholder keys / lookup failures: skip reporting. Only
            # Exception is caught so KeyboardInterrupt and pytest outcome
            # exceptions (skip/fail) still propagate.
            return

        if not vision_model_info:
            return

        model_id = vision_model_info["id"]
        family, variant = _parse_model_family(model_id)

        # For vision entries ram_needed_gb is used directly as the size
        # (no overhead factor is removed). Infinity is passed through as-is.
        size_gb = vision_model_info["ram_needed_gb"]
        if size_gb != float("inf"):
            size_gb = round(size_gb, 2)

        # Append to user_properties for benchmark reporting (schema v0.2.0)
        request.node.user_properties.append(("model", {
            "id": model_id,
            "size_gb": size_gb,
            "family": family,
            "variant": variant,
        }))
        return

    # Type 2: CLI vision tests (test_vision_e2e_live.py); these do not use
    # the vision_model_key parameter, so a fixed record is reported.
    if "test_vision_e2e_live.py" in request.node.nodeid:
        request.node.user_properties.append(("model", {
            "id": "pixtral-12b-8bit",  # Explicit model (not shorthand)
            "size_gb": 13.5,  # Actual disk size of 8bit variant
            "family": "pixtral",
            "variant": "12b-8bit",
        }))
        # Explicit inference_modality for CLI vision tests (v0.2.1),
        # required because these tests don't use vision_model_key.
        request.node.user_properties.append(("inference_modality", "vision"))
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _auto_report_audio_model(request):
    """Attach audio model metadata to the test's ``user_properties`` (autouse, ADR-020).

    Only acts on tests parametrized with ``audio_model_key``. The entry is
    taken from the ``audio_model_info`` fixture; a ``("model", {...})``
    property and ``("inference_modality", "audio")`` are appended.
    ``ram_needed_gb`` is reported as ``size_gb`` unchanged (rounded to two
    decimals unless it is infinite).

    Reporting is best effort: if the model info cannot be resolved or is
    ``None`` (placeholder keys), nothing is appended.
    """
    if "audio_model_key" not in request.fixturenames:
        return

    try:
        audio_model_info = request.getfixturevalue("audio_model_info")
    except Exception:
        # Lookup failures: skip reporting. Only Exception is caught so
        # KeyboardInterrupt and pytest outcome exceptions still propagate.
        return

    if not audio_model_info:
        return

    model_id = audio_model_info["id"]
    family, variant = _parse_model_family(model_id)

    # For audio entries ram_needed_gb is used directly as the size
    # (no overhead factor is removed). Infinity is passed through as-is.
    size_gb = audio_model_info["ram_needed_gb"]
    if size_gb != float("inf"):
        size_gb = round(size_gb, 2)

    # Append to user_properties for benchmark reporting (schema v0.2.2)
    request.node.user_properties.append(("model", {
        "id": model_id,
        "size_gb": size_gb,
        "family": family,
        "variant": variant,
    }))

    # Explicit inference_modality for audio tests (v0.2.1+)
    request.node.user_properties.append(("inference_modality", "audio"))
|
|
|
|
|
|
# Quantization / precision tags that are removed from variant names so that
# differently-quantized builds of one model share the same variant string.
_QUANT_TAGS = ("-2bit", "-3bit", "-4bit", "-5bit", "-6bit", "-8bit", "-fp16", "-bf16")


def _strip_quant_tags(name: str) -> str:
    """Return *name* with every occurrence of a known quantization tag removed."""
    for tag in _QUANT_TAGS:
        name = name.replace(tag, "")
    return name


def _parse_model_family(model_id: str) -> tuple[str, str]:
    """Extract model family and variant from a HuggingFace model ID.

    The part after the last ``/`` is lower-cased and matched against known
    family names by substring, in this order: llama, qwen, phi, deepseek,
    mistral/mixtral, whisper, pixtral. Quantization/precision tags
    (``-4bit``, ``-8bit``, ``-6bit``, ``-fp16``, ``-bf16``, ...) are removed
    from the variant uniformly for every family.

    NOTE(review): because matching is by substring in a fixed order, names
    containing several family words (e.g. a DeepSeek distill of Qwen) are
    attributed to the first family in the list - confirm this is intended.

    Examples:
        "mlx-community/Llama-3.2-3B-Instruct-4bit" -> ("llama", "3.2-3b-instruct")
        "mlx-community/Qwen2.5-7B-Instruct-4bit"   -> ("qwen", "2.5-7b-instruct")
        "mlx-community/phi-3-mini-4k-instruct"     -> ("phi-3", "mini-4k-instruct")

    Args:
        model_id: HuggingFace model ID (``org/name`` or bare ``name``).

    Returns:
        ``(family, variant)``. ``("unknown", <name without quantization
        tags>)`` when no known family matches.
    """
    model_name = model_id.split("/")[-1].lower()

    if "llama" in model_name:
        # Variant is everything after the first "llama-"; without that
        # separator the whole name is kept.
        _, separator, tail = model_name.partition("llama-")
        variant = tail if separator else model_name
        return "llama", _strip_quant_tags(variant)

    if "qwen" in model_name:
        # Variant is everything after the first "qwen" (version digits included).
        variant = model_name.partition("qwen")[2]
        return "qwen", _strip_quant_tags(variant)

    if "phi" in model_name:
        # Most specific version first so "phi-3.5" is not reported as "phi-3".
        for version in ("phi-3.5", "phi-3", "phi-2"):
            if version in model_name:
                _, separator, tail = model_name.partition(f"{version}-")
                variant = tail if separator else "base"
                return version, _strip_quant_tags(variant)
        return "phi", _strip_quant_tags(model_name)

    if "deepseek" in model_name:
        return "deepseek", _strip_quant_tags(model_name.replace("deepseek-", ""))

    if "mistral" in model_name or "mixtral" in model_name:
        family = "mistral" if "mistral" in model_name else "mixtral"
        return family, _strip_quant_tags(model_name.replace(f"{family}-", ""))

    if "whisper" in model_name:
        return "whisper", _strip_quant_tags(model_name.replace("whisper-", ""))

    if "pixtral" in model_name:
        return "pixtral", _strip_quant_tags(model_name.replace("pixtral-", ""))

    # Fallback: unknown family
    return "unknown", _strip_quant_tags(model_name)
|
|
|
|
|
|
@pytest.fixture
def report_benchmark(request):
    """Return a helper that writes benchmark data to the test report (ADR-013 Phase 0).

    The helper appends entries to ``request.node.user_properties``. The model
    entry is resolved from the first of ``text_model_info``,
    ``vision_model_info`` or ``model_info`` (deprecated) that can be obtained
    for the current test and is not ``None``. When none is available the
    helper returns without recording anything (including ``performance``,
    ``stop_tokens`` and extras).

    Usage:
        def test_something(report_benchmark, text_model_info):
            # Report model info only
            report_benchmark()

            # Report with performance metrics
            report_benchmark(performance={
                "tokens_per_sec": 45.2,
                "ram_peak_mb": 3200,
                "prompt_tokens": 15,
                "completion_tokens": 42
            })

            # Report with stop token data
            report_benchmark(stop_tokens={
                "configured": ["<|end|>"],
                "detected": ["<|end|>"],
                "workaround": "none",
                "leaked": False
            })

    Helper args:
        performance: Optional performance metrics dict (skipped when empty).
        stop_tokens: Optional stop token validation data (skipped when empty).
        **extra: Additional ``key=value`` pairs, each appended as its own
            user property.
    """
    def _report(performance: Dict[str, Any] = None, stop_tokens: Dict[str, Any] = None, **extra):
        # Resolve model info from whichever portfolio fixture this test has.
        model_info = None
        source_fixture = None
        for fixture_name in ("text_model_info", "vision_model_info", "model_info"):
            try:
                candidate = request.getfixturevalue(fixture_name)
            except Exception:
                # Fixture not available for this test (or lookup failed);
                # try the next one. KeyboardInterrupt etc. still propagate.
                continue
            if candidate is not None:
                model_info, source_fixture = candidate, fixture_name
                break

        if model_info is None:
            # No model info available (non-parametrized test)
            return

        model_id = model_info["id"]
        family, variant = _parse_model_family(model_id)

        # Convert the RAM estimate to a disk size:
        # - vision entries: ram_needed_gb already IS the disk size
        # - text / legacy entries: ram_needed_gb carries a 1.2x overhead
        # - infinity is passed through unchanged
        ram_gb = model_info["ram_needed_gb"]
        if ram_gb == float("inf") or source_fixture == "vision_model_info":
            disk_size_gb = ram_gb
        else:
            disk_size_gb = ram_gb / 1.2

        request.node.user_properties.append(("model", {
            "id": model_id,
            "size_gb": round(disk_size_gb, 2) if disk_size_gb != float("inf") else disk_size_gb,
            "family": family,
            "variant": variant,
        }))

        if performance:
            request.node.user_properties.append(("performance", performance))

        if stop_tokens:
            request.node.user_properties.append(("stop_tokens", stop_tokens))

        # Extra metadata: one user property per keyword argument.
        for key, value in extra.items():
            request.node.user_properties.append((key, value))

    return _report
|
|
|
|
|
|
# ============================================================================
|
|
# Precise Test Timing - For Effective Runtime Analysis
|
|
# ============================================================================
|
|
|
|
# StashKeys for test timing (pytest 7.0+ API)
|
|
# Stash key for the float timestamp written by pytest_runtest_setup.
test_start_key = pytest.StashKey[float]()
# Stash key for the float timestamp written by pytest_runtest_teardown.
test_end_key = pytest.StashKey[float]()
|
|
|
|
|
|
@pytest.hookimpl(tryfirst=True)
def pytest_runtest_setup(item):
    """Hook: record the test's start time (Unix epoch seconds) in the item stash.

    The value is stored under ``test_start_key`` so that the makereport hook
    can later copy it into the benchmark report, where it is used to
    correlate test execution with memmon samples.
    """
    started_at = time.time()
    item.stash[test_start_key] = started_at
|
|
|
|
|
|
@pytest.hookimpl(trylast=True)
def pytest_runtest_teardown(item):
    """Hook: record the test's end time (Unix epoch seconds) in the item stash.

    The value is stored under ``test_end_key``; together with the start
    timestamp it gives a duration measured independently of pytest's own
    duration calculation.
    """
    finished_at = time.time()
    item.stash[test_end_key] = finished_at
|
|
|
|
|
|
@pytest.hookimpl(tryfirst=True)
def pytest_runtest_makereport(item, call):
    """Hook: add precise timestamps to the benchmark report (Schema v0.2.2).

    For the ``call`` phase only, appends ``test_start_ts`` and
    ``test_end_ts`` (Unix epoch seconds) to ``item.user_properties`` so they
    are included in the benchmark JSONL output and can be correlated with
    memmon samples.

    ``test_start_ts`` comes from the stash (written by the setup hook).
    ``test_end_ts`` is taken from the stash when present; otherwise it falls
    back to ``call.stop``. The fallback is the normal case: pytest builds the
    ``call``-phase report before it runs the teardown hook, so the teardown
    timestamp does not exist yet at this point.

    Uses ``tryfirst=True`` so the properties are appended before other
    makereport implementations build or write the report.

    Always returns ``None`` so other makereport implementations still run.
    """
    if call.when != "call":
        # Setup/teardown phase reports carry no timing properties.
        return

    test_start_ts = item.stash.get(test_start_key, None)
    test_end_ts = item.stash.get(test_end_key, None)
    if test_end_ts is None:
        # Teardown has not run yet for this item; use the end of the call phase.
        test_end_ts = getattr(call, "stop", None)

    # Compare against None rather than relying on float truthiness.
    if test_start_ts is None or test_end_ts is None:
        return

    item.user_properties.append(("test_start_ts", test_start_ts))
    item.user_properties.append(("test_end_ts", test_end_ts))