Files
mlx-knife/tests_2.0/test_stop_tokens_live.py
T
The BROKE Cluster Team bf7480d042 Release 2.0.4-beta.9: Audio transcription via mlx-audio
Major Features:
- Audio transcription via mlx-audio backend (Whisper, >10min duration)
- OpenAI /v1/audio/transcriptions endpoint
- Memory Gate System (Vision: 8GB, Audio: 4GB)
- Config-based backend routing (ADR-020)
- Benchmark toolchain (memmon/memplot, Schema v0.2.2)

Key Fixes:
- EuroLLM tokenizer decoding
- Vision-model text-only routing regression
- Multimodal model context length detection
- Memory cleanup bug (mx.metal.clear_cache)
- Orphan process bug

Test Results:
- Unit tests: 647 passed, 11 skipped (Python 3.10-3.12)
- wet-umbrella: 171 passed total

See CHANGELOG.md for complete details and known issues.
2026-02-04 03:10:30 +01:00

821 lines
32 KiB
Python

"""Real-model stop token detection tests for Issue #32 (ADR-009).
This test suite validates stop token handling with real TEXT models that exhibit
known issues:
- MXFP4: Visible `<|end|>` tokens in output
- Qwen 2.5: Self-conversation (chat template role markers)
- Llama 3.2: Control baseline (should work correctly)
Test Strategy (ADR-009):
1. Phase 1: Baseline measurement (document broken behavior)
2. Phase 2: Fix validation (verify 2-LOC fix works)
3. Phase 3: Empirical mapping (document tokenizer configs)
Portfolio Discovery:
- Auto-discovers MLX TEXT chat models only (excludes Vision "chat+vision")
- Uses MLXRunner (mlx-lm) which cannot load Vision models (mllama etc.)
- See Portfolio Separation: live/test_utils.py for separated text/vision portfolios
Opt-in via: pytest -m live_stop_tokens
Requires: HF_HOME set to SSD cache (CoW same-volume requirement, ADR-007)
RAM Safety:
- Tests automatically skip models that exceed available RAM
- Progressive budget scaling: 40% (16GB), 50% (32GB), 60% (64GB), 70% (96GB+)
- Larger systems have lower relative overhead, enabling better RAM utilization
- See TESTING-DETAILS.md: "RAM-Aware Model Selection Strategy"
"""
from __future__ import annotations
import os
import sys
import pytest
import json
import subprocess
from pathlib import Path
from typing import Dict, Any, Optional
import importlib
import importlib.util
# Portfolio Separation reference: live/test_utils.py provides discover_text_models()
# but we can't use it here due to circular import (test_utils imports from this file).
# discover_mlx_models_in_user_cache() in this file returns text AND vision chat models;
# the portfolio_models fixture and the pytest_generate_tests hook then drop every model
# whose `capabilities` (from `mlxk list --json`) contain "vision".
# Opt-in markers for live tests.
# CRITICAL: Must include `live` marker so -m "not live" excludes these tests
pytestmark = [pytest.mark.live, pytest.mark.live_stop_tokens, pytest.mark.slow]
@pytest.fixture(scope="module", autouse=True)
def _use_real_mlx_modules():
    """Swap the unit-test stubs for the real mlx / mlx-lm stack while this module runs.

    Setup removes the ``stubs`` directory from ``sys.path`` and evicts
    ``transformers``, ``huggingface_hub`` and every module loaded from the stub
    directory out of ``sys.modules``.  If the real runtime cannot be found, the
    evicted modules and the stub path are put back and the tests are skipped.
    Teardown always reinstates the evicted modules and the stub path so the
    remainder of the suite keeps using the stubs.
    """
    stub_path_str = str(Path(__file__).parent / "stubs")

    # Drop the stub directory from sys.path (if present); remember to restore it.
    path_removed = stub_path_str in sys.path
    if path_removed:
        sys.path = [p for p in sys.path if p != stub_path_str]

    def _evict(should_evict) -> Dict[str, Any]:
        """Pop every sys.modules entry accepted by *should_evict*; return what was popped."""
        evicted: Dict[str, Any] = {}
        for module_name, module in list(sys.modules.items()):
            if should_evict(module_name, module):
                evicted[module_name] = module
                sys.modules.pop(module_name, None)
        return evicted

    def _is_package(module_name: str, package: str) -> bool:
        """True for *package* itself and for any of its submodules."""
        return module_name == package or module_name.startswith(package + ".")

    def _loaded_from_stubs(module: Any) -> bool:
        """True when the module's __file__ lies inside the stub directory."""
        module_file = getattr(module, "__file__", "") or ""
        return bool(module_file) and stub_path_str in module_file

    # Order matters: transformers first (it depends on huggingface_hub and uses
    # lazy imports), then huggingface_hub shims, and only then the pass that
    # calls getattr() on module objects, so no lazy import is triggered.
    removed_transformers_modules = _evict(
        lambda name, _module: _is_package(name, "transformers")
    )
    removed_hf_modules = _evict(
        lambda name, _module: _is_package(name, "huggingface_hub")
    )
    removed_modules = _evict(lambda _name, module: _loaded_from_stubs(module))

    def _restore_stub_environment() -> None:
        """Reinstate every evicted module (overwriting real ones) and the stub path."""
        for saved in (removed_modules, removed_hf_modules, removed_transformers_modules):
            sys.modules.update(saved)
        if path_removed and stub_path_str not in sys.path:
            sys.path.insert(0, stub_path_str)

    runtime_missing_reason = "requires mlx / mlx-lm native runtime (Apple Silicon)"
    skip_reason: Optional[str] = None

    # find_spec() on a dotted name imports the parent package and raises
    # ModuleNotFoundError when that parent is absent, so an uninstalled `mlx`
    # must be treated as "runtime missing" rather than crash the fixture.
    try:
        runtime_found = (
            importlib.util.find_spec("mlx.core") is not None
            and importlib.util.find_spec("mlx_lm") is not None
        )
    except (ImportError, ValueError):
        runtime_found = False

    if not runtime_found:
        skip_reason = runtime_missing_reason
    else:
        try:
            huggingface_hub = importlib.import_module("huggingface_hub")
        except ImportError:
            skip_reason = runtime_missing_reason
        else:
            if not hasattr(huggingface_hub, "snapshot_download"):
                skip_reason = (
                    "requires huggingface_hub.snapshot_download (install latest huggingface-hub)"
                )

    if skip_reason is not None:
        # Restore previous state before skipping so rest of suite still uses stubs.
        _restore_stub_environment()
        pytest.skip(skip_reason, allow_module_level=True)

    try:
        yield
    finally:
        # Restore stub modules and the stub path for the remainder of the test run.
        _restore_stub_environment()
# HF_HOME is optional: Portfolio Discovery uses it if set, falls back to hardcoded TEST_MODELS
# Snapshot of the variable taken once at import time (None when unset).
# NOTE(review): no other reference to _HF_HOME is visible in this part of the file - confirm it is still used.
_HF_HOME = os.environ.get("HF_HOME")
def get_system_ram_gb() -> float:
    """Return the installed system RAM in GiB as reported by ``sysctl hw.memsize``.

    Any failure (command missing, non-zero exit, unparsable output) yields the
    conservative default of 16.0.
    """
    command = ["sysctl", "hw.memsize"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
        )
        # stdout looks like "hw.memsize: 68719476736"; the byte count follows the colon
        fields = completed.stdout.strip().split(":")
        byte_count = int(fields[1].strip())
    except Exception:
        # Fallback: assume minimum safe config (16GB)
        return 16.0
    return byte_count / (1024**3)
def get_safe_ram_budget_gb() -> float:
    """Return the RAM budget (GiB) a single model may occupy on this machine.

    The budget is a share of the detected system RAM that grows with the
    machine size:

    - >= 96 GB: 70%
    - >= 64 GB: 60%
    - >= 32 GB: 50%
    - otherwise: 40% (conservative, e.g. 16 GB machines)

    Rationale (from the original notes): OS baseline overhead is roughly
    constant, so larger machines can give a bigger share to the model.
    """
    total_gb = get_system_ram_gb()
    # (minimum system RAM in GB, budget share), checked from largest to smallest
    tiers = ((96, 0.70), (64, 0.60), (32, 0.50))
    share = next((ratio for floor, ratio in tiers if total_gb >= floor), 0.40)
    return total_gb * share
def discover_mlx_models_in_user_cache() -> list[Dict[str, Any]]:
    """Discover MLX chat models via ``mlxk list --json`` (production command).

    Uses the production CLI instead of duplicating cache scanning logic.
    A listed model is kept when all of the following hold:

    - ``framework`` is ``"MLX"``
    - ``health`` is ``"healthy"``
    - ``runtime_compatible`` is ``True``
    - ``model_type`` is ``"chat"`` (this covers text AND vision chat models)
    - its resolved name is not in ``KNOWN_BROKEN_MODELS``
    - its cache directory exists

    Note: both text and vision models are returned; callers filter by
    capabilities when they need text-only models.

    Returns:
        List of dicts with keys ``model_id``, ``ram_needed_gb`` (size_bytes in
        GiB times 1.2, or 0 when the size is unknown), ``snapshot_path`` and
        ``weight_count`` (both always ``None``).  An empty list is returned
        when HF_HOME is unset/empty, when the CLI exits non-zero, or on any
        error while running the CLI or processing its output.
    """
    # mlxk list requires HF_HOME; check first so the unset case costs nothing
    # and does not depend on the project imports below.
    env = os.environ.copy()
    if not env.get("HF_HOME"):
        return []

    from mlxk2.core.cache import get_current_model_cache, hf_to_cache_dir
    from mlxk2.core.model_resolution import resolve_model_for_operation

    # Import blacklist (local import to avoid circular dependency).
    # KNOWN_BROKEN_MODELS is defined in tests_2.0/live/test_utils.py
    live_dir = str(Path(__file__).parent / "live")
    sys.path.insert(0, live_dir)
    try:
        from test_utils import KNOWN_BROKEN_MODELS
    except ImportError:
        KNOWN_BROKEN_MODELS = set()  # Fallback if import fails
    finally:
        # Always undo the temporary sys.path entry, also when the import failed.
        try:
            sys.path.remove(live_dir)
        except ValueError:
            pass

    try:
        # Call production mlxk list command
        result = subprocess.run(
            [sys.executable, "-m", "mlxk2.cli", "list", "--json"],
            capture_output=True,
            text=True,
            timeout=30,
            env=env,  # Pass environment with HF_HOME
        )
        if result.returncode != 0:
            return []

        # Parse JSON response (docs/json-api-schema.json) and take the models array
        models = json.loads(result.stdout).get("data", {}).get("models", [])

        discovered = []
        for model in models:
            # Filter: MLX + healthy + runtime_compatible + chat (TEXT + VISION)
            eligible = (
                model.get("framework") == "MLX"
                and model.get("health") == "healthy"
                and model.get("runtime_compatible") is True
                and model.get("model_type") == "chat"
            )
            if not eligible:
                continue

            # RAM estimation: size_bytes * 1.2 overhead
            size_bytes = model.get("size_bytes", 0)
            ram_gb = (size_bytes / (1024**3)) * 1.2 if size_bytes else 0

            # Resolve to canonical cache name to avoid 404 during preload;
            # keep the listed name when resolution fails.
            model_name = model["name"]
            try:
                resolved_name, _, _ = resolve_model_for_operation(model_name)
                if resolved_name:
                    model_name = resolved_name
            except Exception:
                pass

            # Exclude known broken models (upstream runtime bugs)
            if model_name in KNOWN_BROKEN_MODELS:
                continue

            # Ensure cache directory exists (defensive against stale listings)
            try:
                cache_dir = get_current_model_cache() / hf_to_cache_dir(model_name)
                if not cache_dir.exists():
                    continue
            except Exception:
                continue

            discovered.append({
                "model_id": model_name,  # Canonical model ID
                "ram_needed_gb": ram_gb,
                "snapshot_path": None,  # Not provided by list, not needed
                "weight_count": None,  # Not provided by list, not needed
            })
        return discovered
    except Exception:
        # Robust: return empty list on any error (keeps tests runnable)
        return []
# Test models from ADR-009 with RAM requirements
# RAM estimates from TESTING-DETAILS.md: "RAM-Aware Model Selection Strategy"
# Each entry maps a short key to:
#   id             - model identifier handed to MLXRunner
#   expected_issue - label of the historically observed problem, None for the control model
#   description    - human-readable summary
#   ram_needed_gb  - value compared against the RAM budget in should_skip_model()
TEST_MODELS = {
"mxfp4": {
"id": "mlx-community/gpt-oss-20b-MXFP4-Q8",
"expected_issue": "visible_end_token",
"description": "MXFP4 format with visible <|end|> in output",
"ram_needed_gb": 12.0 # 20B MXFP4 (~12GB empirical)
},
"qwen25": {
"id": "mlx-community/Qwen2.5-0.5B-Instruct-4bit",
"expected_issue": "self_conversation",
"description": "Qwen 2.5 generates chat template markers",
"ram_needed_gb": 1.0 # 0.5B 4-bit (~1GB)
},
"llama32": {
"id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
"expected_issue": None,
"description": "Control baseline (should work correctly)",
"ram_needed_gb": 4.0 # 3B 4-bit (~4GB)
}
}
@pytest.fixture(scope="module")
def portfolio_models():
    """Dynamic TEXT model portfolio: discovered models OR hardcoded fallback.

    When discovery finds models, vision models (those whose ``capabilities``
    in ``mlxk list --json`` contain ``"vision"``) are filtered out and the
    remaining models are returned in TEST_MODELS format under the keys
    ``discovered_00``, ``discovered_01``, ...  If the second CLI call fails or
    exits non-zero, no filtering is applied.  When discovery finds nothing,
    the hardcoded TEST_MODELS dict is returned.

    The key numbering mirrors the one produced by ``pytest_generate_tests``;
    keep the filtering in both places identical.
    """
    all_models = discover_mlx_models_in_user_cache()  # Returns TEXT + VISION
    if not all_models:
        # Fallback to hardcoded test models
        print(f"\n📋 Using hardcoded TEST_MODELS ({len(TEST_MODELS)} models)")
        return TEST_MODELS

    # Filter to TEXT-only models; any failure keeps the unfiltered list.
    text_models = all_models
    env = os.environ.copy()
    if env.get("HF_HOME"):
        try:
            result_data = subprocess.run(
                [sys.executable, "-m", "mlxk2.cli", "list", "--json"],
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            if result_data.returncode == 0:
                data = json.loads(result_data.stdout)
                models_list = data.get("data", {}).get("models", [])
                # Build set of vision model IDs
                vision_ids = {
                    m["name"] for m in models_list if "vision" in m.get("capabilities", [])
                }
                # Filter out vision models
                text_models = [m for m in all_models if m["model_id"] not in vision_ids]
        except Exception:
            text_models = all_models  # Fallback: include all

    # Convert discovered TEXT models to TEST_MODELS format
    result = {}
    for i, model in enumerate(text_models):
        # weight_count is present but None for discovered models, so a plain
        # .get() default would never apply; fall back explicitly.
        weight_count = model.get("weight_count")
        if weight_count is None:
            weight_count = "unknown"
        result[f"discovered_{i:02d}"] = {
            "id": model["model_id"],
            "ram_needed_gb": model["ram_needed_gb"],
            "expected_issue": None,  # Unknown for discovered models
            "description": f"Discovered: {model['model_id']} ({weight_count} weights)",
        }
    print(f"\n🔍 Portfolio Discovery: Found {len(result)} MLX TEXT models in cache")
    return result
def should_skip_model(model_key: str, models_dict: Dict[str, Any] = None) -> tuple[bool, str]:
    """Decide whether a model must be skipped because it exceeds the RAM budget.

    Args:
        model_key: Key into the models dictionary.
        models_dict: Models dictionary to look the key up in; TEST_MODELS when None.

    Returns:
        ``(True, reason)`` when the model's ``ram_needed_gb`` is larger than the
        safe RAM budget, otherwise ``(False, "")``.
    """
    catalog = TEST_MODELS if models_dict is None else models_dict
    required_gb = catalog[model_key]["ram_needed_gb"]
    budget_gb = get_safe_ram_budget_gb()
    total_gb = get_system_ram_gb()

    if required_gb > budget_gb:
        if total_gb > 0:
            percent = int(budget_gb / total_gb * 100)
        else:
            percent = 40
        reason = (
            f"Model requires {required_gb}GB but only {budget_gb:.1f}GB available "
            f"({percent}% of {total_gb:.0f}GB system RAM). See TESTING-DETAILS.md RAM-Aware Model Selection."
        )
        return (True, reason)
    return (False, "")
# Standard test prompt (simple, predictable); used by every live test in this module
TEST_PROMPT = "Write one sentence about cats."
# Generation cap passed as max_tokens to runner.generate_batch()
MAX_TOKENS = 50
def pytest_generate_tests(metafunc):
    """Parametrize ``test_empirical_mapping_single_model`` with one key per model.

    Runs at collection time.  For the target test it discovers the cached
    models, drops vision models the same way the ``portfolio_models`` fixture
    does, and generates the keys ``discovered_00``, ``discovered_01``, ...
    When nothing is discovered, the keys of TEST_MODELS are used.  Every other
    test function is left untouched.

    Background (Session 56): one parametrized test per model supports running
    each model in its own pytest process, avoiding memory accumulation across
    many sequential model loads.
    """
    if metafunc.function.__name__ != "test_empirical_mapping_single_model":
        return

    # Lightweight discovery for parametrization (same logic as portfolio_models fixture)
    from live.test_utils import discover_mlx_models_in_user_cache

    all_models = discover_mlx_models_in_user_cache()
    if all_models:
        import json

        # Start from the unfiltered list; replaced only when the listing succeeds.
        candidates = all_models
        env = os.environ.copy()
        if env.get("HF_HOME"):
            try:
                listing = subprocess.run(
                    [sys.executable, "-m", "mlxk2.cli", "list", "--json"],
                    capture_output=True,
                    text=True,
                    timeout=30,
                    env=env,
                )
                if listing.returncode == 0:
                    payload = json.loads(listing.stdout)
                    listed = payload.get("data", {}).get("models", [])
                    vision_ids = {
                        entry["name"]
                        for entry in listed
                        if "vision" in entry.get("capabilities", [])
                    }
                    candidates = [
                        entry for entry in all_models if entry["model_id"] not in vision_ids
                    ]
            except Exception:
                candidates = all_models
        # Generate model keys (discovered_00, discovered_01, ...)
        model_keys = [f"discovered_{index:02d}" for index, _ in enumerate(candidates)]
    else:
        # Fallback to hardcoded test models
        model_keys = list(TEST_MODELS)

    # Each key becomes a separate test, e.g. test_empirical_mapping_single_model[discovered_00]
    metafunc.parametrize("model_key_param", model_keys, ids=lambda key: key)
class TestStopTokensValidation:
    """Validation: Verify stop token handling works correctly (Issue #32, ADR-009)."""

    @staticmethod
    def _require_opt_in(request):
        """Skip unless the run was started with ``-m live_stop_tokens``.

        NOTE: these tests are excluded from ``-m wet`` to prevent a nanobind
        crash (MLX re-import issue).
        """
        marker_expr = request.config.getoption("-m") or ""
        if "live_stop_tokens" not in marker_expr:
            pytest.skip("Run with -m live_stop_tokens to enable live model tests")

    @staticmethod
    def _require_ram(model_key):
        """Skip when the hardcoded model exceeds the safe RAM budget."""
        over_budget, why = should_skip_model(model_key)
        if over_budget:
            pytest.skip(why)

    @staticmethod
    def _generate_and_report(runner_cls, model_id, title):
        """Generate one completion for TEST_PROMPT, print a report header, return the text."""
        with runner_cls(model_id) as runner:
            output = runner.generate_batch(
                prompt=TEST_PROMPT,
                max_tokens=MAX_TOKENS,
            )
        rule = "=" * 60
        print(f"\n{rule}")
        print(f"VALIDATION: {title}")
        print(rule)
        print(f"Model: {model_id}")
        print(f"Prompt: {TEST_PROMPT}")
        print(f"Output: {output!r}")
        return output

    @pytest.mark.live_stop_tokens
    def test_mxfp4_stop_token_filtering(self, request):
        """MXFP4: stop tokens must not be visible in the generated text.

        Background (Issue #32): MXFP4 previously showed visible ``<|end|>``
        tokens because the runner only checked the singular eos_token_id.
        The ADR-009 fix switches to the eos_token_ids set, so neither
        ``<|end|>`` nor ``<|return|>`` may appear in the output.
        """
        self._require_opt_in(request)
        self._require_ram("mxfp4")

        from mlxk2.core.runner import MLXRunner

        model_id = TEST_MODELS["mxfp4"]["id"]
        output = self._generate_and_report(MLXRunner, model_id, "MXFP4")

        # Assert no visible stop tokens
        assert "<|end|>" not in output, "MXFP4 should filter <|end|> token"
        assert "<|return|>" not in output, "MXFP4 should filter <|return|> token"
        print("✓ MXFP4: Stop tokens correctly filtered")

    @pytest.mark.live_stop_tokens
    def test_qwen25_no_self_conversation(self, request):
        """Qwen 2.5: the output must not contain chat-template role markers.

        Self-conversation (ADR-009) means the model keeps generating role
        markers such as ``\\nUser:`` / ``\\nAssistant:`` or the Qwen-specific
        ``<|im_start|>`` / ``<|im_end|>`` tokens instead of stopping after
        its own response.
        """
        self._require_opt_in(request)
        self._require_ram("qwen25")

        from mlxk2.core.runner import MLXRunner

        model_id = TEST_MODELS["qwen25"]["id"]
        output = self._generate_and_report(MLXRunner, model_id, "Qwen 2.5")

        # Check for self-conversation patterns
        generic_markers = ["\nUser:", "\nAssistant:", "\nHuman:", "\nAI:"]
        qwen_markers = ["<|im_start|>user", "<|im_start|>assistant", "<|im_start|>", "<|im_end|>"]
        found_generic = [marker for marker in generic_markers if marker in output]
        found_qwen = [marker for marker in qwen_markers if marker in output]
        print(f"Generic markers found: {found_generic}")
        print(f"Qwen markers found: {found_qwen}")

        # Assert no self-conversation
        assert not found_generic, f"Qwen 2.5 should not generate generic chat markers. Found: {found_generic}"
        assert not found_qwen, f"Qwen 2.5 should not generate Qwen-specific markers. Found: {found_qwen}"
        print("✓ Qwen 2.5: No self-conversation")

    @pytest.mark.live_stop_tokens
    def test_llama32_regression_control(self, request):
        """Llama 3.2: regression control for multi-EOS handling.

        Llama 3.2 has 3 eos_token_ids: [128008, 128001, 128009].  The output
        must contain neither visible stop tokens nor generic chat markers.
        Unlike the other validation tests, this one is skipped when the model
        has no snapshot in the local cache.
        """
        self._require_opt_in(request)
        self._require_ram("llama32")

        from mlxk2.core.runner import MLXRunner
        from mlxk2.core.cache import get_current_model_cache, hf_to_cache_dir

        model_id = TEST_MODELS["llama32"]["id"]

        # Check if model exists in cache
        snapshots_dir = get_current_model_cache() / hf_to_cache_dir(model_id) / "snapshots"
        if not snapshots_dir.exists() or not any(snapshots_dir.iterdir()):
            pytest.skip(f"Model not in cache: {model_id}")

        output = self._generate_and_report(
            MLXRunner, model_id, "Llama 3.2 (Regression Control)"
        )

        # Llama 3.2 stop tokens
        llama_stop_tokens = ["<|eot_id|>", "</s>", "<|end_of_text|>"]
        found_stop = [token for token in llama_stop_tokens if token in output]
        assert not found_stop, f"Llama 3.2 should filter stop tokens. Found: {found_stop}"

        # No generic chat markers
        generic_markers = ["\nUser:", "\nAssistant:", "\nHuman:", "\nAI:"]
        found_markers = [marker for marker in generic_markers if marker in output]
        assert not found_markers, f"Llama 3.2 should not self-converse. Found: {found_markers}"
        print("✓ Llama 3.2: Clean output (regression control passed)")
class TestStopTokensEmpiricalMapping:
    """Phase 3: Empirical mapping - document tokenizer configs and observed tokens."""

    @pytest.mark.live_stop_tokens
    def test_empirical_mapping_single_model(self, model_key_param, portfolio_models, request):
        """Record tokenizer EOS config and observed stop tokens for ONE model.

        Architecture decision (Session 56): one model per test so that models
        can run in separate pytest processes, which avoids memory accumulation
        across many sequential model loads.

        The model is looked up in the ``portfolio_models`` fixture.  One JSON
        line is appended to ``stop_token_config_fragments.jsonl``:

        - skipped model: ``model_key``, ``model_id``, ``skipped``, ``skip_reason``
        - executed model: ``model_key``, ``model_id``, ``configured_eos_token``,
          ``configured_eos_token_id``, ``configured_eos_token_ids``,
          ``generated_output`` (first 100 chars), ``visible_stop_tokens``,
          ``workaround_needed``
        - both: ``system_ram_gb``, ``ram_budget_gb``, ``budget_ratio``

        The final report is assembled by ``test_empirical_mapping_generate_report``.
        """
        # Only run when explicitly selected with -m live_stop_tokens
        # NOTE: Excluded from -m wet to prevent nanobind crash (MLX re-import issue)
        marker_expr = request.config.getoption("-m") or ""
        if "live_stop_tokens" not in marker_expr:
            pytest.skip("Run with -m live_stop_tokens to enable portfolio discovery")

        from mlxk2.core.runner import MLXRunner

        # Model key comes from pytest parametrization
        model_key = model_key_param
        model_id = portfolio_models[model_key]["id"]

        system_ram = get_system_ram_gb()
        ram_budget = get_safe_ram_budget_gb()
        budget_ratio = ram_budget / system_ram if system_ram > 0 else 0.40

        # Fields are added in the order they must appear in the JSON line.
        result = {"model_key": model_key, "model_id": model_id}

        over_budget, skip_reason = should_skip_model(model_key, portfolio_models)
        if over_budget:
            # Models over the RAM budget are recorded as skipped, not loaded
            print(f"\nSkipping {model_key}: {skip_reason}")
            result["skipped"] = True
            result["skip_reason"] = skip_reason
        else:
            with MLXRunner(model_id) as runner:
                tokenizer = runner.tokenizer

                # Configured stop tokens
                eos_token = getattr(tokenizer, "eos_token", None)
                eos_token_id = getattr(tokenizer, "eos_token_id", None)

                # eos_token_ids may be a set or list; normalise iterables to a list
                eos_token_ids = None
                if hasattr(tokenizer, "eos_token_ids"):
                    eos_token_ids = tokenizer.eos_token_ids
                    if hasattr(eos_token_ids, "__iter__"):
                        eos_token_ids = list(eos_token_ids)

                # Run inference to observe actual behavior
                output = runner.generate_batch(
                    prompt=TEST_PROMPT,
                    max_tokens=MAX_TOKENS,
                )

            # Detect visible stop tokens
            potential_stop_tokens = ["<|end|>", "<|eot_id|>", "<|im_end|>", "<|endoftext|>"]
            found_stop_tokens = [token for token in potential_stop_tokens if token in output]

            result["configured_eos_token"] = eos_token
            result["configured_eos_token_id"] = eos_token_id
            result["configured_eos_token_ids"] = eos_token_ids
            result["generated_output"] = output[:100]  # First 100 chars for reference
            result["visible_stop_tokens"] = found_stop_tokens
            result["workaround_needed"] = bool(found_stop_tokens)

        result["system_ram_gb"] = round(system_ram, 1)
        result["ram_budget_gb"] = round(ram_budget, 1)
        result["budget_ratio"] = round(budget_ratio, 2)

        # Write JSONL fragment (append mode - each test writes one line)
        fragments_path = Path("stop_token_config_fragments.jsonl")
        with fragments_path.open("a") as handle:
            handle.write(json.dumps(result) + "\n")

        rule = "=" * 60
        print(f"\n{rule}")
        print(f"EMPIRICAL MAPPING: {model_key}")
        print(rule)
        print(json.dumps(result, indent=2))

    @pytest.mark.live_stop_tokens
    def test_empirical_mapping_generate_report(self, request):
        """Aggregate the JSONL fragments into ``stop_token_config_report.json``.

        Intended to run after the single-model tests.  Reads
        ``stop_token_config_fragments.jsonl``, stores the RAM figures of the
        first fragment under ``_system_info``, keys every fragment by its
        ``model_key`` (with the RAM figures removed), writes the report,
        prints a summary and finally deletes the fragments file.  Skipped
        when the fragments file does not exist.
        """
        # Only run when explicitly selected with -m live_stop_tokens
        # NOTE: Excluded from -m wet to prevent nanobind crash (MLX re-import issue)
        marker_expr = request.config.getoption("-m") or ""
        if "live_stop_tokens" not in marker_expr:
            pytest.skip("Run with -m live_stop_tokens to enable portfolio discovery")

        fragments_path = Path("stop_token_config_fragments.jsonl")
        report_path = Path("stop_token_config_report.json")
        if not fragments_path.exists():
            pytest.skip("No fragments found - single-model tests may not have run")

        # Read all JSONL fragments, ignoring blank lines
        with fragments_path.open("r") as handle:
            fragments = [json.loads(line) for line in handle if line.strip()]

        system_fields = ("system_ram_gb", "ram_budget_gb", "budget_ratio")
        report = {}

        # System info is taken from the first fragment
        if fragments:
            head = fragments[0]
            report["_system_info"] = {name: head.get(name, 0) for name in system_fields}

        # Add all model results without the per-fragment system fields
        for fragment in fragments:
            model_key = fragment.pop("model_key")
            for name in system_fields:
                fragment.pop(name, None)
            report[model_key] = fragment

        # Write final JSON report
        rendered = json.dumps(report, indent=2)
        report_path.write_text(rendered)

        rule = "=" * 60
        print(f"\n{rule}")
        print("EMPIRICAL MAPPING REPORT")
        print(rule)
        print(rendered)
        print(f"\nReport saved to: {report_path.absolute()}")

        # Summary
        models_needing_fix = []
        for key, entry in report.items():
            if isinstance(entry, dict) and entry.get("workaround_needed"):
                models_needing_fix.append(key)
        print(f"\nModels needing fix: {models_needing_fix}")

        # Cleanup fragments
        fragments_path.unlink()
        print(f"Cleaned up: {fragments_path}")