Files
mlx-knife/tests_2.0/test_runner_core.py
T
The BROKE Cluster Team dab7ffb6fc fix: P0 bugfixes + test infrastructure + benchmark metadata sync
P0 Bugfixes:
- cache.py: Handle empty HF_HOME strings in get_current_cache_root()
- clone.py: Remove obsolete _validate_same_volume() check
- common.py: Use importlib.metadata instead of importing transformers

Test Infrastructure:
- runner/__init__.py: Replace "mock" fallback with clear RuntimeError
- Fix mock paths in test_runner_core, test_token_limits, etc.
- Add VISION_TEST_MODELS + AUDIO_TEST_MODELS fallbacks
- Portfolio fixtures work with and without HF_HOME

Benchmark Fixes:
- Sort models/tests alphabetically instead of by regression %
- Fix vision metadata drift: pixtral-12b-8bit → pixtral-12b-4bit

Documentation:
- ADR-022: Workspace-First Paradigm (draft)
- ADR-018: Phase 2 details expanded
- TESTING.md/TESTING-DETAILS.md: Fallback docs updated
2026-02-10 15:52:36 +01:00

417 lines
17 KiB
Python

"""
Core MLXRunner tests for 2.0 implementation.
Tests the core model execution engine ported from 1.x.
"""
import pytest
import tempfile
from unittest.mock import Mock, patch
from pathlib import Path
from contextlib import contextmanager
import mlx.core as mx
from mlxk2.core.runner import MLXRunner
class MockDetokenizer:
    """Test double standing in for a BPEStreamingDetokenizer.

    Unit tests assign an instance to ``tokenizer.detokenizer``. Session 60
    switched the code under test from ``tokenizer.decode()`` to
    ``tokenizer.detokenizer`` for proper BPE space marker (Ġ U+0120)
    conversion, so the mocks have to provide this object as well.

    Tokens are only collected by ``add_token``; nothing is decoded until
    ``finalize`` is called, and ``text`` keeps returning the last finalized
    result until ``reset`` or the next ``finalize``.
    """

    def __init__(self, decode_func):
        """Remember ``decode_func`` (token list -> string) and start empty."""
        self.decode_func = decode_func
        self.reset()

    def reset(self):
        """Forget all collected tokens and clear the decoded text."""
        self.tokens, self._text = [], ""

    def add_token(self, token_id):
        """Append ``token_id`` to the collected tokens."""
        self.tokens.append(token_id)

    def finalize(self):
        """Decode everything collected so far and cache the result."""
        decoded = self.decode_func(self.tokens)
        self._text = decoded

    @property
    def text(self):
        """Text cached by the most recent ``finalize`` ("" before that)."""
        return self._text
@contextmanager
def mock_runner_environment(temp_cache_dir, model_name="test-model"):
    """Patch everything MLXRunner needs so it can "load" a fake model.

    Args:
        temp_cache_dir: ``pathlib.Path`` used as the fake model cache root.
            A ``models--<model_name>/snapshots/abc123`` directory is created
            below it.
        model_name: Name returned by the patched resolver and embedded in the
            fake cache directory name.

    Yields:
        dict with the keys ``'mock_load'``, ``'mock_model'``,
        ``'mock_tokenizer'`` and ``'mock_resolve'`` so tests can tweak or
        inspect the mocks.
    """
    # IMPORTANT: Patch in the runner module where the functions are imported,
    # not in the cache module where they're defined. This ensures the patched
    # references are used by MLXRunner.
    with patch('mlxk2.core.runner.load') as mock_load, \
         patch('mlxk2.core.runner.resolve_model_for_operation') as mock_resolve, \
         patch('mlxk2.core.runner.get_current_model_cache') as mock_cache, \
         patch('mlxk2.core.runner.hf_to_cache_dir') as mock_hf_to_cache, \
         patch('mlxk2.core.runner.get_model_context_length') as mock_context:

        # Mock successful model resolution
        mock_resolve.return_value = (model_name, None, None)
        mock_cache.return_value = temp_cache_dir
        mock_hf_to_cache.return_value = f"models--{model_name}"
        mock_context.return_value = 8192

        # Create mock snapshots directory. exist_ok=True keeps the helper
        # re-entrant: a test may enter it more than once with the same cache
        # directory without tripping over FileExistsError.
        snapshots_dir = temp_cache_dir / f"models--{model_name}" / "snapshots" / "abc123"
        snapshots_dir.mkdir(parents=True, exist_ok=True)

        # Mock model and tokenizer
        mock_model = Mock()
        mock_tokenizer = Mock()
        mock_tokenizer.eos_token = "</s>"
        mock_tokenizer.eos_token_id = 2
        mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
        mock_tokenizer.pad_token = None
        mock_tokenizer.additional_special_tokens = []
        mock_tokenizer.added_tokens_decoder = {}
        mock_tokenizer.chat_template = None
        mock_tokenizer.name_or_path = f"mock-{model_name}"

        mock_load.return_value = (mock_model, mock_tokenizer)

        yield {
            'mock_load': mock_load,
            'mock_model': mock_model,
            'mock_tokenizer': mock_tokenizer,
            'mock_resolve': mock_resolve,
        }
class TestMLXRunnerBasic:
    """Basic MLXRunner functionality tests."""

    def test_runner_context_manager(self, temp_cache_dir):
        """Entering the runner as a context manager loads the model once."""
        model_name = "test-model"

        # Pass model_name explicitly so the patched environment and the runner
        # stay in sync even if the helper's default name changes.
        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            with MLXRunner(model_name) as runner:
                assert runner is not None
                # Should have loaded model
                mocks['mock_load'].assert_called_once()

    def test_runner_cleanup_on_exception(self, temp_cache_dir):
        """An exception raised inside the context propagates and still cleans up."""
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            # pytest.raises (unlike try/except/pass) fails the test when the
            # exception is swallowed instead of propagating to the caller.
            with pytest.raises(ValueError, match="Test exception"):
                with MLXRunner(model_name) as runner:
                    # Force an exception
                    raise ValueError("Test exception")

            # The model was loaded exactly once ...
            mocks['mock_load'].assert_called_once()
            # ... and released again although the body raised.
            assert runner.model is None
            assert runner.tokenizer is None

    def test_generate_streaming_basic(self, temp_cache_dir):
        """Streaming generation yields text chunks for the generated tokens."""
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            tokenizer = mocks['mock_tokenizer']

            # Mock generate_step to yield tokens
            with patch('mlxk2.core.runner.generate_step') as mock_gen:
                # generate_step yields (token, logits) tuples
                mock_gen.return_value = [
                    (mx.array([1]), mx.zeros(1)),  # Token IDs as mx.array
                    (mx.array([2]), mx.zeros(1)),
                ]

                # Mock tokenizer methods
                tokenizer.encode.return_value = [100, 101]  # Prompt tokens
                tokenizer.eos_token_id = 999  # Don't trigger EOS
                tokenizer.eos_token_ids = {tokenizer.eos_token_id}
                tokenizer.chat_template = None  # Disable chat template

                # Decode returns a fixed string per exact token list
                def mock_decode(tokens):
                    if tokens == [1]:
                        return "Hello"
                    elif tokens == [1, 2]:
                        return "Hello world"
                    elif tokens == [2]:
                        return " world"
                    else:
                        return "unknown"

                tokenizer.decode.side_effect = mock_decode
                # Use MockDetokenizer for proper BPE space marker handling
                tokenizer.detokenizer = MockDetokenizer(mock_decode)

                with MLXRunner(model_name) as runner:
                    tokens = list(runner.generate_streaming("test prompt", max_tokens=2))

                # Should yield incremental tokens
                assert len(tokens) >= 1
                assert any("Hello" in token for token in tokens)

    def test_generate_batch(self, temp_cache_dir):
        """Batch generation returns the complete output as one string."""
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            tokenizer = mocks['mock_tokenizer']

            with patch('mlxk2.core.runner.generate_step') as mock_gen:
                mock_gen.return_value = [
                    (mx.array([1]), mx.zeros(1)),
                    (mx.array([2]), mx.zeros(1)),
                    (mx.array([3]), mx.zeros(1)),
                ]

                # Mock tokenizer for batch mode
                tokenizer.encode.return_value = [100, 101]  # Prompt
                tokenizer.decode.side_effect = lambda tokens: " ".join(f"token{t}" for t in tokens)
                tokenizer.eos_token_id = 999  # Don't trigger EOS
                tokenizer.eos_token_ids = {tokenizer.eos_token_id}
                tokenizer.chat_template = None

                with MLXRunner(model_name) as runner:
                    result = runner.generate_batch("test prompt", max_tokens=3)

                # Should return a single string (complete response)
                assert isinstance(result, str)
                assert len(result) > 0
class TestMLXRunnerStopTokens:
    """Stop-token filtering in batch and streaming generation."""

    @staticmethod
    def _table_decoder(table):
        """Build a decode function from ordered ``(token_list, text)`` pairs.

        The returned function compares its argument against each token list
        in order and returns the text of the first match, or "" when nothing
        matches.
        """
        def decode(tokens):
            for expected, text in table:
                if tokens == expected:
                    return text
            return ""
        return decode

    def test_chat_stop_tokens_filtered_when_enabled(self, temp_cache_dir):
        """With use_chat_stop_tokens=True the output ends before "\\nHuman:"."""
        model_name = "test-model"
        decode = self._table_decoder([
            ([1], "Response"),
            ([1, 2], "Response\nHuman:"),
            ([1, 2, 3], "Response\nHuman: filtered"),
        ])

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            tok = mocks['mock_tokenizer']
            generated = [(1, 0), (2, 0), (3, 0)]

            with patch('mlxk2.core.runner.generate_step') as mock_gen:
                mock_gen.return_value = generated

                # Prompt tokens
                tok.encode.return_value = [100]
                # The same table backs decode() and the detokenizer
                tok.decode.side_effect = decode
                # Mock detokenizer (Session 60 BPE fix)
                tok.detokenizer = MockDetokenizer(decode)

                with MLXRunner(model_name) as runner:
                    result = runner.generate_batch("test prompt", use_chat_stop_tokens=True)

                # Generation is cut at the chat stop token
                assert "\nHuman:" not in result
                assert result == "Response"

    def test_chat_stop_tokens_not_filtered_by_default(self, temp_cache_dir):
        """Without opting in, batch mode keeps chat stop tokens in the output."""
        model_name = "test-model"
        decode = self._table_decoder([
            ([1], "Response"),
            ([1, 2], "Response\nHuman:"),
            ([1, 2, 3], "Response\nHuman: rest"),
        ])

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            tok = mocks['mock_tokenizer']
            generated = [(1, 0), (2, 0), (3, 0)]

            with patch('mlxk2.core.runner.generate_step') as mock_gen:
                mock_gen.return_value = generated

                tok.encode.return_value = [100]
                tok.decode.side_effect = decode
                # Mock detokenizer (Session 60 BPE fix)
                tok.detokenizer = MockDetokenizer(decode)

                with MLXRunner(model_name) as runner:
                    result = runner.generate_batch("test prompt")

                # Default: the stop token text stays in the result
                assert "\nHuman:" in result

    def test_streaming_vs_batch_consistency(self, temp_cache_dir):
        """Streaming chunks joined together equal the batch result."""
        model_name = "test-model"
        decode = self._table_decoder([
            ([1], "Hello"),
            ([2], " world"),
            ([3], "!"),
            ([1, 2], "Hello world"),
            ([2, 3], " world!"),
            ([1, 2, 3], "Hello world!"),
        ])

        def fresh_steps():
            # A new list per call so both runs see the same token sequence
            return [(1, 0), (2, 0), (3, 0)]

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            tok = mocks['mock_tokenizer']
            tok.encode.return_value = [100]
            tok.decode.side_effect = decode
            # Mock detokenizer (Session 60 BPE fix)
            tok.detokenizer = MockDetokenizer(decode)

            with MLXRunner(model_name) as runner:
                # Streaming run
                with patch('mlxk2.core.runner.generate_step', return_value=fresh_steps()):
                    chunks = runner.generate_streaming("test")
                    streaming_result = "".join(chunks)

                # Batch run
                with patch('mlxk2.core.runner.generate_step', return_value=fresh_steps()):
                    batch_result = runner.generate_batch("test")

                assert streaming_result == batch_result
class TestMLXRunnerMemorySafety:
    """Releasing the model on exit and reusing runners one after another."""

    def test_model_cleanup_on_context_exit(self, temp_cache_dir):
        """Model and tokenizer are set while inside the context, None after it."""
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name):
            with MLXRunner(model_name) as runner:
                assert runner.model is not None
                assert runner.tokenizer is not None

            # The name bound by `as` is still in scope here; after leaving the
            # context the runner must have dropped its references.
            assert runner.model is None
            assert runner.tokenizer is None

    def test_multiple_context_managers(self, temp_cache_dir):
        """Two runners used sequentially each trigger their own load."""
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            # Open and close two independent runners, one after the other
            for _ in range(2):
                with MLXRunner(model_name) as current:
                    assert current is not None

            # One load per runner
            assert mocks['mock_load'].call_count == 2
class TestMLXRunnerDynamicTokens:
    """Dynamic versus explicit token limits."""

    def test_no_max_tokens_uses_dynamic(self, temp_cache_dir):
        """The dynamic limit is derived from the (mocked) 8192 context length."""
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name):
            with MLXRunner(model_name) as runner:
                limit = runner._calculate_dynamic_max_tokens()

                # A reasonable fraction of the context (server-mode default);
                # half of an 8K context is accepted as the upper bound.
                lower, upper = 1000, 4096
                assert lower <= limit
                assert limit <= upper

    def test_respects_explicit_max_tokens(self, temp_cache_dir):
        """Batch generation runs with an explicit max_tokens value.

        NOTE(review): this is a smoke test only - it makes no assertion about
        the result or about how max_tokens reaches generate_step; confirm
        against the runner implementation before adding one.
        """
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            # Tokenizer methods needed for generation
            tok = mocks['mock_tokenizer']
            tok.encode.return_value = [1]
            tok.decode.return_value = "ok"

            with MLXRunner(model_name) as runner:
                # A single generated token; the iterator is exhausted after it
                single_step = iter([(mx.array([1]), mx.zeros(1))])
                with patch('mlxk2.core.runner.generate_step', return_value=single_step):
                    runner.generate_batch("test", max_tokens=100)
class TestMLXRunnerErrorHandling:
    """Failure paths and edge cases."""

    def test_model_loading_failure(self, temp_cache_dir):
        """A FileNotFoundError from load() surfaces to the caller."""
        model_name = "test-model"

        # Regular mock environment, except that load() is made to fail
        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            mocks['mock_load'].side_effect = FileNotFoundError("Model not found")

            with pytest.raises(FileNotFoundError), MLXRunner(model_name):
                pass

    def test_generation_interruption(self, temp_cache_dir):
        """A stale interruption flag does not leak into a new generation."""
        model_name = "test-model"

        with mock_runner_environment(temp_cache_dir, model_name) as mocks:
            # Tokenizer methods needed for generation
            tok = mocks['mock_tokenizer']
            tok.encode.return_value = [1]
            tok.decode.return_value = "ok"

            # With the recovery semantics, an interruption flag set before the
            # call is cleared when a new generation starts.
            with MLXRunner(model_name) as runner:
                runner._interrupted = True
                pieces = list(runner.generate_streaming("test"))

            # None of the yielded strings may be an interruption message
            flagged = [
                p for p in pieces
                if isinstance(p, str) and "interrupted" in p.lower()
            ]
            assert not flagged
# Test fixtures for integration with existing test infrastructure
@pytest.fixture
def mock_tiny_model():
    """Identifier of a minimal model, for tests that must be fast."""
    tiny_model_id = "hf-internal-testing/tiny-random-gpt2"
    return tiny_model_id
@pytest.fixture
def temp_cache_dir():
    """Yield a fresh temporary directory (as a Path) to use as the cache root.

    The directory is removed again once the test that requested it finishes.
    """
    scratch = tempfile.TemporaryDirectory()
    try:
        yield Path(scratch.name)
    finally:
        scratch.cleanup()