Files
mlx-knife/tests_2.0/test_run_vision.py
T
The BROKE Cluster Team bf7480d042 Release 2.0.4-beta.9: Audio transcription via mlx-audio
Major Features:
- Audio transcription via mlx-audio backend (Whisper, >10min duration)
- OpenAI /v1/audio/transcriptions endpoint
- Memory Gate System (Vision: 8GB, Audio: 4GB)
- Config-based backend routing (ADR-020)
- Benchmark toolchain (memmon/memplot, Schema v0.2.2)

Key Fixes:
- EuroLLM tokenizer decoding
- Vision-model text-only routing regression
- Multimodal model context length detection
- Memory cleanup bug (mx.metal.clear_cache)
- Orphan process bug

Test Results:
- Unit tests: 647 passed, 11 skipped (Python 3.10-3.12)
- wet-umbrella: 171 passed total

See CHANGELOG.md for complete details and known issues.
2026-02-04 03:10:30 +01:00

240 lines
8.3 KiB
Python

from __future__ import annotations
import sys
from pathlib import Path
import pytest
from mlxk2.operations.run import run_model
from mlxk2.core.cache import hf_to_cache_dir
from mlxk2.core.capabilities import Backend, PolicyDecision, BackendPolicy, ModelCapabilities
# Vision support requires Python 3.10+ (mlx-vlm requirement)
pytestmark = pytest.mark.skipif(
sys.version_info < (3, 10),
reason="Vision support requires Python 3.10+ (mlx-vlm dependency)"
)
# Lazy import VisionRunner to avoid import errors on Python 3.9
if sys.version_info >= (3, 10):
from mlxk2.core.vision_runner import VisionRunner
def _mk_vision_snapshot(cache_root: Path, repo: str) -> Path:
cache_dir = cache_root / hf_to_cache_dir(repo)
snap = cache_dir / "snapshots" / "1111111111111111111111111111111111111111"
snap.mkdir(parents=True, exist_ok=True)
(snap / "config.json").write_text('{"model_type": "llava"}', encoding="utf-8")
(snap / "model.safetensors").write_bytes(b"weights")
# ADR-012 Phase 2: Vision models require preprocessor_config.json for health
(snap / "preprocessor_config.json").write_text('{"size": 224}', encoding="utf-8")
return snap
def _make_vision_policy(model_path, model_name):
"""Create a mock vision policy that allows vision backend."""
caps = ModelCapabilities(
model_path=model_path,
model_name=model_name,
is_vision=True,
mlx_vlm_available=True,
python_version=(3, 10, 0),
)
policy = BackendPolicy(
backend=Backend.MLX_VLM,
decision=PolicyDecision.ALLOW,
)
return caps, policy
def test_run_vision_routes_to_vision_runner(monkeypatch, isolated_cache):
repo = "mlx-community/llava-vision-test"
snap = _mk_vision_snapshot(isolated_cache, repo)
calls = {}
class DummyVisionRunner:
def __init__(self, model_path, model_name, verbose=False):
calls["path"] = model_path
calls["name"] = model_name
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
return False
def generate(self, prompt, images=None, **kwargs):
calls["prompt"] = prompt
calls["images"] = list(images or [])
calls["kwargs"] = kwargs
return "vision-output"
monkeypatch.setattr("mlxk2.core.vision_runner.VisionRunner", DummyVisionRunner)
monkeypatch.setattr("mlxk2.operations.run.resolve_model_for_operation", lambda spec: (repo, None, None))
monkeypatch.setattr("mlxk2.operations.run.get_current_model_cache", lambda: isolated_cache)
# Mock the new probe_and_select to return vision policy
monkeypatch.setattr(
"mlxk2.core.capabilities.probe_and_select",
lambda path, name, context="cli", has_images=False: _make_vision_policy(path, name)
)
# Session 146: Vision models now only route to VisionRunner when images are present
# Pass a dummy image to trigger vision path
image_bytes = b"dummy"
result = run_model(
model_spec=repo,
prompt="hello",
images=[("test.png", image_bytes)],
stream=False,
json_output=True
)
assert result == "vision-output"
assert calls["path"] == snap
assert calls["name"] == repo
assert calls["prompt"] == "hello"
assert calls["images"] == [("test.png", image_bytes)]
def test_run_vision_images_get_default_prompt(monkeypatch, isolated_cache):
repo = "mlx-community/llava-vision-test-image"
snap = _mk_vision_snapshot(isolated_cache, repo)
captured = {}
class DummyVisionRunner:
def __init__(self, model_path, model_name, verbose=False):
captured["path"] = model_path
captured["name"] = model_name
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
return False
def generate(self, prompt, images=None, **kwargs):
captured["prompt"] = prompt
captured["images"] = list(images or [])
return "vision-image-output"
monkeypatch.setattr("mlxk2.core.vision_runner.VisionRunner", DummyVisionRunner)
monkeypatch.setattr("mlxk2.operations.run.resolve_model_for_operation", lambda spec: (repo, None, None))
monkeypatch.setattr("mlxk2.operations.run.get_current_model_cache", lambda: isolated_cache)
# Mock the new probe_and_select to return vision policy
monkeypatch.setattr(
"mlxk2.core.capabilities.probe_and_select",
lambda path, name, context="cli", has_images=False: _make_vision_policy(path, name)
)
image_bytes = b"raw"
result = run_model(
model_spec=repo,
prompt=None,
images=[("img.png", image_bytes)],
stream=False,
json_output=True,
)
assert result == "vision-image-output"
assert captured["prompt"] == "Describe the image."
assert captured["images"] == [("img.png", image_bytes)]
def test_vision_normalizes_generation_result():
class GenerationResult:
def __init__(self):
self.text = "vision answer"
self.generation_tokens = 10
result = VisionRunner._normalize_result(GenerationResult())
assert result == "vision answer"
def test_vision_adds_filename_mapping_for_multiple_images():
"""Test that filename mapping is added for multiple images (deterministic).
The mapping is formatted as a Markdown table with an HTML comment marker
'<!-- mlxk:filenames -->' to distinguish it from model-generated content.
The comment is invisible in rendered markdown but identifiable in raw text.
"""
images = [
("vacation1.jpg", b"data1"),
("vacation2.jpg", b"data2"),
("vacation3.jpg", b"data3"),
]
result = VisionRunner._add_filename_mapping("Images 1 and 3 show motorboats.", images)
# Updated for ADR-017 Phase 1: Collapsible HTML table with EXIF columns (enabled by default)
# Session 75: Metadata moved to BEGINNING for better UX in chunking scenarios
# Hashes: sha256(b"data1")[:8] = 5b41362b, etc.
expected = """<details>
<summary>📸 Image Metadata (3 images)</summary>
<!-- mlxk:filenames -->
| Image | Filename | Original | Location | Date | Camera |
|-------|----------|----------|----------|------|--------|
| 1 | image_5b41362b.jpeg | vacation1.jpg | - | - | - |
| 2 | image_d98cf53e.jpeg | vacation2.jpg | - | - | - |
| 3 | image_f60f2d65.jpeg | vacation3.jpg | - | - | - |
</details>
Images 1 and 3 show motorboats."""
assert result == expected
def test_vision_no_mapping_for_single_image():
"""Single image should now get mapping (ADR-017 Phase 1: enables cross-model workflows)."""
images = [("single.jpg", b"data")]
# ADR-017: Even single images get mapping (enables Vision→Text model switching)
result = VisionRunner._add_filename_mapping("A dog.", images)
# Updated for ADR-017 Phase 1: Collapsible HTML table with EXIF columns (enabled by default)
# Session 75: Metadata moved to BEGINNING for better UX in chunking scenarios
# Hash: sha256(b"data")[:8] = 3a6eb079
expected = """<details>
<summary>📸 Image Metadata (1 image)</summary>
<!-- mlxk:filenames -->
| Image | Filename | Original | Location | Date | Camera |
|-------|----------|----------|----------|------|--------|
| 1 | image_3a6eb079.jpeg | single.jpg | - | - | - |
</details>
A dog."""
assert result == expected
def test_vision_text_only_routing_condition():
"""Session 146: Vision routing uses 'if images' check, so empty list routes to text path.
This is a simple unit test that verifies the routing logic condition.
The actual E2E behavior is tested in tests_2.0/live/test_vision_e2e_live.py.
"""
# The key routing condition in run.py line 495:
# if images or (audio and audio_backend == Backend.MLX_VLM):
# → VisionRunner path
# else:
# → MLXRunner path (text)
# Empty list is falsy in Python
images = []
audio = None
audio_backend = None
# This is the routing condition from run.py
uses_vision_path = bool(images) or (audio and audio_backend is not None)
assert not uses_vision_path, "Empty images should NOT trigger vision path"
# With images, it SHOULD trigger vision path
images_with_content = [("test.png", b"data")]
uses_vision_path = bool(images_with_content) or (audio and audio_backend is not None)
assert uses_vision_path, "Non-empty images SHOULD trigger vision path"