mirror of
https://github.com/cloudstack-llc/mlx-knife.git
synced 2026-07-01 20:44:14 -04:00
bf7480d042
Major Features: - Audio transcription via mlx-audio backend (Whisper, >10min duration) - OpenAI /v1/audio/transcriptions endpoint - Memory Gate System (Vision: 8GB, Audio: 4GB) - Config-based backend routing (ADR-020) - Benchmark toolchain (memmon/memplot, Schema v0.2.2) Key Fixes: - EuroLLM tokenizer decoding - Vision-model text-only routing regression - Multimodal model context length detection - Memory cleanup bug (mx.metal.clear_cache) - Orphan process bug Test Results: - Unit tests: 647 passed, 11 skipped (Python 3.10-3.12) - wet-umbrella: 171 passed total See CHANGELOG.md for complete details and known issues.
240 lines
8.3 KiB
Python
240 lines
8.3 KiB
Python
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from mlxk2.operations.run import run_model
|
|
from mlxk2.core.cache import hf_to_cache_dir
|
|
from mlxk2.core.capabilities import Backend, PolicyDecision, BackendPolicy, ModelCapabilities
|
|
|
|
# mlx-vlm (the vision backend) needs Python 3.10 or newer, so the floor is
# named once and reused by both the module-wide skip and the guarded import.
_VISION_MIN_PYTHON = (3, 10)

# Skip every test in this module on interpreters older than the floor.
pytestmark = pytest.mark.skipif(
    sys.version_info < _VISION_MIN_PYTHON,
    reason="Vision support requires Python 3.10+ (mlx-vlm dependency)",
)

# Only import VisionRunner where the import can succeed; this avoids import
# errors at collection time on Python 3.9.
if sys.version_info >= _VISION_MIN_PYTHON:
    from mlxk2.core.vision_runner import VisionRunner
|
|
|
|
|
|
def _mk_vision_snapshot(cache_root: Path, repo: str) -> Path:
    """Create a minimal fake vision-model snapshot below ``cache_root``.

    The snapshot directory is created (parents included) and filled with a
    llava ``config.json``, a dummy ``model.safetensors`` and a
    ``preprocessor_config.json``.

    Args:
        cache_root: Root directory of the model cache used by the test.
        repo: Repository id; mapped to a cache directory name via
            ``hf_to_cache_dir``.

    Returns:
        Path of the created snapshot directory.
    """
    snapshot_dir = (
        cache_root
        / hf_to_cache_dir(repo)
        / "snapshots"
        / "1111111111111111111111111111111111111111"
    )
    snapshot_dir.mkdir(parents=True, exist_ok=True)

    text_files = {
        "config.json": '{"model_type": "llava"}',
        # ADR-012 Phase 2: Vision models require preprocessor_config.json for health
        "preprocessor_config.json": '{"size": 224}',
    }
    for filename, content in text_files.items():
        (snapshot_dir / filename).write_text(content, encoding="utf-8")

    (snapshot_dir / "model.safetensors").write_bytes(b"weights")
    return snapshot_dir
|
|
|
|
|
|
def _make_vision_policy(model_path, model_name):
    """Build the ``(capabilities, policy)`` pair used to fake backend probing.

    The capabilities describe a vision model with mlx-vlm available on
    Python 3.10.0; the policy selects the MLX_VLM backend with an ALLOW
    decision.

    Args:
        model_path: Passed through as ``ModelCapabilities.model_path``.
        model_name: Passed through as ``ModelCapabilities.model_name``.

    Returns:
        Tuple ``(ModelCapabilities, BackendPolicy)``.
    """
    vision_caps = ModelCapabilities(
        model_path=model_path,
        model_name=model_name,
        is_vision=True,
        mlx_vlm_available=True,
        python_version=(3, 10, 0),
    )
    allow_vlm = BackendPolicy(backend=Backend.MLX_VLM, decision=PolicyDecision.ALLOW)
    return vision_caps, allow_vlm
|
|
|
|
|
|
def test_run_vision_routes_to_vision_runner(monkeypatch, isolated_cache):
    """``run_model`` with an image attached must delegate to VisionRunner.

    VisionRunner is replaced by a recording stub, and model resolution, cache
    lookup and backend probing are patched so that no real model is loaded.
    The test then checks the stub received the snapshot path, the repo name,
    the prompt and the image list unchanged.
    """
    repo = "mlx-community/llava-vision-test"
    snap = _mk_vision_snapshot(isolated_cache, repo)
    calls = {}

    class DummyVisionRunner:
        """Context-manager stub that records how it was built and called."""

        def __init__(self, model_path, model_name, verbose=False):
            calls.update(path=model_path, name=model_name)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Returning False means exceptions are never suppressed.
            return False

        def generate(self, prompt, images=None, **kwargs):
            calls.update(prompt=prompt, images=list(images or []), kwargs=kwargs)
            return "vision-output"

    # Mock the new probe_and_select to return vision policy
    def fake_probe_and_select(path, name, context="cli", has_images=False):
        return _make_vision_policy(path, name)

    # Applied in insertion order, matching the order of the individual patches.
    replacements = {
        "mlxk2.core.vision_runner.VisionRunner": DummyVisionRunner,
        "mlxk2.operations.run.resolve_model_for_operation": lambda spec: (repo, None, None),
        "mlxk2.operations.run.get_current_model_cache": lambda: isolated_cache,
        "mlxk2.core.capabilities.probe_and_select": fake_probe_and_select,
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    # Session 146: Vision models now only route to VisionRunner when images are present
    # Pass a dummy image to trigger vision path
    image_bytes = b"dummy"
    attached = [("test.png", image_bytes)]
    result = run_model(
        model_spec=repo,
        prompt="hello",
        images=attached,
        stream=False,
        json_output=True,
    )

    assert result == "vision-output"
    assert calls["path"] == snap
    assert calls["name"] == repo
    assert calls["prompt"] == "hello"
    assert calls["images"] == [("test.png", image_bytes)]
|
|
|
|
|
|
def test_run_vision_images_get_default_prompt(monkeypatch, isolated_cache):
    """Images sent without a prompt reach the runner with the default prompt.

    ``run_model`` is called with ``prompt=None`` and one image. The stubbed
    VisionRunner must be constructed from the prepared snapshot and repo name,
    and its ``generate`` must receive ``"Describe the image."`` together with
    the unchanged image list.
    """
    repo = "mlx-community/llava-vision-test-image"
    snap = _mk_vision_snapshot(isolated_cache, repo)

    captured = {}

    class DummyVisionRunner:
        """Context-manager stub that records constructor and generate() arguments."""

        def __init__(self, model_path, model_name, verbose=False):
            captured["path"] = model_path
            captured["name"] = model_name

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Returning False means exceptions are never suppressed.
            return False

        def generate(self, prompt, images=None, **kwargs):
            captured["prompt"] = prompt
            captured["images"] = list(images or [])
            return "vision-image-output"

    monkeypatch.setattr("mlxk2.core.vision_runner.VisionRunner", DummyVisionRunner)
    monkeypatch.setattr(
        "mlxk2.operations.run.resolve_model_for_operation",
        lambda spec: (repo, None, None),
    )
    monkeypatch.setattr("mlxk2.operations.run.get_current_model_cache", lambda: isolated_cache)
    # Mock the new probe_and_select to return vision policy
    monkeypatch.setattr(
        "mlxk2.core.capabilities.probe_and_select",
        lambda path, name, context="cli", has_images=False: _make_vision_policy(path, name),
    )

    image_bytes = b"raw"
    result = run_model(
        model_spec=repo,
        prompt=None,
        images=[("img.png", image_bytes)],
        stream=False,
        json_output=True,
    )

    assert result == "vision-image-output"
    # The stub records its constructor arguments; verify them so the recorded
    # values (and the prepared snapshot path) are actually checked.
    assert captured["path"] == snap
    assert captured["name"] == repo
    assert captured["prompt"] == "Describe the image."
    assert captured["images"] == [("img.png", image_bytes)]
|
|
|
|
|
|
def test_vision_normalizes_generation_result():
    """``_normalize_result`` reduces a result object to its ``text`` value."""

    class GenerationResult:
        """Stand-in result object exposing ``text`` and ``generation_tokens``."""

        def __init__(self):
            self.text = "vision answer"
            self.generation_tokens = 10

    raw_result = GenerationResult()
    normalized = VisionRunner._normalize_result(raw_result)

    assert normalized == "vision answer"
|
|
|
|
|
|
def test_vision_adds_filename_mapping_for_multiple_images():
    """Filename mapping for several images is emitted deterministically.

    The expected output is a collapsible ``<details>`` section holding a
    Markdown table, tagged with the HTML comment ``<!-- mlxk:filenames -->``
    so it can be told apart from model-generated content. The comment is
    invisible in rendered markdown but identifiable in raw text. The model
    text follows after the metadata section.
    """
    images = [(f"vacation{idx}.jpg", f"data{idx}".encode()) for idx in (1, 2, 3)]

    result = VisionRunner._add_filename_mapping("Images 1 and 3 show motorboats.", images)

    # Updated for ADR-017 Phase 1: Collapsible HTML table with EXIF columns (enabled by default)
    # Session 75: Metadata moved to BEGINNING for better UX in chunking scenarios
    # Hashes: sha256(b"data1")[:8] = 5b41362b, etc.
    expected_lines = [
        "<details>",
        "<summary>📸 Image Metadata (3 images)</summary>",
        "",
        "<!-- mlxk:filenames -->",
        "| Image | Filename | Original | Location | Date | Camera |",
        "|-------|----------|----------|----------|------|--------|",
        "| 1 | image_5b41362b.jpeg | vacation1.jpg | - | - | - |",
        "| 2 | image_d98cf53e.jpeg | vacation2.jpg | - | - | - |",
        "| 3 | image_f60f2d65.jpeg | vacation3.jpg | - | - | - |",
        "",
        "</details>",
        "",
        "Images 1 and 3 show motorboats.",
    ]
    expected = "\n".join(expected_lines)

    assert result == expected
|
|
|
|
|
|
def test_vision_no_mapping_for_single_image():
    """A single image also gets the filename mapping (ADR-017 Phase 1).

    Despite the test's name, the asserted behaviour is that the mapping IS
    added for one image; this enables cross-model (Vision→Text) workflows.
    """
    single_image = [("single.jpg", b"data")]

    # ADR-017: Even single images get mapping (enables Vision→Text model switching)
    result = VisionRunner._add_filename_mapping("A dog.", single_image)

    # Updated for ADR-017 Phase 1: Collapsible HTML table with EXIF columns (enabled by default)
    # Session 75: Metadata moved to BEGINNING for better UX in chunking scenarios
    # Hash: sha256(b"data")[:8] = 3a6eb079
    expected_lines = [
        "<details>",
        "<summary>📸 Image Metadata (1 image)</summary>",
        "",
        "<!-- mlxk:filenames -->",
        "| Image | Filename | Original | Location | Date | Camera |",
        "|-------|----------|----------|----------|------|--------|",
        "| 1 | image_3a6eb079.jpeg | single.jpg | - | - | - |",
        "",
        "</details>",
        "",
        "A dog.",
    ]
    expected = "\n".join(expected_lines)

    assert result == expected
|
|
|
|
|
|
def test_vision_text_only_routing_condition():
    """Session 146: an empty image list must fall through to the text path.

    This is a plain unit check of the truthiness logic behind the routing
    decision; it does not call ``run_model``. The actual E2E behavior is
    tested in tests_2.0/live/test_vision_e2e_live.py.
    """

    def takes_vision_path(images, audio=None, audio_backend=None):
        # Local mirror of the key routing condition in run.py line 495:
        #   if images or (audio and audio_backend == Backend.MLX_VLM):
        #       → VisionRunner path
        #   else:
        #       → MLXRunner path (text)
        return bool(images) or (audio and audio_backend is not None)

    # Empty list is falsy in Python
    assert not takes_vision_path([]), "Empty images should NOT trigger vision path"

    # With images, it SHOULD trigger vision path
    images_with_content = [("test.png", b"data")]
    assert takes_vision_path(images_with_content), "Non-empty images SHOULD trigger vision path"
|