Files
mlx-knife/tests_2.0/test_vision_adapter.py
T
The BROKE Cluster Team bf7480d042 Release 2.0.4-beta.9: Audio transcription via mlx-audio
Major Features:
- Audio transcription via mlx-audio backend (Whisper, >10min duration)
- OpenAI /v1/audio/transcriptions endpoint
- Memory Gate System (Vision: 8GB, Audio: 4GB)
- Config-based backend routing (ADR-020)
- Benchmark toolchain (memmon/memplot, Schema v0.2.2)

Key Fixes:
- EuroLLM tokenizer decoding
- Vision-model text-only routing regression
- Multimodal model context length detection
- Memory cleanup bug (mx.metal.clear_cache)
- Orphan process bug

Test Results:
- Unit tests: 647 passed, 11 skipped (Python 3.10-3.12)
- wet-umbrella: 171 passed total

See CHANGELOG.md for complete details and known issues.
2026-02-04 03:10:30 +01:00

986 lines
36 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Tests for Vision HTTP Adapter (ADR-012 Phase 3).
Tests Base64 decoding, OpenAI message format parsing, and error handling.
"""
import base64
import pytest
# No MLXKError import needed - using standard ValueError
from mlxk2.tools.vision_adapter import (
VisionHTTPAdapter,
MAX_SAFE_CHUNK_SIZE,
MAX_IMAGE_SIZE_BYTES,
MAX_TOTAL_IMAGE_SIZE_BYTES,
MAX_AUDIO_SIZE_BYTES,
)
# Test fixtures: Base64-encoded images (minimal valid images)
# 1x1 red pixel JPEG
VALID_JPEG_B64 = "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDAREAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/wA//"
# 1x1 transparent pixel PNG
VALID_PNG_B64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=="
class TestDecodeBase64Image:
    """Checks for ``VisionHTTPAdapter.decode_base64_image()``.

    Covers successful decoding of data URLs, the shape of the returned
    filename, and the ``ValueError`` raised for malformed, unsupported,
    non-data or oversized input.
    """

    def test_decode_valid_jpeg_data_url(self):
        """A JPEG data URL yields an ``image_*.jpeg`` name and non-empty bytes."""
        data_url = f"data:image/jpeg;base64,{VALID_JPEG_B64}"

        name, payload = VisionHTTPAdapter.decode_base64_image(data_url)

        assert name.startswith("image_")
        assert name.endswith(".jpeg")
        assert len(payload) > 0
        assert isinstance(payload, bytes)

    def test_decode_valid_png_data_url(self):
        """A PNG data URL yields an ``image_*.png`` name and non-empty data."""
        data_url = f"data:image/png;base64,{VALID_PNG_B64}"

        name, payload = VisionHTTPAdapter.decode_base64_image(data_url)

        assert name.startswith("image_")
        assert name.endswith(".png")
        assert len(payload) > 0

    def test_decode_jpg_normalized_to_jpeg(self):
        """The ``image/jpg`` MIME alias ends up with the ``.jpeg`` extension."""
        data_url = f"data:image/jpg;base64,{VALID_JPEG_B64}"

        name, _payload = VisionHTTPAdapter.decode_base64_image(data_url)

        # The extension is expected to be ".jpeg" rather than ".jpg".
        assert name.endswith(".jpeg")

    def test_filename_is_deterministic(self):
        """Decoding the same data URL twice gives the same filename."""
        data_url = f"data:image/jpeg;base64,{VALID_JPEG_B64}"

        first_name, _ = VisionHTTPAdapter.decode_base64_image(data_url)
        second_name, _ = VisionHTTPAdapter.decode_base64_image(data_url)

        assert first_name == second_name

    def test_filename_differs_for_different_images(self):
        """Two different images must not share a filename."""
        jpeg_url = f"data:image/jpeg;base64,{VALID_JPEG_B64}"
        png_url = f"data:image/png;base64,{VALID_PNG_B64}"

        jpeg_name, _ = VisionHTTPAdapter.decode_base64_image(jpeg_url)
        png_name, _ = VisionHTTPAdapter.decode_base64_image(png_url)

        assert jpeg_name != png_name

    def test_malformed_base64_raises_error(self):
        """A payload that is not valid base64 is rejected with ``ValueError``."""
        data_url = "data:image/jpeg;base64,!!!invalid_base64!!!"

        with pytest.raises(ValueError) as excinfo:
            VisionHTTPAdapter.decode_base64_image(data_url)

        message = str(excinfo.value)
        assert "Failed to decode base64" in message

    def test_empty_base64_data_raises_error(self):
        """A data URL with nothing after ``base64,`` is rejected."""
        data_url = "data:image/jpeg;base64,"

        with pytest.raises(ValueError) as excinfo:
            VisionHTTPAdapter.decode_base64_image(data_url)

        # An empty payload is reported as a format error, not a decode error.
        message = str(excinfo.value)
        assert "Invalid data URL format" in message

    def test_unsupported_mime_type_raises_error(self):
        """An image MIME type outside the supported set (here BMP) is rejected."""
        data_url = "data:image/bmp;base64,Qk0="

        with pytest.raises(ValueError) as excinfo:
            VisionHTTPAdapter.decode_base64_image(data_url)

        # Either wording is accepted for the rejection message.
        message = str(excinfo.value)
        assert "Invalid data URL format" in message or "Unsupported" in message

    def test_external_url_raises_error(self):
        """An ``https://`` URL is refused: only inline data URLs are accepted."""
        remote_url = "https://example.com/image.jpg"

        with pytest.raises(ValueError) as excinfo:
            VisionHTTPAdapter.decode_base64_image(remote_url)

        message = str(excinfo.value)
        assert "Only data URLs are supported" in message
        assert "External URLs are not supported" in message

    def test_non_data_url_scheme_raises_error(self):
        """A ``file://`` URL is refused like any other non-data scheme."""
        local_url = "file:///path/to/image.jpg"

        with pytest.raises(ValueError) as excinfo:
            VisionHTTPAdapter.decode_base64_image(local_url)

        message = str(excinfo.value)
        assert "Only data URLs are supported" in message

    def test_oversized_image_raises_error(self):
        """A payload larger than ``MAX_IMAGE_SIZE_BYTES`` is rejected."""
        # Raw payload is 1000 bytes over the per-image limit before encoding.
        oversized = ("A" * (MAX_IMAGE_SIZE_BYTES + 1000)).encode()
        encoded = base64.b64encode(oversized).decode()
        data_url = f"data:image/jpeg;base64,{encoded}"

        with pytest.raises(ValueError) as excinfo:
            VisionHTTPAdapter.decode_base64_image(data_url)

        message = str(excinfo.value).lower()
        assert "exceeds limit" in message
class TestParseOpenAIMessages:
    """Checks for ``VisionHTTPAdapter.parse_openai_messages()``.

    The parser is called with OpenAI-style chat messages and is unpacked as a
    ``(prompt, images, audio)`` triple. These tests cover text/image
    extraction, the default prompt, validation errors, and the total image
    size limit.
    """

    def test_parse_single_image_message(self):
        """One text part plus one image part yields the text and one image."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{VALID_JPEG_B64}"}},
                ],
            }
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(messages)

        assert prompt == "What's in this image?"
        assert len(images) == 1
        assert images[0][0].startswith("image_")
        assert images[0][0].endswith(".jpeg")

    def test_parse_multiple_images_message(self):
        """Several image parts in one message are returned in order."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Compare these images"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{VALID_JPEG_B64}"}},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{VALID_PNG_B64}"}},
                ],
            }
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(messages)

        assert prompt == "Compare these images"
        assert len(images) == 2
        assert images[0][0].endswith(".jpeg")
        assert images[1][0].endswith(".png")

    def test_parse_string_content(self):
        """Plain string content (no vision parts) becomes the prompt as-is."""
        messages = [
            {"role": "user", "content": "Hello, how are you?"}
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(messages)

        assert prompt == "Hello, how are you?"
        assert len(images) == 0

    def test_parse_images_without_text_uses_default_prompt(self):
        """An image-only message falls back to the default prompt."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{VALID_JPEG_B64}"}},
                ],
            }
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(messages)

        assert prompt == "Describe the image."
        assert len(images) == 1

    def test_parse_multiple_text_blocks_combined(self):
        """Multiple text parts are combined with a single space."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "First part."},
                    {"type": "text", "text": "Second part."},
                ],
            }
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(messages)

        assert prompt == "First part. Second part."
        assert len(images) == 0

    def test_parse_unknown_content_type_skipped(self):
        """Parts with an unrecognised ``type`` are skipped without error."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Valid text"},
                    {"type": "unknown_type", "data": "ignored"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{VALID_JPEG_B64}"}},
                ],
            }
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(messages)

        assert prompt == "Valid text"
        assert len(images) == 1

    def test_empty_messages_raises_error(self):
        """An empty messages list is rejected with ``ValueError``."""
        messages = []

        with pytest.raises(ValueError) as exc:
            VisionHTTPAdapter.parse_openai_messages(messages)

        assert "Messages list cannot be empty" in str(exc.value)

    def test_no_content_raises_error(self):
        """A message whose content array is empty is rejected."""
        messages = [
            {"role": "user", "content": []}
        ]

        with pytest.raises(ValueError) as exc:
            VisionHTTPAdapter.parse_openai_messages(messages)

        assert "at least text" in str(exc.value).lower()

    def test_invalid_content_type_raises_error(self):
        """Content that is neither a string nor a list is rejected."""
        messages = [
            {"role": "user", "content": 123}  # Not string or list
        ]

        with pytest.raises(ValueError) as exc:
            VisionHTTPAdapter.parse_openai_messages(messages)

        assert "must be string or array" in str(exc.value).lower()

    def test_missing_image_url_dict_raises_error(self):
        """An ``image_url`` part without its ``image_url`` dict is rejected."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url"}  # Missing image_url dict
                ],
            }
        ]

        with pytest.raises(ValueError) as exc:
            VisionHTTPAdapter.parse_openai_messages(messages)

        assert "image_url" in str(exc.value).lower()

    def test_empty_image_url_raises_error(self):
        """An ``image_url`` part with an empty URL string is rejected."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": ""}}
                ],
            }
        ]

        with pytest.raises(ValueError) as exc:
            VisionHTTPAdapter.parse_openai_messages(messages)

        assert "url cannot be empty" in str(exc.value).lower()

    # NOTE: MAX_IMAGES_PER_REQUEST limit removed (beta.9)
    # Image count is unlimited - chunking (MAX_SAFE_CHUNK_SIZE) handles batch safety
    # See: git log for beta.6 rationale

    def test_total_image_size_limit_raises_error(self):
        """Images that each fit but jointly exceed the total limit are rejected (F-01).

        Sizes are derived from the adapter's own limits rather than hard-coded:
        every image is half of ``MAX_IMAGE_SIZE_BYTES`` so the per-image check
        cannot be what fires, and enough copies are sent to push the combined
        payload strictly past ``MAX_TOTAL_IMAGE_SIZE_BYTES``.
        """
        per_image_bytes = max(1, MAX_IMAGE_SIZE_BYTES // 2)
        # One more image than fits under the total limit => strictly over it.
        image_count = MAX_TOTAL_IMAGE_SIZE_BYTES // per_image_bytes + 1

        encoded = base64.b64encode(b"A" * per_image_bytes).decode()
        # Build the (large) URL once; all parts share the same string object.
        url = f"data:image/jpeg;base64,{encoded}"
        image_items = [
            {"type": "image_url", "image_url": {"url": url}}
            for _ in range(image_count)
        ]
        messages = [
            {
                "role": "user",
                "content": [{"type": "text", "text": "describe"}] + image_items,
            }
        ]

        # The request is over the limit, so the error is required; a silent
        # success would mean the limit is not enforced.
        with pytest.raises(ValueError) as exc:
            VisionHTTPAdapter.parse_openai_messages(messages)

        message = str(exc.value).lower()
        assert "total image size" in message or "exceeds limit" in message
class TestSequentialImageExtraction:
    """
    Multi-turn extraction behaviour (VISION-SEQUENTIAL-IMAGES-ISSUE).

    Expected contract: images come ONLY from the most recent user message,
    while the text prompt carries context from ALL messages, assistant
    replies included.
    """

    # Data URLs for the two fixture images used throughout this class.
    JPEG_URL = f"data:image/jpeg;base64,{VALID_JPEG_B64}"
    PNG_URL = f"data:image/png;base64,{VALID_PNG_B64}"

    @staticmethod
    def _user_with_images(text, *urls):
        """Build a user message: one text part followed by one image part per URL."""
        content = [{"type": "text", "text": text}]
        for url in urls:
            content.append({"type": "image_url", "image_url": {"url": url}})
        return {"role": "user", "content": content}

    @staticmethod
    def _user_text(text):
        """Build a user message whose content is a plain string."""
        return {"role": "user", "content": text}

    @staticmethod
    def _assistant(text):
        """Build an assistant message whose content is a plain string."""
        return {"role": "assistant", "content": text}

    def test_sequential_images_extracts_only_last(self):
        """Two image turns in a row: only the image of the last turn is returned."""
        conversation = [
            self._user_with_images("describe this picture", self.JPEG_URL),
            self._assistant("A cat on a couch."),
            self._user_with_images("describe this picture", self.PNG_URL),
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        # Only the PNG from the final user turn, not the earlier JPEG.
        assert len(images) == 1
        assert images[0][0].endswith(".png")
        # The prompt still carries text from the whole conversation.
        assert "describe this picture" in prompt
        assert "A cat on a couch" in prompt

    def test_text_only_follow_up_extracts_no_images(self):
        """A text-only follow-up after an image turn returns no images."""
        conversation = [
            self._user_with_images("describe this picture", self.JPEG_URL),
            self._assistant("A tabby cat on a blue couch."),
            self._user_text("What color was the cat?"),
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        # Last user turn has no image parts.
        assert len(images) == 0
        # Every turn's text is present in the prompt.
        assert "describe this picture" in prompt
        assert "A tabby cat on a blue couch" in prompt
        assert "What color was the cat?" in prompt

    def test_multiple_images_in_last_message_all_extracted(self):
        """All images attached to the last user turn are returned, in order."""
        conversation = [
            self._user_with_images("first image", self.JPEG_URL),
            self._assistant("Described first image."),
            self._user_with_images("compare these two", self.JPEG_URL, self.PNG_URL),
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        # Both images of the final turn.
        assert len(images) == 2
        assert images[0][0].endswith(".jpeg")
        assert images[1][0].endswith(".png")
        # Text from all turns.
        assert "first image" in prompt
        assert "Described first image" in prompt
        assert "compare these two" in prompt

    def test_assistant_response_preserved_in_text_context(self):
        """The assistant's earlier description stays in the prompt for follow-ups."""
        conversation = [
            self._user_with_images("describe this picture", self.JPEG_URL),
            self._assistant("The image shows a garden with flowers."),
            self._user_text("What country is this garden in?"),
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        # No image in the last turn.
        assert len(images) == 0
        # The follow-up question only makes sense with the assistant's reply.
        assert "The image shows a garden with flowers." in prompt
        assert "What country is this garden in?" in prompt

    def test_three_sequential_images_extracts_only_third(self):
        """With three image turns, only the third turn's image is returned."""
        conversation = [
            self._user_with_images("Image 1", self.JPEG_URL),
            self._assistant("Description 1"),
            self._user_with_images("Image 2", self.JPEG_URL),
            self._assistant("Description 2"),
            self._user_with_images("Image 3", self.PNG_URL),
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        # Only the PNG from the third user turn.
        assert len(images) == 1
        assert images[0][0].endswith(".png")
        # Every turn's text survives in the prompt.
        for fragment in ("Image 1", "Description 1", "Image 2", "Description 2", "Image 3"):
            assert fragment in prompt

    def test_last_user_message_with_string_content_no_images(self):
        """A final user turn with string (not array) content returns no images."""
        conversation = [
            self._user_with_images("describe", self.JPEG_URL),
            self._assistant("A picture."),
            self._user_text("Thanks!"),  # String content, not array
        ]

        prompt, images, _audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        # String content cannot carry image parts.
        assert len(images) == 0
        # Text context preserved.
        assert "describe" in prompt
        assert "A picture." in prompt
        assert "Thanks!" in prompt
class TestAssignImageIdsFromHistory:
    """
    History-based image ID assignment (Session 32: Option D).

    The conversation history itself acts as the session, so no server-side
    state is involved. IDs are handed out chronologically and keyed by a
    content hash, which makes repeated uploads of the same image share an ID.
    """

    @staticmethod
    def _text(value):
        """Return a text content part."""
        return {"type": "text", "text": value}

    @staticmethod
    def _jpeg():
        """Return a fresh image part holding the JPEG fixture."""
        return {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{VALID_JPEG_B64}"}}

    @staticmethod
    def _png():
        """Return a fresh image part holding the PNG fixture."""
        return {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{VALID_PNG_B64}"}}

    @staticmethod
    def _user(content):
        """Return a user message with the given content (string or part list)."""
        return {"role": "user", "content": content}

    @staticmethod
    def _assistant(content):
        """Return an assistant message with the given content (string or part list)."""
        return {"role": "assistant", "content": content}

    def test_single_image_gets_id_1(self):
        """The only image in a first request is assigned ID 1."""
        history = [
            self._user([self._text("describe"), self._jpeg()]),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        assert len(ids) == 1
        # The single entry maps to 1.
        assert list(ids.values()) == [1]

    def test_sequential_images_get_sequential_ids(self):
        """Different images in successive user turns get IDs 1 and 2."""
        history = [
            self._user([self._text("describe"), self._jpeg()]),
            self._assistant("A cat."),
            self._user([self._text("describe"), self._png()]),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # Image 1 (JPEG) and Image 2 (PNG).
        assert len(ids) == 2
        assert sorted(ids.values()) == [1, 2]

    def test_deduplication_same_image_same_id(self):
        """Uploading the same image again reuses its ID instead of adding one."""
        history = [
            self._user([self._text("first upload"), self._jpeg()]),
            self._assistant("A cat."),
            self._user([self._text("upload same image again"), self._jpeg()]),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # Same hash, same ID: a single entry.
        assert len(ids) == 1
        assert list(ids.values()) == [1]

    def test_text_only_messages_skipped(self):
        """Text-only turns around an image turn do not influence the IDs."""
        history = [
            self._user("Hello"),
            self._assistant("Hi there!"),
            self._user([self._text("describe"), self._jpeg()]),
            self._assistant("A cat."),
            self._user("Thanks!"),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # Exactly one image appears in the history.
        assert len(ids) == 1
        assert list(ids.values()) == [1]

    def test_empty_messages_returns_empty_map(self):
        """An empty history produces an empty mapping."""
        history = []

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        assert ids == {}

    def test_no_images_returns_empty_map(self):
        """A history without any image produces an empty mapping."""
        history = [
            self._user("Hello"),
            self._assistant("Hi!"),
            self._user("How are you?"),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        assert ids == {}

    def test_multiple_images_in_single_message(self):
        """Two images inside one message get IDs 1 and 2."""
        history = [
            self._user([self._text("compare these"), self._jpeg(), self._png()]),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        assert len(ids) == 2
        assert sorted(ids.values()) == [1, 2]

    def test_assistant_messages_ignored(self):
        """Image parts inside an assistant message do not receive an ID."""
        history = [
            self._user([self._text("describe"), self._jpeg()]),
            self._assistant([self._text("Here's an image:"), self._png()]),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # Only the user's image counts.
        assert len(ids) == 1
        assert list(ids.values()) == [1]

    def test_chronological_order_preserved(self):
        """IDs follow first appearance; a repeated image keeps its original ID."""
        import hashlib

        history = [
            self._user([self._jpeg()]),
            self._assistant("First image."),
            self._user([self._png()]),
            self._assistant("Second image."),
            self._user([self._jpeg()]),
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # JPEG first (ID 1), PNG second (ID 2); the third turn repeats the
        # JPEG and therefore adds no new entry.
        assert len(ids) == 2

        # Keys are the first 8 hex digits of the SHA-256 of the decoded bytes.
        jpeg_key = hashlib.sha256(base64.b64decode(VALID_JPEG_B64)).hexdigest()[:8]
        png_key = hashlib.sha256(base64.b64decode(VALID_PNG_B64)).hexdigest()[:8]
        assert ids[jpeg_key] == 1  # JPEG first
        assert ids[png_key] == 2  # PNG second
class TestMappingTableParsing:
    """
    Parsing of filename mapping tables found in assistant responses.

    Clients may drop Base64 data from the history to save storage; the
    mapping table the server wrote into its own text output is then what
    keeps Image IDs continuous across requests.
    """

    # Fixed pieces of the mapping table used to assemble the fixtures.
    _MARKER = "<!-- mlxk:filenames -->"
    _HEADER = ("| Image | Filename |", "|-------|----------|")

    @staticmethod
    def _block(*rows):
        """Join rows with newlines and terminate the text with a final newline."""
        return "\n".join(rows) + "\n"

    def test_parse_simple_mapping_table(self):
        """A marked two-row table maps each filename hash to its image number."""
        text = self._block(
            "A sandy beach with blue water.",
            self._MARKER,
            *self._HEADER,
            "| 1 | image_5733332c.jpeg |",
            "| 2 | image_49779094.jpeg |",
        )

        mapping = VisionHTTPAdapter._parse_filename_mapping(text)

        assert len(mapping) == 2
        assert mapping["5733332c"] == 1
        assert mapping["49779094"] == 2

    def test_parse_mapping_table_with_original_filenames(self):
        """A trailing ``(original name)`` in the filename cell does not disturb parsing."""
        text = self._block(
            "Description here.",
            self._MARKER,
            *self._HEADER,
            "| 1 | image_5733332c.jpeg (beach.jpg) |",
            "| 2 | image_49779094.jpeg (mountain.png) |",
        )

        mapping = VisionHTTPAdapter._parse_filename_mapping(text)

        assert len(mapping) == 2
        assert mapping["5733332c"] == 1
        assert mapping["49779094"] == 2

    def test_parse_empty_content(self):
        """Text without any table parses to an empty mapping."""
        text = "Just some regular text without any mapping."

        mapping = VisionHTTPAdapter._parse_filename_mapping(text)

        assert mapping == {}

    def test_parse_table_without_marker_still_parses(self):
        """The low-level parser extracts rows even when the marker is absent."""
        # Deliberately no <!-- mlxk:filenames --> marker: _parse_filename_mapping()
        # only parses rows; checking for the marker is left to
        # assign_image_ids_from_history().
        text = self._block(
            "Some description.",
            *self._HEADER,
            "| 1 | image_5733332c.jpeg |",
        )

        mapping = VisionHTTPAdapter._parse_filename_mapping(text)

        assert mapping == {"5733332c": 1}

    def test_history_without_marker_ignored(self):
        """``assign_image_ids_from_history()`` ignores a table that lacks the marker."""
        # Deliberately no <!-- mlxk:filenames --> marker in the reply.
        unmarked_reply = self._block(
            "Beach.",
            *self._HEADER,
            "| 1 | image_aaaaaaaa.jpeg |",
        )
        history = [
            {"role": "user", "content": "describe"},
            {"role": "assistant", "content": unmarked_reply},
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # Without the marker the table must not contribute any ID.
        assert ids == {}

    def test_history_with_mapping_table_no_base64(self):
        """IDs are rebuilt from a mapping table when the Base64 data is gone."""
        import hashlib

        # Reference keys: first 8 hex digits of the SHA-256 of the decoded bytes.
        jpeg_key = hashlib.sha256(base64.b64decode(VALID_JPEG_B64)).hexdigest()[:8]
        png_key = hashlib.sha256(base64.b64decode(VALID_PNG_B64)).hexdigest()[:8]

        first_reply = self._block(
            "A sandy beach.",
            self._MARKER,
            *self._HEADER,
            f"| 1 | image_{jpeg_key}.jpeg |",
        )
        # The client is assumed to have dropped the Base64 of the first image.
        history = [
            # Request 1 (vision): image data no longer present in the history.
            {"role": "user", "content": "describe this picture"},
            {"role": "assistant", "content": first_reply},
            # Request 2 (text): no images.
            {"role": "user", "content": "What color?"},
            {"role": "assistant", "content": "Blue."},
            # Request 3 (vision): a new image arrives with its Base64 data.
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "describe this new picture"},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{VALID_PNG_B64}"}},
                ],
            },
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # ID 1 comes from the table, ID 2 from the newly supplied image.
        assert len(ids) == 2
        assert ids[jpeg_key] == 1  # From mapping table
        assert ids[png_key] == 2  # From new image_url

    def test_multiple_mapping_tables_in_history(self):
        """Several marked tables across a conversation yield the combined mapping."""
        first_reply = self._block(
            "Beach.",
            self._MARKER,
            *self._HEADER,
            "| 1 | image_aaaaaaaa.jpeg |",
        )
        second_reply = self._block(
            "Mountain.",
            self._MARKER,
            *self._HEADER,
            "| 1 | image_aaaaaaaa.jpeg |",
            "| 2 | image_bbbbbbbb.jpeg |",
        )
        history = [
            {"role": "user", "content": "describe"},
            {"role": "assistant", "content": first_reply},
            {"role": "user", "content": "describe another"},
            {"role": "assistant", "content": second_reply},
        ]

        ids = VisionHTTPAdapter.assign_image_ids_from_history(history)

        # The second table lists both images; both must be present.
        assert len(ids) == 2
        assert ids["aaaaaaaa"] == 1
        assert ids["bbbbbbbb"] == 2
class TestAudioLimits:
    """
    Audio handling in ``parse_openai_messages`` (F-02).

    At most one audio part per request is accepted (attributed to an mlx-vlm
    limitation).
    """

    # Minimal 44-byte WAV header with an empty data chunk, Base64-encoded.
    VALID_WAV_B64 = base64.b64encode(
        b"".join([
            b'RIFF', b'\x24\x00\x00\x00',  # ChunkSize
            b'WAVE',
            b'fmt ', b'\x10\x00\x00\x00',  # Subchunk1Size
            b'\x01\x00',  # AudioFormat (PCM)
            b'\x01\x00',  # NumChannels (1)
            b'\x44\xac\x00\x00',  # SampleRate (44100)
            b'\x88\x58\x01\x00',  # ByteRate
            b'\x02\x00',  # BlockAlign
            b'\x10\x00',  # BitsPerSample
            b'data', b'\x00\x00\x00\x00',  # Subchunk2Size (0 = empty)
        ])
    ).decode()

    def _audio_part(self, fmt):
        """Return an ``input_audio`` part carrying the WAV fixture, labelled *fmt*."""
        return {"type": "input_audio", "input_audio": {"data": self.VALID_WAV_B64, "format": fmt}}

    @staticmethod
    def _user(*parts):
        """Return a one-message conversation whose user content is *parts*."""
        return [{"role": "user", "content": list(parts)}]

    def test_single_audio_allowed(self):
        """A request with exactly one audio part is accepted."""
        conversation = self._user(
            {"type": "text", "text": "transcribe"},
            self._audio_part("wav"),
        )

        prompt, images, audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        assert len(audio) == 1
        assert audio[0][0].endswith(".wav")

    def test_multi_audio_raises_error(self):
        """Two audio parts in one request are rejected (F-02)."""
        conversation = self._user(
            {"type": "text", "text": "transcribe"},
            self._audio_part("wav"),
            self._audio_part("wav"),
        )

        with pytest.raises(ValueError) as excinfo:
            VisionHTTPAdapter.parse_openai_messages(conversation)

        # Either wording is accepted for the rejection message.
        message = str(excinfo.value).lower()
        assert "1 audio per request" in message or "only 1" in message

    def test_audio_with_images_allowed_in_adapter(self):
        """Audio and image parts together pass the adapter unchanged."""
        conversation = self._user(
            {"type": "text", "text": "describe"},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{VALID_JPEG_B64}"}},
            self._audio_part("wav"),
        )

        # The adapter returns both; any filtering is the server's job (F-03).
        prompt, images, audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        assert len(images) == 1
        assert len(audio) == 1

    def test_mpeg_format_accepted(self):
        """The ``mpeg`` format label is accepted and yields an ``.mp3`` name (F-08)."""
        conversation = self._user(
            {"type": "text", "text": "transcribe"},
            self._audio_part("mpeg"),
        )

        prompt, images, audio = VisionHTTPAdapter.parse_openai_messages(conversation)

        assert len(audio) == 1
        # "mpeg" is expected to be normalised to the .mp3 extension.
        assert audio[0][0].endswith(".mp3")