Files
mlx-knife/mlxk2/tools/vision_adapter.py
T
The BROKE Cluster Team bf7480d042 Release 2.0.4-beta.9: Audio transcription via mlx-audio
Major Features:
- Audio transcription via mlx-audio backend (Whisper, >10min duration)
- OpenAI /v1/audio/transcriptions endpoint
- Memory Gate System (Vision: 8GB, Audio: 4GB)
- Config-based backend routing (ADR-020)
- Benchmark toolchain (memmon/memplot, Schema v0.2.2)

Key Fixes:
- EuroLLM tokenizer decoding
- Vision-model text-only routing regression
- Multimodal model context length detection
- Memory cleanup bug (mx.metal.clear_cache)
- Orphan process bug

Test Results:
- Unit tests: 647 passed, 11 skipped (Python 3.10-3.12)
- wet-umbrella: 171 passed total

See CHANGELOG.md for complete details and known issues.
2026-02-04 03:10:30 +01:00

490 lines
18 KiB
Python

"""
Vision/Audio HTTP adapter for converting OpenAI-compatible requests to VisionRunner format.
This module handles Base64 image/audio decoding and OpenAI message format parsing
for the server Vision/Audio API (ADR-012 Phase 3, ADR-019 Phase 4).
"""
from __future__ import annotations
import base64
import hashlib
import re
from typing import Any, Dict, List, Optional, Tuple
# No project-specific exception imports needed - errors are raised as the built-in ValueError
# Vision request limits (safety and resource management).
# The per-image cap prevents Metal OOM crashes (ADR-012 Phase 3).
# The number of images is NOT capped here - chunking (MAX_SAFE_CHUNK_SIZE)
# handles batch safety.
MAX_IMAGE_SIZE_BYTES = 20 * 1024 ** 2  # 20 MB per image (Metal API limit)
MAX_TOTAL_IMAGE_SIZE_BYTES = 50 * 1024 ** 2  # 50 MB total (Metal OOM prevention)
MAX_SAFE_CHUNK_SIZE = 5  # Empirically tested stable (5 images @ ~50MB total)
SUPPORTED_MIME_TYPES = frozenset(("jpeg", "jpg", "png", "gif", "webp"))

# Audio limits (ADR-019 Phase 4).
# 50 MB cap for audio (~15 min at 16kHz mono).
# Note: Gemma-3n ~30s (token limit), Voxtral >10min (larger token capacity).
# Token count is the real constraint; the file size is only a sanity check.
MAX_AUDIO_SIZE_BYTES = 50 * 1024 ** 2  # 50 MB per audio file
# mlx-vlm natively supports WAV and MP3 (verified in mlx-vlm README).
# "mpeg" is accepted as an alias that the decoder normalizes to "mp3".
SUPPORTED_AUDIO_FORMATS = frozenset(("wav", "mp3", "mpeg"))
class VisionHTTPAdapter:
    """Adapter for converting OpenAI Vision API format to VisionRunner format.

    All methods are static; the class only serves as a namespace.
    Malformed request content is reported as ``ValueError``; the
    best-effort history helpers return ``None``/skip entries instead.
    """

    # Accepted data-URL shape: data:image/<type>;base64,<payload>
    _DATA_URL_RE = re.compile(
        r"^data:image/(jpeg|jpg|png|gif|webp);base64,(.+)$", re.IGNORECASE
    )
    # One row of the server-generated filename table: "| 1 | image_5733332c.jpeg |"
    _FILENAME_ROW_RE = re.compile(r"\|\s*(\d+)\s*\|\s*image_([a-f0-9]{8})\.")

    @staticmethod
    def parse_openai_messages(
        messages: List[Dict[str, Any]]
    ) -> Tuple[str, List[Tuple[str, bytes]], List[Tuple[str, bytes]]]:
        """
        Parse OpenAI-style messages and extract text prompt + images + audio.

        OpenAI Vision API format:
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
                        {"type": "input_audio", "input_audio": {"data": "base64...", "format": "wav"}}
                    ]
                }
            ]

        Important: Images/audio are extracted ONLY from the most recent user message.
        Text context is extracted from ALL messages (including assistant responses).
        This follows OpenAI Vision API behavior where previous media remain in
        history for text context but are NOT sent as visual/audio input to the model.

        Args:
            messages: List of message dicts (OpenAI format)

        Returns:
            (prompt, images, audio) tuple where:
            - prompt: str - Combined text from all text content blocks
            - images: List[Tuple[str, bytes]] - List of (filename, raw_bytes) tuples
            - audio: List[Tuple[str, bytes]] - List of (filename, raw_bytes) tuples

        Raises:
            ValueError: If message format is invalid (including non-string
                text/url/format values), media cannot be decoded, or a size or
                count limit is exceeded

        See: docs/ISSUES/VISION-SEQUENTIAL-IMAGES-ISSUE.md
        """
        if not messages:
            raise ValueError("Messages list cannot be empty")

        # Text is validated/collected first so structural errors surface
        # before any (potentially large) media payload is decoded.
        text_parts = VisionHTTPAdapter._collect_text_parts(messages)
        images, audio = VisionHTTPAdapter._extract_latest_user_media(messages)

        # Validate image size limits (total size only - count is unlimited,
        # chunking handles batch safety)
        if images:
            total_size = sum(len(data) for _, data in images)
            if total_size > MAX_TOTAL_IMAGE_SIZE_BYTES:
                size_mb = total_size / (1024 * 1024)
                limit_mb = MAX_TOTAL_IMAGE_SIZE_BYTES / (1024 * 1024)
                raise ValueError(
                    f"Total image size ({size_mb:.1f} MB) exceeds limit ({limit_mb:.0f} MB)"
                )

        # Validate audio limits (F-02: Only 1 audio per request, mlx-vlm limitation)
        if len(audio) > 1:
            raise ValueError(
                f"Only 1 audio per request (mlx-vlm limitation). Got: {len(audio)}"
            )

        prompt = " ".join(text_parts).strip()

        # Validation: Must have either text or images or audio
        if not prompt and not images and not audio:
            raise ValueError(
                "Request must contain at least text, images, or audio"
            )

        # Default prompts based on media type
        if not prompt:
            if images:
                prompt = "Describe the image."
            elif audio:
                prompt = "Transcribe what is spoken in this audio."

        return prompt, images, audio

    @staticmethod
    def _collect_text_parts(messages: List[Dict[str, Any]]) -> List[str]:
        """Collect the text of ALL messages (string content and "text" items), in order."""
        text_parts: List[str] = []
        for msg in messages:
            if not isinstance(msg, dict):
                raise ValueError("Each message must be a dict")

            content = msg.get("content")
            if content is None:
                continue

            # Simple text message
            if isinstance(content, str):
                text_parts.append(content)
                continue

            # Array content (Vision API format with text + media items)
            if not isinstance(content, list):
                raise ValueError(
                    f"Message content must be string or array, got {type(content).__name__}"
                )

            for item in content:
                if not isinstance(item, dict):
                    raise ValueError(
                        "Each content item must be a dict with 'type' field"
                    )
                if item.get("type") != "text":
                    continue
                text = item.get("text", "")
                if not text:
                    # Missing/empty/null text is silently skipped
                    continue
                if not isinstance(text, str):
                    # Without this check a non-string would only fail later,
                    # as a TypeError when the parts are joined.
                    raise ValueError("text content must have 'text' string")
                text_parts.append(text)
        return text_parts

    @staticmethod
    def _extract_latest_user_media(
        messages: List[Dict[str, Any]]
    ) -> Tuple[List[Tuple[str, bytes]], List[Tuple[str, bytes]]]:
        """Decode image_url/input_audio items of the most recent user message only."""
        images: List[Tuple[str, bytes]] = []
        audio: List[Tuple[str, bytes]] = []

        latest_user = next(
            (
                msg
                for msg in reversed(messages)
                if isinstance(msg, dict) and msg.get("role") == "user"
            ),
            None,
        )
        if latest_user is None:
            return images, audio

        content = latest_user.get("content")
        if not isinstance(content, list):
            # Last user message has no media (text-only follow-up)
            return images, audio

        for item in content:
            if not isinstance(item, dict):
                continue
            item_type = item.get("type")

            if item_type == "image_url":
                image_url_obj = item.get("image_url")
                if not isinstance(image_url_obj, dict):
                    raise ValueError(
                        "image_url content must have 'image_url' dict"
                    )
                url = image_url_obj.get("url", "")
                if not url:
                    raise ValueError(
                        "image_url.url cannot be empty"
                    )
                # Decode base64 image (validates type and per-image size)
                images.append(VisionHTTPAdapter.decode_base64_image(url))

            elif item_type == "input_audio":
                # OpenAI input_audio format (ADR-019 Phase 4)
                input_audio_obj = item.get("input_audio")
                if not isinstance(input_audio_obj, dict):
                    raise ValueError(
                        "input_audio content must have 'input_audio' dict"
                    )
                data = input_audio_obj.get("data", "")
                fmt = input_audio_obj.get("format", "wav")
                if not data:
                    raise ValueError(
                        "input_audio.data cannot be empty"
                    )
                # Decode base64 audio (validates size and format)
                audio.append(VisionHTTPAdapter.decode_base64_audio(data, fmt))

        return images, audio

    @staticmethod
    def decode_base64_image(url: str) -> Tuple[str, bytes]:
        """
        Decode a Base64-encoded image from a data URL.

        Supports data URLs like:
        - data:image/jpeg;base64,/9j/4AAQSkZJRg...
        - data:image/png;base64,iVBORw0KGgo...

        Args:
            url: Data URL string with base64-encoded image

        Returns:
            (filename, raw_bytes) tuple where:
            - filename: Generated name based on content hash (e.g., "image_a1b2c3d4.jpeg")
            - raw_bytes: Decoded image bytes

        Raises:
            ValueError: If url is not a string data URL of a supported image
                type, base64 decoding fails, the payload is empty, or the
                decoded image exceeds MAX_IMAGE_SIZE_BYTES
        """
        # Reject non-strings here too: the value comes straight from request JSON.
        if not isinstance(url, str) or not url.startswith("data:"):
            raise ValueError(
                "Only data URLs are supported (e.g., data:image/jpeg;base64,...). "
                "External URLs are not supported."
            )

        # Parse data URL: data:image/jpeg;base64,<data>
        match = VisionHTTPAdapter._DATA_URL_RE.match(url)
        if not match:
            raise ValueError(
                "Invalid data URL format. Expected: data:image/<type>;base64,<data>. "
                f"Supported types: {', '.join(sorted(SUPPORTED_MIME_TYPES))}"
            )

        mime_type = match.group(1).lower()
        base64_data = match.group(2)

        # Normalize MIME type
        if mime_type == "jpg":
            mime_type = "jpeg"

        # Validate MIME type
        if mime_type not in SUPPORTED_MIME_TYPES:
            raise ValueError(
                f"Unsupported image type: {mime_type}. "
                f"Supported types: {', '.join(sorted(SUPPORTED_MIME_TYPES))}"
            )

        # Decode base64 (binascii.Error is a ValueError subclass)
        try:
            raw_bytes = base64.b64decode(base64_data, validate=True)
        except (ValueError, TypeError) as e:
            raise ValueError(
                f"Failed to decode base64 image data: {e}"
            ) from e

        if not raw_bytes:
            raise ValueError("Decoded image data is empty")

        # Enforce size limit
        if len(raw_bytes) > MAX_IMAGE_SIZE_BYTES:
            size_mb = len(raw_bytes) / (1024 * 1024)
            limit_mb = MAX_IMAGE_SIZE_BYTES / (1024 * 1024)
            raise ValueError(
                f"Image size ({size_mb:.1f} MB) exceeds limit ({limit_mb:.0f} MB)"
            )

        # Generate deterministic filename from content hash
        content_hash = hashlib.sha256(raw_bytes).hexdigest()[:8]
        filename = f"image_{content_hash}.{mime_type}"
        return filename, raw_bytes

    @staticmethod
    def decode_base64_audio(data: str, fmt: str) -> Tuple[str, bytes]:
        """
        Decode Base64-encoded audio data.

        OpenAI input_audio format:
            {
                "type": "input_audio",
                "input_audio": {
                    "data": "<base64-encoded-audio>",
                    "format": "wav"  # or "mp3"
                }
            }

        Args:
            data: Base64-encoded audio data (without data URL prefix)
            fmt: Audio format ("wav", "mp3"; "mpeg" is normalized to "mp3")

        Returns:
            (filename, raw_bytes) tuple where:
            - filename: Generated name based on content hash (e.g., "audio_a1b2c3d4.wav")
            - raw_bytes: Decoded audio bytes

        Raises:
            ValueError: If format is not a string or is unsupported, base64
                decoding fails, the payload is empty, or the decoded audio
                exceeds MAX_AUDIO_SIZE_BYTES
        """
        # The format comes from request JSON and may be null or a number.
        if not isinstance(fmt, str):
            raise ValueError(
                f"input_audio.format must be a string, got {type(fmt).__name__}"
            )

        # Normalize format (audio/mpeg -> mp3)
        fmt_lower = fmt.lower()
        if fmt_lower == "mpeg":
            fmt_lower = "mp3"

        # Validate format
        if fmt_lower not in SUPPORTED_AUDIO_FORMATS:
            raise ValueError(
                f"Unsupported audio format: {fmt}. "
                f"Supported formats: {', '.join(sorted(SUPPORTED_AUDIO_FORMATS))}"
            )

        # Decode base64; TypeError covers non-string/bytes payloads
        try:
            raw_bytes = base64.b64decode(data, validate=True)
        except (ValueError, TypeError) as e:
            raise ValueError(
                f"Failed to decode base64 audio data: {e}"
            ) from e

        if not raw_bytes:
            raise ValueError("Decoded audio data is empty")

        # Enforce size limit
        if len(raw_bytes) > MAX_AUDIO_SIZE_BYTES:
            size_mb = len(raw_bytes) / (1024 * 1024)
            limit_mb = MAX_AUDIO_SIZE_BYTES / (1024 * 1024)
            raise ValueError(
                f"Audio size ({size_mb:.1f} MB) exceeds limit ({limit_mb:.0f} MB)"
            )

        # Generate deterministic filename from content hash
        content_hash = hashlib.sha256(raw_bytes).hexdigest()[:8]
        filename = f"audio_{content_hash}.{fmt_lower}"
        return filename, raw_bytes

    @staticmethod
    def assign_image_ids_from_history(messages: List[Dict[str, Any]]) -> Dict[str, int]:
        """
        Assign stable image IDs from conversation history.

        Scans all messages chronologically and assigns sequential IDs based on
        content hash. This enables stable "Image 1, Image 2, ..." numbering
        across multiple requests in a conversation.

        The conversation history IS the session - no server-side state needed.
        This is 100% OpenAI API compatible.

        Strategy:
        1. Scan assistant messages for filename mapping tables (server-generated)
        2. Scan user messages for image_url content (current request)
        3. Combine both to build complete hash->ID mapping

        This allows clients to drop Base64 data from history (storage optimization)
        while preserving Image ID continuity via the server's own text output.

        Malformed entries (non-dict messages/items, missing or undecodable
        URLs) are skipped rather than raising.

        Args:
            messages: List of message dicts (OpenAI format, full history)

        Returns:
            Dict mapping content_hash (8 chars) -> image_id (1-based)
            Example: {"5c691ddb": 1, "aaad16ca": 2}

        Behavior:
        - Request 1: beach.jpg -> Image 1
        - Request 2: beach.jpg + mountain.jpg in history -> Image 1, Image 2
        - Re-upload beach.jpg -> Still Image 1 (hash match = deduplication)
        """
        seen_hashes: Dict[str, int] = {}
        next_id = 1

        for msg in messages:
            if not isinstance(msg, dict):
                continue
            role = msg.get("role")
            content = msg.get("content")

            if role == "assistant" and isinstance(content, str):
                # Parse server-generated mapping tables: "| 1 | image_5733332c.jpeg |"
                # Only parse if the mlxk marker is present (avoids false positives)
                if "<!-- mlxk:filenames -->" not in content:
                    continue
                parsed_hashes = VisionHTTPAdapter._parse_filename_mapping(content)
                for hash_val, img_id in parsed_hashes.items():
                    if hash_val in seen_hashes:
                        continue
                    # NOTE(review): table IDs are trusted as-is; a clash with an
                    # ID already given to a different hash is not detected.
                    seen_hashes[hash_val] = img_id
                    # Keep freshly assigned IDs above every ID seen in tables
                    next_id = max(next_id, img_id + 1)

            elif role == "user":
                if not isinstance(content, list):
                    # String content = text-only message, skip
                    continue
                for item in content:
                    if not isinstance(item, dict) or item.get("type") != "image_url":
                        continue
                    image_url_obj = item.get("image_url")
                    if not isinstance(image_url_obj, dict):
                        continue
                    url = image_url_obj.get("url", "")
                    if not url:
                        continue
                    # Compute content hash from base64 data (None if unusable)
                    content_hash = VisionHTTPAdapter._compute_content_hash(url)
                    if content_hash and content_hash not in seen_hashes:
                        seen_hashes[content_hash] = next_id
                        next_id += 1

        return seen_hashes

    @staticmethod
    def _parse_filename_mapping(content: str) -> Dict[str, int]:
        """
        Parse filename mapping table from assistant response text.

        Extracts hash->ID mappings from server-generated tables like:
            <!-- mlxk:filenames -->
            | Image | Filename |
            |-------|----------|
            | 1 | image_5733332c.jpeg |
            | 2 | image_49779094.jpeg |

        Args:
            content: Assistant message content (string)

        Returns:
            Dict mapping content_hash (8 chars) -> image_id (1-based)
            Example: {"5733332c": 1, "49779094": 2}
            If a hash appears in several rows, the last row wins.
        """
        return {
            hash_val: int(img_id_str)
            for img_id_str, hash_val in VisionHTTPAdapter._FILENAME_ROW_RE.findall(content)
        }

    @staticmethod
    def _compute_content_hash(url: str) -> Optional[str]:
        """
        Compute content hash from a data URL.

        Only the "data:" prefix and the base64 payload after the first comma
        are checked; the MIME type is not validated here.

        Args:
            url: Data URL string (data:image/jpeg;base64,...)

        Returns:
            8-character hash string (sha256 prefix of the decoded bytes), or
            None if url is not a string, is not a data URL, or cannot be decoded
        """
        # The isinstance guard keeps the "return None on invalid" contract
        # for non-string values taken from request JSON.
        if not isinstance(url, str) or not url.startswith("data:"):
            return None

        # Extract base64 data after the comma
        try:
            _, base64_data = url.split(",", 1)
            raw_bytes = base64.b64decode(base64_data, validate=True)
        except (ValueError, TypeError):
            # No comma in the URL, or payload is not valid base64
            return None
        return hashlib.sha256(raw_bytes).hexdigest()[:8]