feat: Experimental Voxtral audio support

Add experimental support for Voxtral audio models:
- Add "voxtral" to AUDIO_MODEL_TYPES
- Increase audio limit: 5MB → 50MB (CLI + Server)
  - Gemma-3n: ~30s duration (188 tokens @ 6.25 tokens/s)
  - Voxtral: >10min duration (>2000 tokens)
  - Token count is the real constraint; file size is only a sanity check

Requirements:
- mlx-vlm with Voxtral support (upstream merge pending)
- mistral-common for tekken tokenizer

Status: EXPERIMENTAL - pending mlx-vlm upstream integration
This commit is contained in:
The BROKE Cluster Team
2026-01-26 17:20:50 +01:00
parent e8b10ea10b
commit 8c873530da
3 changed files with 10 additions and 5 deletions
+5 -3
View File
@@ -498,9 +498,11 @@ def main():
print_result(result, None, True if args.json else False)
sys.exit(1)
data = aud_path.read_bytes()
# 5MB limit for audio (~2-3 min at 16kHz mono; token count is the real constraint)
if len(data) > 5 * 1024 * 1024:
result = handle_error("CommandError", f"Audio file too large (>5MB): {audio_path}")
# 50MB limit for audio (~27 min of uncompressed 16-bit PCM at 16kHz mono; longer for compressed formats)
# Note: Gemma-3n ~30s (token limit), Voxtral >10min (larger token capacity)
# Token count is the real constraint, file size is just a sanity check
if len(data) > 50 * 1024 * 1024:
result = handle_error("CommandError", f"Audio file too large (>50MB): {audio_path}")
print_result(result, None, True if args.json else False)
sys.exit(1)
audio_inputs.append((aud_path.name, data))
+1
View File
@@ -80,6 +80,7 @@ VISION_MODEL_TYPES = frozenset({
# Model-type identifiers grouped as audio-capable (going by the constant's
# name; how the set is consumed is not visible in this hunk).
# A frozenset keeps the collection immutable after import.
AUDIO_MODEL_TYPES = frozenset({
"gemma3n", # Google Gemma 3n (Vision + Audio + Text)
"gemma3n_audio", # Audio encoder subcomponent
"voxtral", # Voxtral mini (Audio + Text) - EXPERIMENTAL (pre-mlx-vlm merge)
})
+4 -2
View File
@@ -23,8 +23,10 @@ MAX_SAFE_CHUNK_SIZE = 5 # Empirically tested stable (5 images @ ~50MB total)
SUPPORTED_MIME_TYPES = frozenset({"jpeg", "jpg", "png", "gif", "webp"})
# Audio limits (ADR-019 Phase 4)
# 5MB limit matches CLI (~2-3 min at 16kHz mono, token count is the real constraint)
MAX_AUDIO_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB per audio file
# 50MB limit for audio (~27 min of uncompressed 16-bit PCM at 16kHz mono; longer for compressed formats)
# Note: Gemma-3n ~30s (token limit), Voxtral >10min (larger token capacity)
# Token count is the real constraint, file size is just a sanity check
MAX_AUDIO_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per audio file
# mlx-vlm natively supports WAV and MP3 (verified in mlx-vlm README)
SUPPORTED_AUDIO_FORMATS = frozenset({"wav", "mp3", "mpeg"})