mirror of
https://github.com/cloudstack-llc/mlx-knife.git
synced 2026-07-01 20:44:14 -04:00
feat: Experimental Voxtral audio support
Add experimental support for Voxtral audio models:
- Add "voxtral" to AUDIO_MODEL_TYPES
- Increase audio limit: 5MB → 50MB (CLI + Server)
- Gemma-3n: ~30s duration (188 tokens @ 6.25 tokens/s)
- Voxtral: >10min duration (>2000 tokens)
- Token count is the real constraint, file size is a sanity check

Requirements:
- mlx-vlm with Voxtral support (upstream merge pending)
- mistral-common for tekken tokenizer

Status: EXPERIMENTAL - pending mlx-vlm upstream integration
This commit is contained in:
+5
-3
@@ -498,9 +498,11 @@ def main():
|
||||
print_result(result, None, True if args.json else False)
|
||||
sys.exit(1)
|
||||
data = aud_path.read_bytes()
|
||||
# 5MB limit for audio (~2-3 min at 16kHz mono; token count is the real constraint)
|
||||
if len(data) > 5 * 1024 * 1024:
|
||||
result = handle_error("CommandError", f"Audio file too large (>5MB): {audio_path}")
|
||||
# 50MB limit for audio (~25-30 min at 16kHz mono, assuming uncompressed 16-bit PCM)
|
||||
# Note: Gemma-3n ~30s (token limit), Voxtral >10min (larger token capacity)
|
||||
# Token count is the real constraint, file size is just a sanity check
|
||||
if len(data) > 50 * 1024 * 1024:
|
||||
result = handle_error("CommandError", f"Audio file too large (>50MB): {audio_path}")
|
||||
print_result(result, None, True if args.json else False)
|
||||
sys.exit(1)
|
||||
audio_inputs.append((aud_path.name, data))
|
||||
|
||||
@@ -80,6 +80,7 @@ VISION_MODEL_TYPES = frozenset({
|
||||
AUDIO_MODEL_TYPES = frozenset({
|
||||
"gemma3n", # Google Gemma 3n (Vision + Audio + Text)
|
||||
"gemma3n_audio", # Audio encoder subcomponent
|
||||
"voxtral", # Voxtral mini (Audio + Text) - EXPERIMENTAL (pre-mlx-vlm merge)
|
||||
})
|
||||
|
||||
|
||||
|
||||
@@ -23,8 +23,10 @@ MAX_SAFE_CHUNK_SIZE = 5 # Empirically tested stable (5 images @ ~50MB total)
|
||||
SUPPORTED_MIME_TYPES = frozenset({"jpeg", "jpg", "png", "gif", "webp"})
|
||||
|
||||
# Audio limits (ADR-019 Phase 4)
|
||||
# 5MB limit matches CLI (~2-3 min at 16kHz mono, token count is the real constraint)
|
||||
MAX_AUDIO_SIZE_BYTES = 5 * 1024 * 1024 # 5 MB per audio file
|
||||
# 50MB limit for audio (~25-30 min at 16kHz mono, assuming uncompressed 16-bit PCM)
|
||||
# Note: Gemma-3n ~30s (token limit), Voxtral >10min (larger token capacity)
|
||||
# Token count is the real constraint, file size is just a sanity check
|
||||
MAX_AUDIO_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per audio file
|
||||
# mlx-vlm natively supports WAV and MP3 (verified in mlx-vlm README)
|
||||
SUPPORTED_AUDIO_FORMATS = frozenset({"wav", "mp3", "mpeg"})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user