From e021fb32cd66e1db8f95477619898bb2b97246b9 Mon Sep 17 00:00:00 2001 From: The BROKE Cluster Team Date: Thu, 5 Feb 2026 10:42:50 +0100 Subject: [PATCH] Release 2.0.4-beta.10: Audio PyPI fix (tiktoken workaround complete) Audio/Whisper works with pip install - no Git workaround needed. See CHANGELOG.md for details. Tested: 647 passed, 11 skipped (Python 3.10-3.12) --- .gitignore | 3 +- CHANGELOG.md | 40 ++ README.md | 28 +- TESTING-DETAILS.md | 4 +- .../ADR/ADR-020-Audio-Backend-Architecture.md | 32 +- docs/ARCHITECTURE.md | 124 +++++ mlxk2/__init__.py | 2 +- mlxk2/audio/__init__.py | 9 + mlxk2/audio/whisper_tokenizer.py | 442 ++++++++++++++++++ mlxk2/core/audio_runner.py | 61 +++ mlxk2/operations/common.py | 92 +++- mlxk2/operations/run.py | 3 +- pyproject.toml | 17 +- 13 files changed, 819 insertions(+), 38 deletions(-) create mode 100644 mlxk2/audio/__init__.py create mode 100644 mlxk2/audio/whisper_tokenizer.py diff --git a/.gitignore b/.gitignore index 979d451..1531258 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ venv31?/ venv_*/ venv-*/ test_env*/ -test_results*.log +test-install-venv/ mypy_*.log ruff_*.log */__pycache__/* @@ -29,6 +29,7 @@ ML-workspaces/ # Test artifacts (generated reports) *_report.json +test_results_3_*.log test-img-collection/ small-img-collection benchmarks/reports/*.html diff --git a/CHANGELOG.md b/CHANGELOG.md index b4bcf09..0c96e75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,45 @@ # Changelog +## [2.0.4-beta.10] - 2026-02-05 + +> **⚠️ Upgrade Notice:** If you installed beta.9 from PyPI, audio transcription does not work due to an incomplete tiktoken patch. Please upgrade to beta.10: `pip install mlx-knife[all]==2.0.4b10` + +### Highlights + +**Audio Works Out-of-the-Box:** Complete tiktoken workaround for mlx-audio Issue #479. PyPI installation (`pip install mlx-knife[audio]`) now works without any manual Git installs. 
We bundle the full Whisper tokenizer (~340 LOC) from mlx-audio commit 9349644 and patch `Model.get_tokenizer()` at runtime to fall back to tiktoken when the HuggingFace processor is unavailable. + +**Beta.9 Audio Bug:** The beta.9 release on PyPI had an incomplete tiktoken patch - it bundled the assets but didn't patch `Model.get_tokenizer()` (the patch referenced the class as `Whisper` instead of `Model`). This caused "Processor not found" errors with Whisper models. + +**Runtime Compatibility Accuracy:** Fixed `runtime_compatible` field in `mlxk list --health` showing incorrect values. Now properly gates embedding models, mis-routed audio models (Qwen3-Omni), transformers 5.x video_processor bugs, and unsupported tokenizers (Voxtral tekken.json). + +### Added + +- **Whisper Tokenizer Patch (mlx-audio Issue #479):** + - New `mlxk2/audio/` module with `whisper_tokenizer.py` (~340 LOC) + - Complete `Tokenizer` class and `get_tokenizer()` from mlx-audio commit 9349644 + - `audio_runner.py`: `_apply_whisper_tokenizer_patch()` patches `Model.get_tokenizer()` + - tiktoken>=0.7.0 dependency (OpenAI core library, stable API) + - Bundled tiktoken assets: `gpt2.tiktoken`, `multilingual.tiktoken` + +### Fixed + +- **Audio/Whisper with PyPI mlx-audio:** `pip install mlx-knife[audio]` now works without the Git install workaround. The tiktoken regression in mlx-audio 0.3.1 (Issue #479) is fully patched. 
+ +- **`runtime_compatible` Accuracy:** + - Embedding models (Qwen3-Embedding) → Gate 5: "not supported by mlxk run" + - Mis-routed audio models (Qwen3-Omni) → Gate 3a: "model_type not supported by mlx-audio" + - transformers 5.x bugs (Qwen2-VL, MiMo-VL) → Gate 4a: "Video processor bug" + - Voxtral tekken.json → Gate 3a: "tekken.json tokenizer not supported" + +### Changed + +- **Documentation:** + - README.md: Audio installation simplified (no more Git install instructions) + - ARCHITECTURE.md: Added "Runtime Compatibility Decision Tree" and Probe concept + - ADR-020: Qwen3-Omni clarification (detected as mlx-audio via WhisperFeatureExtractor, not mlx-vlm; currently not runnable) + +--- + ## [2.0.4-beta.9] - 2026-02-04 ### Highlights diff --git a/README.md b/README.md index a60f773..ad5f4af 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ MLX Knife Demo

-**Current Version: 2.0.4-beta.9** (Stable: 2.0.3) +**Current Version: 2.0.4-beta.10** (Stable: 2.0.3) -[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.9-blue.svg)](https://github.com/mzau/mlx-knife/releases) +[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.10-blue.svg)](https://github.com/mzau/mlx-knife/releases) [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Python 3.10-3.12](https://img.shields.io/badge/python-3.10--3.12-blue.svg)](https://www.python.org/downloads/) [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-green.svg)](https://support.apple.com/en-us/HT211814) @@ -17,6 +17,8 @@ ## Features +> **⚠️ Beta.9 Audio Bug:** If you installed `mlx-knife[audio]==2.0.4b9` from PyPI, audio transcription fails with "Processor not found". Upgrade to beta.10: `pip install mlx-knife[all]==2.0.4b10` + ### What's New in 2.0.4 (Coming Soon - Currently Beta) - **Audio Transcription (STT)** - Whisper speech-to-text (`--audio` flag, `pip install mlx-knife[audio]`) - **Vision Models with EXIF Metadata** - Image analysis + automatic GPS/date/camera extraction visible to the model @@ -94,15 +96,15 @@ mlxk --version # → mlxk 2.0.3 **Requirements:** macOS Apple Silicon, Python 3.9-3.12 -### 2. PyPI Beta (2.0.4-beta.9 - Text + Vision + Audio) +### 2. PyPI Beta (2.0.4-beta.10 - Text + Vision + Audio) ```bash -pip install mlx-knife[all]==2.0.4b9 -mlxk --version # → mlxk 2.0.4b9 +pip install mlx-knife[all]==2.0.4b10 +mlxk --version # → mlxk 2.0.4b10 ``` **Requirements:** macOS Apple Silicon, Python 3.10-3.12 -**Features:** Audio STT (Whisper), Vision with EXIF metadata, tiktoken workaround bundled +**Features:** Audio STT (Whisper), Vision with EXIF metadata, full tiktoken workaround ### 3. 
Developer Installation @@ -111,7 +113,7 @@ git clone https://github.com/mzau/mlx-knife.git cd mlx-knife pip install -e ".[all,dev,test]" -mlxk --version # → mlxk 2.0.4b9 +mlxk --version # → mlxk 2.0.4b10 pytest -v ``` @@ -449,18 +451,14 @@ mlxk convert --repair-index ### Audio Transcription (Speech-to-Text) -> **🎙️ New in beta.9:** Professional STT via dedicated Whisper models (mlx-audio backend). Backward compatible with Gemma-3n multimodal audio (mlx-vlm). +> **🎙️ New in beta.9/10:** Professional STT via dedicated Whisper models (mlx-audio backend). Beta.10 fixes PyPI install (no Git workaround needed). Backward compatible with Gemma-3n multimodal audio (mlx-vlm). **Requirements:** - **Python 3.10+** (mlx-audio dependency) -- **Installation:** (mlx-audio 0.3.1 PyPI has regression - install from Git): - ```bash - pip install -e "git+https://github.com/Blaizzy/mlx-audio.git@9349644#egg=mlx-audio" - pip install tiktoken - ``` +- **Installation:** `pip install mlx-knife[audio]` (tiktoken workaround bundled) - **No system dependencies:** MP3/WAV decoding via embedded libsndfile (no ffmpeg or Homebrew required) -**✅ Recommended Models** (mlx-knife v2.0.4-beta.9): +**✅ Recommended Models** (mlx-knife v2.0.4-beta.10): | Model | Backend | Size | Duration | Notes | |-------|---------|------|----------|-------| @@ -1241,7 +1239,7 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.

Made with ❤️ by The BROKE team BROKE Logo
- Version 2.0.4-beta.9 | February 2026
+ Version 2.0.4-beta.10 | February 2026
💬 Web UI: nChat - lightweight chat interface🔮 Multi-node: BROKE Cluster

diff --git a/TESTING-DETAILS.md b/TESTING-DETAILS.md index 73d96d0..e66f710 100644 --- a/TESTING-DETAILS.md +++ b/TESTING-DETAILS.md @@ -4,7 +4,7 @@ This document contains version-specific details, complete file listings, and imp ## Current Status -✅ **2.0.4-beta.9** — Audio transcription (Whisper via mlx-audio); Server `/v1/audio/transcriptions` endpoint; Probe/Policy architecture complete; Vision support Phase 1-3 (CLI + Server); Pipes/Memory-Aware; EXIF metadata; **Test Portfolio Separation complete**; Workspace Infrastructure (ADR-018 Phase 0a+0b+0c); Convert Operation (ADR-018 Phase 1); Resumable Clone; **Benchmark Schema v0.2.2** (Precise test timing). +✅ **2.0.4-beta.10** — **Audio PyPI Fix** (tiktoken workaround complete); Runtime compatibility accuracy; Audio transcription (Whisper via mlx-audio); Server `/v1/audio/transcriptions` endpoint; Probe/Policy architecture complete; Vision support Phase 1-3 (CLI + Server); Pipes/Memory-Aware; EXIF metadata; **Test Portfolio Separation complete**; Workspace Infrastructure (ADR-018 Phase 0a+0b+0c); Convert Operation (ADR-018 Phase 1); Resumable Clone; **Benchmark Schema v0.2.2** (Precise test timing). ### Test Results (Official Reference) @@ -1614,7 +1614,7 @@ MLXK2_LIVE_PUSH=1 \ --- -### A5. Complete Test File Structure (2.0.4-beta.9) +### A5. 
Complete Test File Structure (2.0.4-beta.10) ``` scripts/ diff --git a/docs/ADR/ADR-020-Audio-Backend-Architecture.md b/docs/ADR/ADR-020-Audio-Backend-Architecture.md index 0f7622f..7fec68b 100644 --- a/docs/ADR/ADR-020-Audio-Backend-Architecture.md +++ b/docs/ADR/ADR-020-Audio-Backend-Architecture.md @@ -235,10 +235,40 @@ def detect_audio_backend(probe: Path, config: Optional[Dict]) -> Optional[Backen | Whisper-* | `model_type: whisper*` | - | WhisperFeatureExtractor | MLX_AUDIO | | VibeVoice-ASR | Name heuristic | - | WhisperFeatureExtractor | MLX_AUDIO | | Gemma-3n | audio_config | vision_config (populated) | - | MLX_VLM | -| Qwen3-Omni | audio_config | vision_config (populated) | - | MLX_VLM | **Note on Voxtral:** Config has `audio_config` but empty `vision_config: {}`. Priority 1 ensures it routes to mlx-audio (not mlx-vlm) per blaizzy's guidance. Works for both Original Mistral and mlx-knife converted variants. +#### Qwen3-Omni Model Family: Special Considerations + +**Model Config Reality (mlx-community/Qwen3-Omni-30B-A3B-Instruct-4bit):** + +```json +{ + "model_type": "qwen3_omni_moe", + "audio_config": /* NOT PRESENT */, + "vision_config": {} /* EMPTY dict */ +} +``` + +Plus `preprocessor_config.json` contains `"feature_extractor_type": "WhisperFeatureExtractor"`. + +**Current Detection Result:** +- Priority 2 (audio_config + vision_config): **Skipped** (no audio_config) +- Priority 4 (WhisperFeatureExtractor): **Matched** → `Backend.MLX_AUDIO` + +**Runtime Compatibility Issue:** +- `model_type: "qwen3_omni_moe"` is NOT supported by mlx-lm +- Therefore `runtime_compatible: False` with reason: "Model type qwen3_omni_moe not supported." + +**Status:** Qwen3-Omni is currently **not runnable** via mlx-knife because: +1. mlx-audio doesn't support `qwen3_omni_moe` model architecture +2. mlx-vlm doesn't support `qwen3_omni_moe` model architecture +3. 
The model lacks `audio_config` so it doesn't route to MLX_VLM anyway + +**Future:** When mlx-vlm or mlx-audio adds Qwen3-Omni support, the detection logic may need adjustment to: +- Add Priority 1.5: `model_type == "qwen3_omni*"` → appropriate backend +- Or: Model converter creates `audio_config` during conversion + ### Complete Routing Hierarchy **Three-tier routing logic (run.py:452-620):** diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 35e2585..882a86a 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -22,6 +22,130 @@ All code paths follow this sequence: **Rationale:** Consistent probing and policy enforcement prevents silent fallbacks and ensures errors are visible at the earliest possible stage. +#### The Probe Concept + +**User Perspective (UX):** + +"Probe" is the step where mlx-knife reads a model's metadata files to understand: +- What the model can do (text, vision, audio, embeddings) +- Whether it's healthy (files present, formats correct) +- Whether it can run on this system (backend availability, memory) + +This happens automatically during `mlxk list`, `mlxk show`, `mlxk run`, and server model loading. 
+ +**Implementation:** + +A "probe" is a `Path` object pointing to the model's snapshot directory containing: + +``` +probe/ +├── config.json # Model architecture (model_type, vision_config, audio_config) +├── tokenizer_config.json # Chat template, tokenizer settings +├── tokenizer.json # Vocabulary (optional, may be sentencepiece) +├── preprocessor_config.json # Vision/audio processor info (optional) +└── *.safetensors # Weight files (checked for naming convention) +``` + +**Key Probe Functions:** + +| Function | Location | Purpose | +|----------|----------|---------| +| `detect_framework()` | `common.py` | MLX vs PyTorch vs GGUF | +| `detect_model_type()` | `common.py` | chat, base, audio, embedding | +| `detect_capabilities()` | `common.py` | text-generation, vision, audio, embeddings | +| `detect_vision_capability()` | `common.py` | vision_config, preprocessor_config | +| `detect_audio_capability()` | `common.py` | audio_config, WhisperFeatureExtractor | +| `detect_audio_backend()` | `common.py` | MLX_AUDIO (STT) vs MLX_VLM (multimodal) | +| `check_runtime_compatibility()` | `health.py` | Backend supports model_type | + +**Probe vs Config:** + +- `probe`: The `Path` to the snapshot directory (filesystem location) +- `config`: The parsed `dict` from `config.json` (model metadata) + +Both are passed to detection functions because some signals come from config fields, others from file presence. + +#### Runtime Compatibility Decision Tree + +The `runtime_compatible` field in `mlxk list --json` follows this decision tree: + +``` +runtime_compatible? +│ +├─[1] healthy == False? +│ └─→ False (reason from health check) +│ +├─[2] framework != "MLX"? +│ └─→ False ("Incompatible framework: {framework}") +│ +├─[3] has_audio AND audio_backend != None? +│ │ +│ ├─[3a] audio_backend == MLX_AUDIO? +│ │ │ +│ │ ├─ mlx-audio not installed? +│ │ │ └─→ False ("mlx-audio not installed") +│ │ │ +│ │ ├─ model_type NOT in [whisper*, voxtral]? 
+│ │ │ └─→ False ("Model type '{x}' not supported by mlx-audio") +│ │ │ +│ │ └─ tekken.json exists WITHOUT tokenizer.json? +│ │ └─→ False ("Voxtral tekken.json tokenizer not supported") +│ │ +│ └─[3b] audio_backend == MLX_VLM? +│ │ +│ ├─ vision_runtime_compatibility(probe) fails? +│ │ └─→ False (vision reason) +│ │ +│ └─ check_runtime_compatibility(probe) fails? +│ └─→ False ("Model type '{x}' not supported") +│ +├─[4] has_vision? +│ │ +│ ├─[4a] vision_runtime_compatibility(probe): +│ │ │ +│ │ ├─ Python < 3.10? +│ │ │ └─→ False ("Vision requires Python 3.10+") +│ │ │ +│ │ ├─ mlx-vlm not installed? +│ │ │ └─→ False ("mlx-vlm not installed") +│ │ │ +│ │ └─ transformers 5.x + temporal_patch_size? +│ │ └─→ False ("Video processor bug in transformers 5.x") +│ │ +│ └─[4b] check_runtime_compatibility(probe): +│ │ +│ └─ mlx-lm doesn't support model_type? +│ └─→ False ("Model type '{x}' not supported") +│ +├─[5] "embeddings" in capabilities? +│ └─→ False ("Embedding models not supported by mlxk run") +│ +└─[6] else (text-only models): + │ + └─ check_runtime_compatibility(probe): + │ + ├─ Legacy weight format (weights.*.safetensors)? + │ └─→ False ("Legacy format not supported by mlx-lm") + │ + └─ mlx-lm doesn't support model_type? + └─→ False ("Model type '{x}' not supported") + +If all gates pass → True (runtime_compatible) +``` + +**Gate Priority:** + +| Priority | Gate | Checked For | +|----------|------|-------------| +| 1 | Health | All models | +| 2 | Framework | All models | +| 3 | Audio Backend | Audio-capable models | +| 4 | Vision Backend | Vision-capable models (non-audio) | +| 5 | Embeddings | Embedding models | +| 6 | Text/LLM | Text-only models | + +**Implementation:** `build_model_object()` in `common.py:599-634` + ### 2. No Silent Fallbacks If a model requires a specific capability but the corresponding backend is unavailable, the system **must fail explicitly**. Do not degrade to a lower-capability mode. 
diff --git a/mlxk2/__init__.py b/mlxk2/__init__.py index 123b89f..18fb922 100644 --- a/mlxk2/__init__.py +++ b/mlxk2/__init__.py @@ -7,4 +7,4 @@ import warnings # Issue parity with 1.1.0 (Issue #22) warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+') -__version__ = "2.0.4b9" +__version__ = "2.0.4b10" diff --git a/mlxk2/audio/__init__.py b/mlxk2/audio/__init__.py new file mode 100644 index 0000000..e81a47a --- /dev/null +++ b/mlxk2/audio/__init__.py @@ -0,0 +1,9 @@ +"""Audio support module for mlxk2. + +Contains workarounds for mlx-audio regressions, specifically: +- Whisper tokenizer (tiktoken-based) for Issue #479 +""" + +from .whisper_tokenizer import Tokenizer, get_tokenizer + +__all__ = ["Tokenizer", "get_tokenizer"] diff --git a/mlxk2/audio/whisper_tokenizer.py b/mlxk2/audio/whisper_tokenizer.py new file mode 100644 index 0000000..7bafe44 --- /dev/null +++ b/mlxk2/audio/whisper_tokenizer.py @@ -0,0 +1,442 @@ +# Copyright 2023 Apple Inc. +# SPDX-License-Identifier: MIT +# +# Whisper tokenizer implementation from mlx-audio commit 9349644 (2026-01-26). +# This code was removed in PyPI release 0.3.1 (2026-01-29), breaking Whisper +# transcription for models without HuggingFace processor. +# +# Bundled here as workaround for mlx-audio Issue #479. 
+# See: https://github.com/Blaizzy/mlx-audio/issues/479 +# +# Original source: +# https://github.com/Blaizzy/mlx-audio/blob/9349644/mlx_audio/stt/models/whisper/tokenizer.py + +from __future__ import annotations + +import base64 +import string +from dataclasses import dataclass, field +from functools import cached_property, lru_cache +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import tiktoken + +# Try to import LANGUAGES from mlx-audio (still present in 0.3.1) +# Fall back to our own definition if import fails +try: + from mlx_audio.stt.models.whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE +except ImportError: + # Fallback: Full LANGUAGES dict from mlx-audio + LANGUAGES = { + "en": "english", + "zh": "chinese", + "de": "german", + "es": "spanish", + "ru": "russian", + "ko": "korean", + "fr": "french", + "ja": "japanese", + "pt": "portuguese", + "tr": "turkish", + "pl": "polish", + "ca": "catalan", + "nl": "dutch", + "ar": "arabic", + "sv": "swedish", + "it": "italian", + "id": "indonesian", + "hi": "hindi", + "fi": "finnish", + "vi": "vietnamese", + "he": "hebrew", + "uk": "ukrainian", + "el": "greek", + "ms": "malay", + "cs": "czech", + "ro": "romanian", + "da": "danish", + "hu": "hungarian", + "ta": "tamil", + "no": "norwegian", + "th": "thai", + "ur": "urdu", + "hr": "croatian", + "bg": "bulgarian", + "lt": "lithuanian", + "la": "latin", + "mi": "maori", + "ml": "malayalam", + "cy": "welsh", + "sk": "slovak", + "te": "telugu", + "fa": "persian", + "lv": "latvian", + "bn": "bengali", + "sr": "serbian", + "az": "azerbaijani", + "sl": "slovenian", + "kn": "kannada", + "et": "estonian", + "mk": "macedonian", + "br": "breton", + "eu": "basque", + "is": "icelandic", + "hy": "armenian", + "ne": "nepali", + "mn": "mongolian", + "bs": "bosnian", + "kk": "kazakh", + "sq": "albanian", + "sw": "swahili", + "gl": "galician", + "mr": "marathi", + "pa": "punjabi", + "si": "sinhala", + "km": "khmer", + "sn": "shona", + "yo": "yoruba", + 
"so": "somali", + "af": "afrikaans", + "oc": "occitan", + "ka": "georgian", + "be": "belarusian", + "tg": "tajik", + "sd": "sindhi", + "gu": "gujarati", + "am": "amharic", + "yi": "yiddish", + "lo": "lao", + "uz": "uzbek", + "fo": "faroese", + "ht": "haitian creole", + "ps": "pashto", + "tk": "turkmen", + "nn": "nynorsk", + "mt": "maltese", + "sa": "sanskrit", + "lb": "luxembourgish", + "my": "myanmar", + "bo": "tibetan", + "tl": "tagalog", + "mg": "malagasy", + "as": "assamese", + "tt": "tatar", + "haw": "hawaiian", + "ln": "lingala", + "ha": "hausa", + "ba": "bashkir", + "jw": "javanese", + "su": "sundanese", + "yue": "cantonese", + } + + TO_LANGUAGE_CODE = { + **{language: code for code, language in LANGUAGES.items()}, + "burmese": "my", + "valencian": "ca", + "flemish": "nl", + "haitian": "ht", + "letzeburgesch": "lb", + "pushto": "ps", + "panjabi": "pa", + "moldavian": "ro", + "moldovan": "ro", + "sinhalese": "si", + "castilian": "es", + "mandarin": "zh", + } + + +# Path to bundled tiktoken assets in mlxk2 +_ASSETS_DIR = Path(__file__).parent.parent / "assets" / "whisper" + + +@dataclass +class Tokenizer: + """A thin wrapper around `tiktoken` providing quick access to special tokens.""" + + encoding: tiktoken.Encoding + num_languages: int + language: Optional[str] = None + task: Optional[str] = None + sot_sequence: Tuple[int, ...] 
= () + special_tokens: Dict[str, int] = field(default_factory=dict) + + def __post_init__(self): + for special in self.encoding.special_tokens_set: + special_token = self.encoding.encode_single_token(special) + self.special_tokens[special] = special_token + + sot: int = self.special_tokens["<|startoftranscript|>"] + translate: int = self.special_tokens["<|translate|>"] + transcribe: int = self.special_tokens["<|transcribe|>"] + + langs = tuple(LANGUAGES.keys())[: self.num_languages] + sot_sequence = [sot] + if self.language is not None: + sot_sequence.append(sot + 1 + langs.index(self.language)) + if self.task is not None: + task_token: int = transcribe if self.task == "transcribe" else translate + sot_sequence.append(task_token) + + self.sot_sequence = tuple(sot_sequence) + + def encode(self, text, **kwargs): + return self.encoding.encode(text, **kwargs) + + def decode(self, token_ids: List[int], **kwargs) -> str: + token_ids = [t for t in token_ids if t < self.timestamp_begin] + return self.encoding.decode(token_ids, **kwargs) + + def decode_with_timestamps(self, token_ids: List[int], **kwargs) -> str: + """Timestamp tokens are above other special tokens' id range and are ignored by decode(). + This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>". 
+ """ + return self.encoding.decode(token_ids, **kwargs) + + @cached_property + def eot(self) -> int: + return self.encoding.eot_token + + @cached_property + def transcribe(self) -> int: + return self.special_tokens["<|transcribe|>"] + + @cached_property + def translate(self) -> int: + return self.special_tokens["<|translate|>"] + + @cached_property + def sot(self) -> int: + return self.special_tokens["<|startoftranscript|>"] + + @cached_property + def sot_lm(self) -> int: + return self.special_tokens["<|startoflm|>"] + + @cached_property + def sot_prev(self) -> int: + return self.special_tokens["<|startofprev|>"] + + @cached_property + def no_speech(self) -> int: + return self.special_tokens["<|nospeech|>"] + + @cached_property + def no_timestamps(self) -> int: + return self.special_tokens["<|notimestamps|>"] + + @cached_property + def timestamp_begin(self) -> int: + return self.special_tokens["<|0.00|>"] + + @cached_property + def language_token(self) -> int: + """Returns the token id corresponding to the value of the `language` field.""" + if self.language is None: + raise ValueError("This tokenizer does not have language token configured") + + return self.to_language_token(self.language) + + def to_language_token(self, language): + if token := self.special_tokens.get(f"<|{language}|>", None): + return token + + raise KeyError(f"Language {language} not found in tokenizer.") + + @cached_property + def all_language_tokens(self) -> Tuple[int, ...]: + result = [] + for token, token_id in self.special_tokens.items(): + if token.strip("<|>") in LANGUAGES: + result.append(token_id) + return tuple(result)[: self.num_languages] + + @cached_property + def all_language_codes(self) -> Tuple[str, ...]: + return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens) + + @cached_property + def sot_sequence_including_notimestamps(self) -> Tuple[int, ...]: + return tuple(list(self.sot_sequence) + [self.no_timestamps]) + + @cached_property + def 
non_speech_tokens(self) -> Tuple[int, ...]: + """Returns the list of tokens to suppress in order to avoid any speaker tags or + non-speech annotations, to prevent sampling texts that are not actually spoken + in the audio, e.g. + + - (SPEAKING FOREIGN LANGUAGE) + - [DAVID] Hey there, + + keeping basic punctuations like commas, periods, question marks, etc. + """ + symbols = list('"#()*+/:;<=>@[\\]^_`{|}~') + symbols += ( + "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ".split() + ) + + # symbols that may be a single token or multiple tokens depending on the tokenizer. + # In case they're multiple tokens, suppress the first token, which is safe because: + # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress + # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. + miscellaneous = set("\u2669\u266a\u266b\u266c\u266d\u266e\u266f") + assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + + # allow hyphens "-" and single quotes "'" between words, but not at the beginning + result = {self.encoding.encode(" -")[0], self.encoding.encode(" '")[0]} + for symbol in symbols + list(miscellaneous): + for tokens in [ + self.encoding.encode(symbol), + self.encoding.encode(" " + symbol), + ]: + if len(tokens) == 1 or symbol in miscellaneous: + result.add(tokens[0]) + + return tuple(sorted(result)) + + def split_to_word_tokens(self, tokens: List[int]): + if self.language in {"zh", "ja", "th", "lo", "my", "yue"}: + # These languages don't typically use spaces, so it is difficult to split words + # without morpheme analysis. 
Here, we instead split words at any + # position where the tokens are decoded as valid unicode points + return self.split_tokens_on_unicode(tokens) + + return self.split_tokens_on_spaces(tokens) + + def split_tokens_on_unicode(self, tokens: List[int]): + decoded_full = self.decode_with_timestamps(tokens) + replacement_char = "\ufffd" + + words = [] + word_tokens = [] + current_tokens = [] + unicode_offset = 0 + + for token in tokens: + current_tokens.append(token) + decoded = self.decode_with_timestamps(current_tokens) + + if ( + replacement_char not in decoded + or decoded_full[unicode_offset + decoded.index(replacement_char)] + == replacement_char + ): + words.append(decoded) + word_tokens.append(current_tokens) + current_tokens = [] + unicode_offset += len(decoded) + + return words, word_tokens + + def split_tokens_on_spaces(self, tokens: List[int]): + subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) + words = [] + word_tokens = [] + + for subword, subword_tokens in zip(subwords, subword_tokens_list): + special = subword_tokens[0] >= self.eot + with_space = subword.startswith(" ") + punctuation = subword.strip() in string.punctuation + if special or with_space or punctuation or len(words) == 0: + words.append(subword) + word_tokens.append(subword_tokens) + else: + words[-1] = words[-1] + subword + word_tokens[-1].extend(subword_tokens) + + return words, word_tokens + + +@lru_cache(maxsize=None) +def get_encoding(name: str = "gpt2", num_languages: int = 99) -> tiktoken.Encoding: + """Load tiktoken encoding from bundled assets. + + Uses assets from mlxk2/assets/whisper/ instead of mlx-audio's removed assets. 
+ """ + vocab_path = _ASSETS_DIR / f"{name}.tiktoken" + + if not vocab_path.exists(): + raise FileNotFoundError( + f"Tiktoken vocabulary file not found: {vocab_path}\n" + f"This is an mlx-audio Issue #479 workaround.\n" + f"Expected assets in: {_ASSETS_DIR}" + ) + + with open(vocab_path) as fid: + ranks = { + base64.b64decode(token): int(rank) + for token, rank in (line.split() for line in fid if line) + } + + n_vocab = len(ranks) + special_tokens = {} + + specials = [ + "<|endoftext|>", + "<|startoftranscript|>", + *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]], + "<|translate|>", + "<|transcribe|>", + "<|startoflm|>", + "<|startofprev|>", + "<|nospeech|>", + "<|notimestamps|>", + *[f"<|{i * 0.02:.2f}|>" for i in range(1501)], + ] + + for token in specials: + special_tokens[token] = n_vocab + n_vocab += 1 + + return tiktoken.Encoding( + name=vocab_path.name, + explicit_n_vocab=n_vocab, + pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", + mergeable_ranks=ranks, + special_tokens=special_tokens, + ) + + +@lru_cache(maxsize=None) +def get_tokenizer( + multilingual: bool, + *, + num_languages: int = 99, + language: Optional[str] = None, + task: Optional[str] = None, # Literal["transcribe", "translate", None] +) -> Tokenizer: + """Get a Whisper tokenizer. 
+ + Args: + multilingual: Whether to use multilingual tokenizer (True for most models) + num_languages: Number of languages supported (default 99) + language: Language code (e.g., 'en', 'de') or None for auto-detect + task: 'transcribe' or 'translate' or None + + Returns: + Tokenizer instance + """ + if language is not None: + language = language.lower() + if language not in LANGUAGES: + if language in TO_LANGUAGE_CODE: + language = TO_LANGUAGE_CODE[language] + else: + raise ValueError(f"Unsupported language: {language}") + + if multilingual: + encoding_name = "multilingual" + language = language or "en" + task = task or "transcribe" + else: + encoding_name = "gpt2" + language = None + task = None + + encoding = get_encoding(name=encoding_name, num_languages=num_languages) + + return Tokenizer( + encoding=encoding, num_languages=num_languages, language=language, task=task + ) diff --git a/mlxk2/core/audio_runner.py b/mlxk2/core/audio_runner.py index 280146a..566bbc7 100644 --- a/mlxk2/core/audio_runner.py +++ b/mlxk2/core/audio_runner.py @@ -105,6 +105,67 @@ def _apply_tiktoken_patch(): _apply_tiktoken_patch() +# ============================================================================ +# CRITICAL: Patch Model.get_tokenizer() to use our tiktoken tokenizer +# ============================================================================ +# mlx-audio 0.3.1 (PyPI) removed the get_tokenizer() function that creates +# tiktoken-based tokenizers. The Model.get_tokenizer() method now throws +# ValueError if no HuggingFace processor is available. +# +# We patch Model.get_tokenizer() to fall back to our bundled tokenizer +# (mlxk2.audio.whisper_tokenizer) when no processor is available. +# +# This MUST happen at module import time, before any Whisper model is loaded! 
+# ============================================================================ + +def _apply_whisper_tokenizer_patch(): + """Patch Model.get_tokenizer to fall back to our tiktoken tokenizer.""" + try: + from mlx_audio.stt.models.whisper.whisper import Model as WhisperModel + from mlxk2.audio.whisper_tokenizer import get_tokenizer + + # Store original method (if it exists and isn't already patched) + if hasattr(WhisperModel, "_mlxk_original_get_tokenizer"): + # Already patched + return + + original_get_tokenizer = WhisperModel.get_tokenizer + + def patched_get_tokenizer(self, language=None, task="transcribe"): + """Patched get_tokenizer with tiktoken fallback. + + First tries the original method (uses HF Processor if available). + Falls back to our bundled tiktoken-based tokenizer on failure. + """ + # Try original first (uses HF Processor if available) + if hasattr(self, "_processor") and self._processor is not None: + try: + return original_get_tokenizer(self, language, task) + except Exception: + # HF Processor failed, fall through to tiktoken + pass + + # Fallback to our tiktoken-based tokenizer + return get_tokenizer( + self.is_multilingual, + num_languages=getattr(self, "num_languages", 99), + language=language, + task=task, + ) + + # Apply patch + WhisperModel.get_tokenizer = patched_get_tokenizer + WhisperModel._mlxk_original_get_tokenizer = original_get_tokenizer + + except ImportError: + # mlx-audio not installed - skip patching + pass + + +# Apply Whisper tokenizer patch immediately at module import +_apply_whisper_tokenizer_patch() + + class AudioRunner: """Wrapper around mlx-audio STT API for dedicated transcription models. 
diff --git a/mlxk2/operations/common.py b/mlxk2/operations/common.py
index 0cf6a60..7c05eb9 100644
--- a/mlxk2/operations/common.py
+++ b/mlxk2/operations/common.py
@@ -426,25 +426,59 @@ def detect_capabilities(
     return caps
 
 
-def vision_runtime_compatibility() -> tuple[bool, Optional[str]]:
-    """Vision uses mlx-vlm backend; mark compatible only if available."""
+def vision_runtime_compatibility(probe: Optional[Path] = None) -> tuple[bool, Optional[str]]:
+    """Vision uses mlx-vlm backend; mark compatible only if available.
+
+    Args:
+        probe: Optional path to model snapshot; the video processor check is skipped when None
+
+    Returns:
+        (is_compatible, reason): reason is None if compatible
+    """
     if sys.version_info < (3, 10):
         return False, "Vision requires Python 3.10+ (mlx-vlm dependency)"
     spec = importlib.util.find_spec("mlx_vlm")
     if spec is None:
         return False, "mlx-vlm not installed (install extras: vision)"
+
+    # Gate 3: Check for transformers 5.x video_processor bug
+    # transformers 5.0.x RC has a bug where video_processor_class_from_name()
+    # fails with "argument of type 'NoneType' is not iterable" for models
+    # with temporal_patch_size (video-capable models like Qwen2-VL)
+    if probe is not None:
+        try:
+            import transformers
+            tf_version = getattr(transformers, "__version__", "0.0.0")
+            # Plain "5." prefix test: every transformers 5.x version string matches
+            if tf_version.startswith("5."):
+                preprocessor_path = probe / "preprocessor_config.json"
+                if preprocessor_path.exists():
+                    preproc_data = _json.loads(preprocessor_path.read_text(encoding="utf-8", errors="ignore"))
+                    if isinstance(preproc_data, dict) and "temporal_patch_size" in preproc_data:
+                        return False, f"Video processor bug in transformers {tf_version} (use transformers<5.0 or wait for fix)"
+        except Exception:
+            pass  # Best-effort check: on any error fall through and report compatible
+
     return True, None
 
 
-def audio_runtime_compatibility(backend: Backend) -> tuple[bool, Optional[str]]:
+def audio_runtime_compatibility(
+    backend: Backend,
+    probe: Optional[Path] = None,
+    framework: str = "MLX"
+) -> tuple[bool, Optional[str]]:
     """Audio runtime check based on backend (ADR-020).
 
     Args:
-        backend: Backend.MLX_AUDIO (Whisper/Voxtral) or Backend.MLX_VLM (Gemma-3n)
+        backend: Backend.MLX_AUDIO (Whisper/Voxtral) or Backend.MLX_VLM (Gemma-3n, Qwen3-Omni)
+        probe: Path to model snapshot; the model-specific gates are skipped when None
+        framework: Framework string passed to check_runtime_compatibility (default "MLX")
 
     Returns:
         (is_compatible, reason): reason is None if compatible
     """
+    from .health import check_runtime_compatibility
+
     if sys.version_info < (3, 10):
         return False, "Audio requires Python 3.10+"
 
@@ -453,10 +487,47 @@ def audio_runtime_compatibility(backend: Backend) -> tuple[bool, Optional[str]]:
         spec = importlib.util.find_spec("mlx_audio")
         if spec is None:
             return False, "mlx-audio not installed (pip install mlx-knife[audio])"
+
+        # Gate 2: model_type must be supported by mlx-audio (Whisper, Voxtral only)
+        # This catches mis-routed models like Qwen3-Omni that have WhisperFeatureExtractor
+        # but are NOT STT models (they're multimodal with unsupported architecture)
+        if probe is not None:
+            config_path = probe / "config.json"
+            if config_path.exists():
+                try:
+                    config = _json.loads(config_path.read_text(encoding="utf-8", errors="ignore"))
+                    model_type = config.get("model_type", "")
+                    if isinstance(model_type, str):
+                        model_type_lower = model_type.lower()
+                        # Substring match; a missing model_type defaults to "" and is rejected here
+                        if not any(stt in model_type_lower for stt in ["whisper", "voxtral"]):
+                            return False, f"Model type '{model_type}' not supported by mlx-audio (only Whisper/Voxtral)"
+                except Exception:
+                    pass  # If config can't be read, proceed (health check will catch it)
+
+            # Gate 3: Check for Voxtral tekken.json tokenizer bug (mlx-audio#450)
+            # Rejects snapshots that ship tekken.json (Mistral tokenizer format) without tokenizer.json
+            tekken_path = probe / "tekken.json"
+            tokenizer_path = probe / "tokenizer.json"
+            if tekken_path.exists() and not tokenizer_path.exists():
+                return False, "Voxtral tekken.json tokenizer not supported (mlx-audio#450, upstream fix pending)"
+
         return True, None
     elif backend == Backend.MLX_VLM:
-        # Multimodal audio (Gemma-3n) needs mlx-vlm
-        return vision_runtime_compatibility()
+        # Multimodal audio (Gemma-3n, Qwen3-Omni) needs mlx-vlm
+        # Gate 1: mlx-vlm must be available (pass probe for video_processor bug check)
+        vlm_ok, vlm_reason = vision_runtime_compatibility(probe)
+        if not vlm_ok:
+            return vlm_ok, vlm_reason
+
+        # Gate 2: mlx-lm must support model_type (text-only fallback mode)
+        # This catches unsupported model_types like "qwen3_omni_moe"
+        if probe is not None:
+            text_ok, text_reason = check_runtime_compatibility(probe, framework)
+            if not text_ok:
+                return text_ok, text_reason
+
+        return True, None
     else:
         return False, "Unknown audio backend"
 
@@ -546,11 +617,12 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P runtime_reason = f"Incompatible framework: {framework}" elif has_audio and audio_backend is not None: # Audio models: check based on backend (ADR-020) - runtime_compatible, runtime_reason = audio_runtime_compatibility(audio_backend) + runtime_compatible, runtime_reason = audio_runtime_compatibility(audio_backend, probe, framework) elif has_vision: # Vision models: check BOTH backends for full chat+vision support # 1. mlx-vlm must be available (vision mode with images) - vision_ok, vision_reason = vision_runtime_compatibility() + # Pass probe for transformers 5.x video_processor bug detection + vision_ok, vision_reason = vision_runtime_compatibility(probe) # 2.
mlx-lm must support model_type (text-only mode without images) text_ok, text_reason = check_runtime_compatibility(probe, framework) @@ -561,6 +633,10 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P runtime_compatible = False # Prefer text_reason as it's more specific (model_type not supported) runtime_reason = text_reason or vision_reason + elif Capability.EMBEDDINGS.value in capabilities: + # Embedding models: mlxk run doesn't support embeddings (future: mlxk embed) + runtime_compatible = False + runtime_reason = "Embedding models not supported by mlxk run (use mlxk embed)" else: runtime_compatible, runtime_reason = check_runtime_compatibility(probe, framework) diff --git a/mlxk2/operations/run.py b/mlxk2/operations/run.py index e27e5fa..7fa3fcf 100644 --- a/mlxk2/operations/run.py +++ b/mlxk2/operations/run.py @@ -382,7 +382,8 @@ def run_model( return error_result if is_vision_model: - compat, reason = vision_runtime_compatibility() + # Pass model_path for transformers 5.x video_processor bug detection + compat, reason = vision_runtime_compatibility(model_path) if not compat: error_msg = f"Model '{resolved_name}' is vision-capable but not runnable: {reason}" error_result = f"Error: {error_msg}" diff --git a/pyproject.toml b/pyproject.toml index 0d310f7..35ac362 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,19 +66,18 @@ dev = [ "mypy>=1.5.0", ] vision = [ - "mlx-vlm>=0.3.10", # Vision support (ADR-012) + "mlx-vlm==0.3.10", # Pinned: stable, tested version (ADR-012) ] audio = [ - # mlx-audio 0.3.1+ (tiktoken assets workaround bundled in mlxk2/assets/whisper/) - # See Issue #479: https://github.com/Blaizzy/mlx-audio/issues/479 - "mlx-audio>=0.3.1", - "tiktoken>=0.7.0", # Required by bundled tiktoken assets + # mlx-audio pinned: 0.3.1 has tiktoken regression (Issue #479) + # We bundle complete tiktoken workaround in mlxk2/audio/whisper_tokenizer.py + "mlx-audio==0.3.1", + "tiktoken>=0.7.0", # Required for Whisper tiktoken 
fallback ] all = [ - "mlx-vlm>=0.3.10", - # mlx-audio 0.3.1+ (tiktoken assets workaround bundled in mlxk2/assets/whisper/) - "mlx-audio>=0.3.1", - "tiktoken>=0.7.0", # Required by bundled tiktoken assets + "mlx-vlm==0.3.10", # Pinned: stable, tested version + "mlx-audio==0.3.1", # Pinned: tiktoken regression patched by mlxk2 + "tiktoken>=0.7.0", ] [tool.setuptools]