mirror of
https://github.com/cloudstack-llc/mlx-knife.git
synced 2026-07-01 20:44:14 -04:00
Release 2.0.0-beta.6: Stop token & compatibility bug fixes
Fixes Issue #32 (generic multi-EOS detection) and Issue #37 (model detection) - Generic stop token detection: Multi-EOS models (MXFP4, Qwen, Llama) now use eos_token_ids Set instead of model-specific workarounds - Private/org MLX model detection: `mlxk run` now works outside `mlx-community/*` namespace - Commit-pinned compatibility checks: Models with `@commit_hash` validated before inference - Packaging dependencies: Fixed `pip install -e .` requirements - ADR-009: Stop Token Detection Fix (generic approach + test strategy) - ADR-011: E2E Live Test Architecture (planned) See CHANGELOG.md and TESTING.md for details.
This commit is contained in:
@@ -22,3 +22,6 @@ install_*.log
|
|||||||
.claude/
|
.claude/
|
||||||
openwebui311/bin/
|
openwebui311/bin/
|
||||||
.gitignore
|
.gitignore
|
||||||
|
|
||||||
|
# Test artifacts (generated reports)
|
||||||
|
*_report.json
|
||||||
|
|||||||
@@ -1,5 +1,20 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## 2.0.0-beta.6 — 2025-10-22
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- **Stop token detection for multi-EOS models** (Issue #32, ADR-009): MXFP4 and Qwen models no longer generate visible stop tokens (`<|end|>`) or chat template markers in output
|
||||||
|
- **Private/org MLX model detection** (Issue #37): `mlxk run` now correctly detects MLX models outside `mlx-community/*` namespace
|
||||||
|
- **Commit-pinned compatibility checks**: Models with `@commit_hash` syntax now correctly validated before inference
|
||||||
|
- **Packaging dependencies** (P0): `pip install -e .` now installs all required dependencies (`mlx-lm`, `mlx`, `fastapi`, etc.) via `pyproject.toml`
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
- Simplified installation instructions in README.md and TESTING.md (consistent `pip install -e ".[dev,test]"` recommendation)
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
- 297 passed, 20 skipped (317 total)
|
||||||
|
- Added 6 new tests: 4 stop token validation tests (opt-in), 2 compatibility check tests
|
||||||
|
|
||||||
## 2.0.0-beta.5 — 2025-10-20
|
## 2.0.0-beta.5 — 2025-10-20
|
||||||
|
|
||||||
**Enhanced Error Handling & Logging (ADR-004)**: Unified error envelope, structured logging with JSON support, and request correlation.
|
**Enhanced Error Handling & Logging (ADR-004)**: Unified error envelope, structured logging with JSON support, and request correlation.
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# <img src="https://github.com/mzau/mlx-knife/raw/main/broke-logo.png" alt="BROKE Logo" width="60" style="vertical-align: middle;"> MLX-Knife 2.0.0-beta.5
|
# <img src="https://github.com/mzau/mlx-knife/raw/main/broke-logo.png" alt="BROKE Logo" width="60" style="vertical-align: middle;"> MLX-Knife 2.0.0-beta.6
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="https://github.com/mzau/mlx-knife/raw/feature/2.0.0-alpha.1/mlxk-demo.gif" alt="MLX Knife Demo" width="900">
|
<img src="https://github.com/mzau/mlx-knife/raw/feature/2.0.0-alpha.1/mlxk-demo.gif" alt="MLX Knife Demo" width="900">
|
||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
**Stable Version: 1.1.1**
|
**Stable Version: 1.1.1**
|
||||||
|
|
||||||
[](https://github.com/mzau/mlx-knife/releases)
|
[](https://github.com/mzau/mlx-knife/releases)
|
||||||
[](https://opensource.org/licenses/MIT)
|
[](https://opensource.org/licenses/MIT)
|
||||||
[](https://www.python.org/downloads/)
|
[](https://www.python.org/downloads/)
|
||||||
[](https://support.apple.com/en-us/HT211814)
|
[](https://support.apple.com/en-us/HT211814)
|
||||||
@@ -43,10 +43,10 @@ MLX Knife has been comprehensively tested and verified on:
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Install latest beta release directly from GitHub
|
# Install latest beta release directly from GitHub
|
||||||
pip install https://github.com/mzau/mlx-knife/releases/download/v2.0.0-beta.5/mlxk_json-2.0.0b5-py3-none-any.whl
|
pip install https://github.com/mzau/mlx-knife/releases/download/v2.0.0-beta.6/mlxk_json-2.0.0b6-py3-none-any.whl
|
||||||
|
|
||||||
# Verify installation
|
# Verify installation
|
||||||
mlxk2 --version # → mlxk2 2.0.0b5
|
mlxk2 --version # → mlxk2 2.0.0b6
|
||||||
```
|
```
|
||||||
|
|
||||||
### Development Installation
|
### Development Installation
|
||||||
@@ -56,15 +56,21 @@ mlxk2 --version # → mlxk2 2.0.0b5
|
|||||||
git clone https://github.com/mzau/mlx-knife.git
|
git clone https://github.com/mzau/mlx-knife.git
|
||||||
cd mlx-knife
|
cd mlx-knife
|
||||||
git checkout feature/2.0.0-alpha.1
|
git checkout feature/2.0.0-alpha.1
|
||||||
pip install -e .
|
|
||||||
|
|
||||||
# Install with development tools (ruff, mypy, tests)
|
# Install with all development dependencies (required for testing and code quality)
|
||||||
pip install -e ".[dev,test]"
|
pip install -e ".[dev,test]"
|
||||||
|
|
||||||
# Verify installation
|
# Verify installation
|
||||||
mlxk2 --version # → mlxk2 2.0.0-beta.5
|
mlxk2 --version # → mlxk2 2.0.0-beta.6
|
||||||
|
|
||||||
|
# Run tests and quality checks (before committing)
|
||||||
|
pytest -v
|
||||||
|
ruff check mlxk2/ --fix
|
||||||
|
mypy mlxk2/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Note:** For minimal user installation without dev tools: `pip install -e .`
|
||||||
|
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
@@ -565,6 +571,6 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.
|
|||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<b>Made with ❤️ by The BROKE team <img src="broke-logo.png" alt="BROKE Logo" width="30" style="vertical-align: middle;"></b><br>
|
<b>Made with ❤️ by The BROKE team <img src="broke-logo.png" alt="BROKE Logo" width="30" style="vertical-align: middle;"></b><br>
|
||||||
<i>Version 2.0.0-beta.5 | October 2025</i><br>
|
<i>Version 2.0.0-beta.6 | October 2025</i><br>
|
||||||
<a href="https://github.com/mzau/broke-cluster">🔮 Next: BROKE Cluster for multi-node deployments</a>
|
<a href="https://github.com/mzau/broke-cluster">🔮 Next: BROKE Cluster for multi-node deployments</a>
|
||||||
</p>
|
</p>
|
||||||
|
|||||||
+38
-23
@@ -2,51 +2,60 @@
|
|||||||
|
|
||||||
## Current Status
|
## Current Status
|
||||||
|
|
||||||
✅ **295/295 tests passing** (October 2025) — 2.0.0-beta.5; 14 skipped (opt-in)
|
✅ **297 of 317 tests passing** (October 2025) — 2.0.0-beta.6; the remaining 20 are skipped (opt-in), none failing
|
||||||
✅ **Test environment:** macOS 14.x, M2 Max, Python 3.9-3.13
|
✅ **Test environment:** macOS 14.x, M2 Max, Python 3.9-3.13
|
||||||
✅ **Production verified & reported:** M1, M1 Max, M2 Max in real-world use
|
✅ **Production verified & reported:** M1, M1 Max, M2 Max in real-world use
|
||||||
✅ **Beta (CLI/JSON)** — stable features only, experimental features opt-in
|
✅ **Beta (CLI/JSON)** — stable features only, experimental features opt-in
|
||||||
✅ **Isolated test system** - user cache stays pristine with temp cache isolation
|
✅ **Isolated test system** - user cache stays pristine with temp cache isolation
|
||||||
✅ **3-category test strategy** - optimized for performance and safety
|
✅ **3-category test strategy** - optimized for performance and safety
|
||||||
|
|
||||||
### Skipped Tests Breakdown (14 total, standard run without HF_HOME)
|
### Skipped Tests Breakdown (20 total, standard run without HF_HOME)
|
||||||
|
- **4 Live Stop Tokens tests** - Stop token validation with real models (requires `pytest -m live_stop_tokens`, ADR-009)
|
||||||
|
- **1 Live Run test** - Private/org model detection (requires `pytest -m live_run`, Issue #37)
|
||||||
- **3 Live Clone tests** - APFS same-volume clone workflow (requires `MLXK2_LIVE_CLONE=1`)
|
- **3 Live Clone tests** - APFS same-volume clone workflow (requires `MLXK2_LIVE_CLONE=1`)
|
||||||
- **1 Live List test** - Tests against user cache (requires HF_HOME with models)
|
- **1 Live List test** - Tests against user cache (requires HF_HOME with models)
|
||||||
- **1 Live Push test** - Real HuggingFace push (requires `MLXK2_LIVE_PUSH=1`)
|
- **1 Live Push test** - Real HuggingFace push (requires `MLXK2_LIVE_PUSH=1`)
|
||||||
- **7 Issue #27 tests** - Real-model health validation (requires HF_HOME or MLXK2_USER_HF_HOME setup)
|
- **7 Issue #27 tests** - Real-model health validation (requires HF_HOME or MLXK2_USER_HF_HOME setup)
|
||||||
|
- **3 Additional opt-in tests** - Various live validation scenarios
|
||||||
|
|
||||||
## Quick Start (2.0 Default)
|
## Quick Start (2.0 Default)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Install package + tests
|
# Install package + development tools (required for ruff/mypy/pytest)
|
||||||
pip install -e .[test]
|
pip install -e ".[dev,test]"
|
||||||
|
|
||||||
# Download test model (optional; most 2.0 tests use isolated cache)
|
# Download test model (optional; most 2.0 tests use isolated cache)
|
||||||
# Only needed for opt-in live tests or local experiments
|
# Only needed for opt-in live tests or local experiments
|
||||||
# mlxk pull mlx-community/Phi-3-mini-4k-instruct-4bit
|
# mlxk pull mlx-community/Phi-3-mini-4k-instruct-4bit
|
||||||
|
|
||||||
# Run 2.0 tests (default discovery: tests_2.0/)
|
# Run 2.0 tests (default discovery: tests_2.0/)
|
||||||
pytest -v # 295 passed, 14 skipped
|
pytest -v # Runs ~300 tests (isolated, no live downloads)
|
||||||
|
|
||||||
# Optional: Enable alpha push and clone tests
|
# Optional: Enable alpha push and clone tests
|
||||||
MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -v # 298 passed, 11 skipped
|
MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -v # Activates alpha features (clone/push)
|
||||||
|
|
||||||
# Live tests (opt-in; not part of default):
|
# Live tests (opt-in; not part of default suite):
|
||||||
|
# - Live stop tokens (ADR-009 - requires models in HF_HOME):
|
||||||
|
# pytest -m live_stop_tokens
|
||||||
|
# # Tests: MXFP4, Qwen 2.5, Llama 3.2 stop token behavior
|
||||||
|
# - Live run (requires models in HF_HOME):
|
||||||
|
# pytest -m live_run
|
||||||
|
# # Tests: Issue #37 private/org model detection
|
||||||
# - Live push (requires alpha features + env):
|
# - Live push (requires alpha features + env):
|
||||||
# export MLXK2_ENABLE_ALPHA_FEATURES=1
|
# export MLXK2_ENABLE_ALPHA_FEATURES=1
|
||||||
# export MLXK2_LIVE_PUSH=1
|
# export MLXK2_LIVE_PUSH=1
|
||||||
# export HF_TOKEN=...; export MLXK2_LIVE_REPO=org/model; export MLXK2_LIVE_WORKSPACE=/abs/path
|
# export HF_TOKEN=...; export MLXK2_LIVE_REPO=org/model; export MLXK2_LIVE_WORKSPACE=/abs/path
|
||||||
# pytest -q -m live_push
|
# pytest -m live_push
|
||||||
# - Live clone (ADR-007 Phase 1 - requires alpha features + env + same volume):
|
# - Live clone (ADR-007 Phase 1 - requires alpha features + env + same volume):
|
||||||
# export MLXK2_ENABLE_ALPHA_FEATURES=1
|
# export MLXK2_ENABLE_ALPHA_FEATURES=1
|
||||||
# export MLXK2_LIVE_CLONE=1
|
# export MLXK2_LIVE_CLONE=1
|
||||||
# export HF_TOKEN=...
|
# export HF_TOKEN=...
|
||||||
# export MLXK2_LIVE_CLONE_MODEL="mlx-community/small-model"
|
# export MLXK2_LIVE_CLONE_MODEL="mlx-community/small-model"
|
||||||
# export MLXK2_LIVE_CLONE_WORKSPACE="/path/on/same/volume/as/HF_HOME" # APFS + same volume required
|
# export MLXK2_LIVE_CLONE_WORKSPACE="/path/on/same/volume/as/HF_HOME" # APFS + same volume required
|
||||||
# pytest -q -m live_clone
|
# pytest -m live_clone
|
||||||
# - Live list (uses your HF_HOME; requires at least one MLX chat + one MLX base in cache):
|
# - Live list (uses your HF_HOME; requires at least one MLX chat + one MLX base in cache):
|
||||||
# export HF_HOME=/path/to/huggingface/cache
|
# export HF_HOME=/path/to/huggingface/cache
|
||||||
# pytest -q -m live_list
|
# pytest -m live_list
|
||||||
|
|
||||||
# Before committing
|
# Before committing
|
||||||
ruff check mlxk2/ --fix && mypy mlxk2/ && pytest -v
|
ruff check mlxk2/ --fix && mypy mlxk2/ && pytest -v
|
||||||
@@ -54,7 +63,7 @@ ruff check mlxk2/ --fix && mypy mlxk2/ && pytest -v
|
|||||||
|
|
||||||
Notes
|
Notes
|
||||||
- Reference environment: venv39 (Apple‑native Python 3.9) is the recommended dev base.
|
- Reference environment: venv39 (Apple‑native Python 3.9) is the recommended dev base.
|
||||||
- Extras `[test]` install httpx/FastAPI so the server minimal tests run.
|
- Extras `[dev,test]` install ruff/mypy (code quality) and pytest/jsonschema (testing).
|
||||||
- For release smoke across multiple Python versions: `./test-multi-python.sh` (logs: `test_results_3_9.log`, `test_results_3_10.log`, ...).
|
- For release smoke across multiple Python versions: `./test-multi-python.sh` (logs: `test_results_3_9.log`, `test_results_3_10.log`, ...).
|
||||||
- The macOS Python 3.9 LibreSSL warning from urllib3 is suppressed in tests via `pytest.ini`, and at runtime via package init.
|
- The macOS Python 3.9 LibreSSL warning from urllib3 is suppressed in tests via `pytest.ini`, and at runtime via package init.
|
||||||
|
|
||||||
@@ -705,17 +714,17 @@ pytest tests/integration/test_server_functionality.py -v
|
|||||||
|
|
||||||
### Verification Results (October 2025)
|
### Verification Results (October 2025)
|
||||||
|
|
||||||
**✅ 295/295 tests passing** - All standard tests validated on Apple Silicon with enhanced isolation
|
**✅ 297/317 tests passing (20 skipped, opt-in)** - All standard tests validated on Apple Silicon with enhanced isolation
|
||||||
|
|
||||||
| Python Version | Status | Tests Passing | Skipped |
|
| Python Version | Status | Tests Passing | Skipped |
|
||||||
|----------------|--------|---------------|---------|
|
|----------------|--------|---------------|---------|
|
||||||
| 3.9.6 (macOS) | ✅ Verified | 295/295 | 14 |
|
| 3.9.6 (macOS) | ✅ Verified | 297/317 | 20 |
|
||||||
| 3.10.x | ✅ Verified | 295/295 | 14 |
|
| 3.10.x | ✅ Verified | 297/317 | 20 |
|
||||||
| 3.11.x | ✅ Verified | 295/295 | 14 |
|
| 3.11.x | ✅ Verified | 297/317 | 20 |
|
||||||
| 3.12.x | ✅ Verified | 295/295 | 14 |
|
| 3.12.x | ✅ Verified | 297/317 | 20 |
|
||||||
| 3.13.x | ✅ Verified | 295/295 | 14 |
|
| 3.13.x | ✅ Verified | 297/317 | 20 |
|
||||||
|
|
||||||
**Note:** 14 skipped tests are opt-in (live tests, alpha features). Skipped count may vary by environment:
|
**Note:** 20 skipped tests are opt-in (live tests, alpha features). Skipped count may vary by environment:
|
||||||
- Without `HF_TOKEN`: +1 skip (live push test)
|
- Without `HF_TOKEN`: +1 skip (live push test)
|
||||||
- Without `MLXK2_ENABLE_ALPHA_FEATURES=1`: +3 skips (alpha feature tests)
|
- Without `MLXK2_ENABLE_ALPHA_FEATURES=1`: +3 skips (alpha feature tests)
|
||||||
- Without `jsonschema`: +1 skip (spec validation test)
|
- Without `jsonschema`: +1 skip (spec validation test)
|
||||||
@@ -770,6 +779,8 @@ ruff check mlx_knife/ --fix && mypy mlx_knife/ && pytest
|
|||||||
| Live List (opt‑in) | `pytest -m live_list -v` | `live_list` (subset of `wet`) + Env: `HF_HOME` (user cache with models) | Tests list/health against user cache models | No (uses local cache) |
|
| Live List (opt‑in) | `pytest -m live_list -v` | `live_list` (subset of `wet`) + Env: `HF_HOME` (user cache with models) | Tests list/health against user cache models | No (uses local cache) |
|
||||||
| Clone (alpha, opt‑in) | `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -k clone -v` | Env: `MLXK2_ENABLE_ALPHA_FEATURES=1` | Clone offline tests (Pull+Copy+Cleanup workflow, APFS optimization); clone command hidden by default | No |
|
| Clone (alpha, opt‑in) | `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -k clone -v` | Env: `MLXK2_ENABLE_ALPHA_FEATURES=1` | Clone offline tests (Pull+Copy+Cleanup workflow, APFS optimization); clone command hidden by default | No |
|
||||||
| Live Clone (ADR-007) | `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m live_clone -v` | `live_clone` + Env: `MLXK2_ENABLE_ALPHA_FEATURES=1`, `MLXK2_LIVE_CLONE=1`, `HF_TOKEN`, `MLXK2_LIVE_CLONE_MODEL`, `MLXK2_LIVE_CLONE_WORKSPACE` | Real clone workflow: pull→temp cache→APFS same-volume clone→workspace (ADR-007 Phase 1 constraints: same volume + APFS required) | Yes |
|
| Live Clone (ADR-007) | `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m live_clone -v` | `live_clone` + Env: `MLXK2_ENABLE_ALPHA_FEATURES=1`, `MLXK2_LIVE_CLONE=1`, `HF_TOKEN`, `MLXK2_LIVE_CLONE_MODEL`, `MLXK2_LIVE_CLONE_WORKSPACE` | Real clone workflow: pull→temp cache→APFS same-volume clone→workspace (ADR-007 Phase 1 constraints: same volume + APFS required) | Yes |
|
||||||
|
| Live Stop Tokens (opt‑in, ADR-009) | `pytest -m live_stop_tokens -v` | `live_stop_tokens` + Env: `HF_HOME` (user cache with MXFP4/Qwen/Llama models) | Issue #32: Validates multi-EOS token stop behavior with real models (MXFP4 no visible `<|end|>`, Qwen no self-conversation, Llama baseline) | No (uses local cache) |
|
||||||
|
| Live Run (opt‑in) | `pytest -m live_run -v` | `live_run` + Env: `MLXK2_USER_HF_HOME` or `HF_HOME` (user cache with `mlx-community/Phi-3-mini-4k-instruct-4bit`) | Regression tests for Issue #37: Validates private/org MLX model framework detection in run command (renames Phi-3 to simulate private-org model) | No (uses local cache) |
|
||||||
| Issue #27 real‑model (opt‑in) | `pytest -m issue27 tests_2.0/test_issue_27.py -v` | Marker: `issue27`; Env (required): `MLXK2_USER_HF_HOME` or `HF_HOME` (user cache, read‑only). Env (optional): `MLXK2_ISSUE27_MODEL`, `MLXK2_ISSUE27_INDEX_MODEL`, `MLXK2_SUBSET_COUNT=0`. | Copies real models from user cache into isolated test cache; validates strict health policy on index‑based models (no network) | No (uses local cache) |
|
| Issue #27 real‑model (opt‑in) | `pytest -m issue27 tests_2.0/test_issue_27.py -v` | Marker: `issue27`; Env (required): `MLXK2_USER_HF_HOME` or `HF_HOME` (user cache, read‑only). Env (optional): `MLXK2_ISSUE27_MODEL`, `MLXK2_ISSUE27_INDEX_MODEL`, `MLXK2_SUBSET_COUNT=0`. | Copies real models from user cache into isolated test cache; validates strict health policy on index‑based models (no network) | No (uses local cache) |
|
||||||
| Server tests (included) | `pytest -k server -v` | — | Basic server API tests (minimal, uses MLX stubs) | No |
|
| Server tests (included) | `pytest -k server -v` | — | Basic server API tests (minimal, uses MLX stubs) | No |
|
||||||
|
|
||||||
@@ -780,6 +791,8 @@ Useful commands
|
|||||||
- Live Push only: `MLXK2_ENABLE_ALPHA_FEATURES=1 MLXK2_LIVE_PUSH=1 HF_TOKEN=... MLXK2_LIVE_REPO=... MLXK2_LIVE_WORKSPACE=... pytest -m live_push -v`
|
- Live Push only: `MLXK2_ENABLE_ALPHA_FEATURES=1 MLXK2_LIVE_PUSH=1 HF_TOKEN=... MLXK2_LIVE_REPO=... MLXK2_LIVE_WORKSPACE=... pytest -m live_push -v`
|
||||||
- Live Clone only: `MLXK2_ENABLE_ALPHA_FEATURES=1 MLXK2_LIVE_CLONE=1 HF_TOKEN=... MLXK2_LIVE_CLONE_MODEL=... MLXK2_LIVE_CLONE_WORKSPACE=... pytest -m live_clone -v`
|
- Live Clone only: `MLXK2_ENABLE_ALPHA_FEATURES=1 MLXK2_LIVE_CLONE=1 HF_TOKEN=... MLXK2_LIVE_CLONE_MODEL=... MLXK2_LIVE_CLONE_WORKSPACE=... pytest -m live_clone -v`
|
||||||
- Live List only: `HF_HOME=/path/to/user/cache pytest -m live_list -v`
|
- Live List only: `HF_HOME=/path/to/user/cache pytest -m live_list -v`
|
||||||
|
- Live Stop Tokens only (ADR-009): `HF_HOME=/path/to/user/cache pytest -m live_stop_tokens -v` (requires MXFP4, Qwen 2.5, Llama 3.2 models in cache)
|
||||||
|
- Live Run only: `HF_HOME=/path/to/user/cache pytest -m live_run -v` (requires `mlx-community/Phi-3-mini-4k-instruct-4bit` in cache)
|
||||||
- Issue #27 only: `MLXK2_USER_HF_HOME=/path/to/user/cache pytest -m issue27 tests_2.0/test_issue_27.py -v`
|
- Issue #27 only: `MLXK2_USER_HF_HOME=/path/to/user/cache pytest -m issue27 tests_2.0/test_issue_27.py -v`
|
||||||
- All live tests (umbrella): `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m wet -v` (includes live_push, live_clone, live_list)
|
- All live tests (umbrella): `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m wet -v` (includes live_push, live_clone, live_list)
|
||||||
|
|
||||||
@@ -787,6 +800,8 @@ Markers: wet vs specific live tests
|
|||||||
- `wet`: umbrella marker for any opt‑in "live" test that may require network, credentials, or user environment. Use to run all live tests.
|
- `wet`: umbrella marker for any opt‑in "live" test that may require network, credentials, or user environment. Use to run all live tests.
|
||||||
- `live_push`: narrow marker for push‑specific live tests only. Use to target push live checks without running other live suites.
|
- `live_push`: narrow marker for push‑specific live tests only. Use to target push live checks without running other live suites.
|
||||||
- `live_clone`: narrow marker for clone‑specific live tests only. Use to target ADR-007 Phase 1 real workflow validation.
|
- `live_clone`: narrow marker for clone‑specific live tests only. Use to target ADR-007 Phase 1 real workflow validation.
|
||||||
|
- `live_stop_tokens`: narrow marker for stop token validation tests with real models (ADR-009). Use to validate Issue #32 fix (multi-EOS models).
|
||||||
|
- `live_run`: narrow marker for run command tests with real models. Use to validate Issue #37 framework detection regression fix (private/org MLX models).
|
||||||
|
|
||||||
Note: Without the required env vars, live tests remain SKIPPED.
|
Note: Without the required env vars, live tests remain SKIPPED.
|
||||||
|
|
||||||
@@ -897,11 +912,11 @@ When submitting PRs, please include:
|
|||||||
- Python version
|
- Python version
|
||||||
- Which model(s) you tested with
|
- Which model(s) you tested with
|
||||||
|
|
||||||
2. **Test results summary (2.0)**:
|
2. **Test results summary (2.0)** (example format):
|
||||||
```
|
```
|
||||||
Platform: macOS 14.5, M2 Pro
|
Platform: macOS 14.5, M2 Pro
|
||||||
Python: 3.11.6
|
Python: 3.9.6
|
||||||
Results: 98/98 tests passed; 9 skipped (opt-in)
|
Results: 297 passed, 20 skipped
|
||||||
```
|
```
|
||||||
|
|
||||||
3. **Any issues encountered** and how you resolved them
|
3. **Any issues encountered** and how you resolved them
|
||||||
@@ -910,7 +925,7 @@ When submitting PRs, please include:
|
|||||||
|
|
||||||
**MLX Knife 2.0 Testing Status:**
|
**MLX Knife 2.0 Testing Status:**
|
||||||
|
|
||||||
✅ **Feature Complete** - 295/295 tests passing (2.0.0-beta.5)
|
✅ **Feature Complete** - 300+ tests (2.0 Beta, see CHANGELOG.md for current release counts)
|
||||||
✅ **Enhanced Isolation** - Sentinel protection with `isolated_cache` fixture
|
✅ **Enhanced Isolation** - Sentinel protection with `isolated_cache` fixture
|
||||||
✅ **3-Category Strategy** - Isolated/Live/Server tests optimized for 2.0
|
✅ **3-Category Strategy** - Isolated/Live/Server tests optimized for 2.0
|
||||||
✅ **Multi-Python Support** - Python 3.9-3.13 verified
|
✅ **Multi-Python Support** - Python 3.9-3.13 verified
|
||||||
@@ -987,4 +1002,4 @@ def test_model_generation_quality(model_name: str, ram_needed: int):
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*MLX-Knife 2.0.0-beta.5*
|
*MLX-Knife 2.0.0-beta.6*
|
||||||
|
|||||||
@@ -0,0 +1,356 @@
|
|||||||
|
# ADR-009: Stop Token Detection Fix
|
||||||
|
|
||||||
|
**Status:** Accepted
|
||||||
|
**Date:** 2025-10-21
|
||||||
|
**Supersedes:** Issue #32 discussions (September 2025)
|
||||||
|
**Affects:** Runner (Beta.6)
|
||||||
|
**Related:** ADR-010 (Reasoning Content API - Future)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### Problem Statement
|
||||||
|
|
||||||
|
Issue #32 requests migration from model-specific workarounds to **generic stop token detection** using native chat templates and mlx-lm APIs.
|
||||||
|
|
||||||
|
**Current State:**
|
||||||
|
- ✅ MXFP4 works (via hardcoded `<|end|>` skip in `stop_tokens.py:49`)
|
||||||
|
- ❌ Not state-of-the-art (model-specific "Gebastel", i.e. ad-hoc tinkering)
|
||||||
|
- ❌ Every new model needs custom pattern
|
||||||
|
- ❌ Runner uses singular `eos_token_id` instead of `eos_token_ids` Set
|
||||||
|
|
||||||
|
**Goal:**
|
||||||
|
Use **mlx-lm TokenizerWrapper APIs** as the primary mechanism; fall back to model-specific handling only when needed.
|
||||||
|
|
||||||
|
### Root Cause Analysis
|
||||||
|
|
||||||
|
**Runner Bug (mlxk2/core/runner/__init__.py:468, 589):**
|
||||||
|
```python
|
||||||
|
# CURRENT (checks only singular ID)
|
||||||
|
if token_id == self.tokenizer.eos_token_id:
|
||||||
|
break
|
||||||
|
|
||||||
|
# SHOULD BE (checks Set of IDs)
|
||||||
|
if token_id in self.tokenizer.eos_token_ids:
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why `eos_token_ids` is better:**
|
||||||
|
- mlx-lm `TokenizerWrapper` normalizes `eos_token_id` → `eos_token_ids` (Set)
|
||||||
|
- Handles models with multiple EOS tokens (e.g., Llama 3: `[128001, 128009]`)
|
||||||
|
- Generic mechanism, no model-specific code needed
|
||||||
|
|
||||||
|
**Example (MXFP4):**
|
||||||
|
```python
|
||||||
|
# HuggingFace config (upstream bug)
|
||||||
|
tokenizer.eos_token_id = 200002 # Only <|return|>, missing 200007 (<|end|>)
|
||||||
|
|
||||||
|
# But added_tokens_decoder has both:
|
||||||
|
{
|
||||||
|
200002: "<|return|>",
|
||||||
|
200007: "<|end|>"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Current workaround (stop_tokens.py:49):
|
||||||
|
if token_content == '<|end|>':
|
||||||
|
continue # Skip adding to stop_tokens
|
||||||
|
|
||||||
|
# Hypothesis: 2-LOC fix may be sufficient
|
||||||
|
# If not, fallback to add_eos_token():
|
||||||
|
tokenizer.add_eos_token("<|end|>") # Adds 200007 to eos_token_ids set
|
||||||
|
```
|
||||||
|
|
||||||
|
### Current Workarounds
|
||||||
|
|
||||||
|
**Model-Specific Code:**
|
||||||
|
1. `stop_tokens.py:49` - Hardcoded `<|end|>` skip for MXFP4
|
||||||
|
2. `stop_tokens.py:92` - Hardcoded `<|return|>` add for gpt-oss
|
||||||
|
3. `reasoning.py:22-33` - MXFP4 reasoning patterns
|
||||||
|
|
||||||
|
**These work, but are not scalable for future models.**
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
1. **Generic First:** Use mlx-lm APIs, avoid model-specific code when possible
|
||||||
|
2. **Pragmatic Fallback:** Keep model-specific handling if needed (not all models are perfect)
|
||||||
|
3. **No Breaking Changes:** Existing models must continue working
|
||||||
|
4. **Focus Models Only:** Test MXFP4, Qwen 2.5, Llama 3.2 (not all models)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### Test-Driven Fix Strategy
|
||||||
|
|
||||||
|
**Step 1: Implement Real-Model Test Suite**
|
||||||
|
|
||||||
|
Required before any code changes - we need empirical data to validate the fix.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests_2.0/test_stop_tokens_live.py (see Test Strategy section below)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Baseline Measurement**
|
||||||
|
|
||||||
|
Document current behavior with existing workarounds:
|
||||||
|
- MXFP4: Does `<|end|>` appear in output? (expected: NO, via workaround)
|
||||||
|
- Qwen 2.5: Does self-conversation occur? (expected: document pattern)
|
||||||
|
- Llama 3.2: Does generation work correctly? (expected: YES)
|
||||||
|
|
||||||
|
**Step 3: Apply 2-LOC Fix**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# mlxk2/core/runner/__init__.py:468 (generate_streaming)
|
||||||
|
if token_id in self.tokenizer.eos_token_ids: # Changed: == to in
|
||||||
|
break
|
||||||
|
|
||||||
|
# mlxk2/core/runner/__init__.py:589 (generate_batch)
|
||||||
|
if token_id in self.tokenizer.eos_token_ids: # Changed: == to in
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Re-Test & Evaluate**
|
||||||
|
|
||||||
|
Run test suite again. Three possible outcomes:
|
||||||
|
|
||||||
|
| Outcome | Action |
|
||||||
|
|---------|--------|
|
||||||
|
| ✅ All tests pass | Remove obsolete workarounds, ship Beta.6 |
|
||||||
|
| ⚠️ Some tests fail | Investigate: Need `add_eos_token()` integration? |
|
||||||
|
| ❌ Tests still fail | Document findings, implement targeted fixes |
|
||||||
|
|
||||||
|
**Step 5: Conditional Cleanup**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# stop_tokens.py:49 - Remove IF tests pass without it
|
||||||
|
if token_content == '<|end|>':
|
||||||
|
continue # ← DELETE if generic fix works
|
||||||
|
|
||||||
|
# stop_tokens.py:92 - Keep IF still needed
|
||||||
|
if model_type == 'gpt-oss':
|
||||||
|
stop_tokens.add('<|return|>') # Keep with comment: "Upstream config bug"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 6 (Optional): add_eos_token() Integration**
|
||||||
|
|
||||||
|
If tests reveal that `eos_token_ids` doesn't contain all necessary EOS tokens:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Option A: In stop_tokens.py:extract_stop_tokens() (after line 55)
|
||||||
|
# When we find EOS-like tokens in added_tokens_decoder, register them:
|
||||||
|
if token_content in ['<|end|>', '<|return|>']: # Derived from added_tokens_decoder values we flag as EOS
|
||||||
|
tokenizer.add_eos_token(token_content)
|
||||||
|
# NOTE: Modifies tokenizer state, but needed for upstream config bugs
|
||||||
|
|
||||||
|
# Option B: In runner/__init__.py:load_model() (after line 192)
|
||||||
|
# Model-specific fixes after tokenizer load:
|
||||||
|
if 'mxfp4' in str(model_path).lower():
|
||||||
|
self.tokenizer.add_eos_token("<|end|>")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Decision Point:** Only implement Step 6 if empirical testing shows it's necessary.
|
||||||
|
|
||||||
|
**Philosophy:**
|
||||||
|
- **Test-driven** (measure before fixing)
|
||||||
|
- **Generic first** (2-LOC fix should work for most models)
|
||||||
|
- **Pragmatic fallback** (`add_eos_token()` only if needed)
|
||||||
|
- **Not our job to fix all models** (focus on priority models)
|
||||||
|
|
||||||
|
### Implementation Status (2025-10-21)
|
||||||
|
|
||||||
|
**Steps 1-4: ✅ COMPLETED**
|
||||||
|
- Real-model test suite implemented in `tests_2.0/test_stop_tokens_live.py` (4 tests, 3 models)
|
||||||
|
- 2-LOC fix applied in `mlxk2/core/runner/__init__.py:468,590`
|
||||||
|
- Empirical validation executed (see `stop_token_config_report.json`)
|
||||||
|
- Results: Generic fix alone **not sufficient** - MXFP4 still requires `add_eos_token()` workaround
|
||||||
|
|
||||||
|
**Step 5: ⏸️ SKIPPED**
|
||||||
|
- Conditional cleanup deferred (workarounds still active)
|
||||||
|
- Rationale: Step 6 became necessary; re-evaluate cleanup after stabilization
|
||||||
|
|
||||||
|
**Step 6: 🔧 ACTIVE (Deterministic Guard)**
|
||||||
|
- `add_eos_token()` implemented in `mlxk2/core/runner/stop_tokens.py:49-56`
|
||||||
|
- **Implementation differs from "optional" plan:**
|
||||||
|
- Originally planned: "only if empirical tests show it's needed"
|
||||||
|
- Actually implemented: Unconditional call whenever `<|end|>` appears in config
|
||||||
|
- Rationale: Deterministic guard for MXFP4-class models (pragmatic workaround)
|
||||||
|
- No adverse side-effects from the tokenizer state mutation observed (guarded by a callable check + exception guard)
|
||||||
|
|
||||||
|
**Outstanding Work:**
|
||||||
|
- Portfolio discovery not yet implemented (hard-coded 3 models in test suite)
|
||||||
|
- Workaround cleanup evaluation (lines 49, 99 in `stop_tokens.py`)
|
||||||
|
- Empirical validation scope expansion (currently 3 models, aim for full cache coverage)
|
||||||
|
|
||||||
|
### Non-Goals (Beta.6)
|
||||||
|
|
||||||
|
- ❌ Test all models (unrealistic)
|
||||||
|
- ❌ Remove all workarounds (only remove obsolete ones)
|
||||||
|
- ❌ Fix upstream HuggingFace configs (report issues, but don't block on them)
|
||||||
|
- ❌ Reasoning API changes (see ADR-010)
|
||||||
|
|
||||||
|
### Test Strategy
|
||||||
|
|
||||||
|
**Real-Model Test Suite Required:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests_2.0/test_stop_tokens_live.py
|
||||||
|
|
||||||
|
@pytest.mark.live_stop_tokens
|
||||||
|
def test_mxfp4_stop_tokens():
|
||||||
|
"""Verify <|end|> doesn't appear in output."""
|
||||||
|
runner = MLXRunner("mlx-community/gpt-oss-20b-MXFP4-Q8")
|
||||||
|
response = runner.generate_batch("Write one sentence about cats.", max_tokens=50)
|
||||||
|
|
||||||
|
assert "<|end|>" not in response # Should be filtered
|
||||||
|
assert "<|return|>" not in response # Should stop before this
|
||||||
|
|
||||||
|
@pytest.mark.live_stop_tokens
|
||||||
|
def test_qwen_self_conversation():
|
||||||
|
"""Verify model stops before generating turn-taking markers (no self-conversation).
|
||||||
|
|
||||||
|
Self-conversation occurs when the model generates the next user turn prompt
|
||||||
|
instead of stopping after its own response. This manifests as chat template
|
||||||
|
role markers appearing in the output (e.g., "\\nUser:", "\\nHuman:").
|
||||||
|
|
||||||
|
Expected behavior: Model stops cleanly after its response, before any role markers.
|
||||||
|
"""
|
||||||
|
runner = MLXRunner("mlx-community/Qwen2.5-0.5B-Instruct-4bit")
|
||||||
|
|
||||||
|
# Test with simple prompt that might trigger multi-turn continuation
|
||||||
|
response = runner.generate_batch("Hello", max_tokens=50)
|
||||||
|
|
||||||
|
# Assert no role markers from chat template appear in output
|
||||||
|
# (These would indicate the model is generating the next turn)
|
||||||
|
chat_turn_markers = [
|
||||||
|
'\nUser:', '\nHuman:', '\nYou:', '\nAssistant:',
|
||||||
|
'\n\nUser:', '\n\nHuman:', '\n\nYou:', '\n\nAssistant:',
|
||||||
|
'<|im_start|>user', '<|im_start|>assistant' # Qwen-specific markers
|
||||||
|
]
|
||||||
|
|
||||||
|
for marker in chat_turn_markers:
|
||||||
|
assert marker not in response, (
|
||||||
|
f"Self-conversation detected: Found '{marker}' in response. "
|
||||||
|
f"Model should stop before generating next turn."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Baseline: Verify we got a non-empty response
|
||||||
|
assert response.strip(), "Response should not be empty"
|
||||||
|
|
||||||
|
@pytest.mark.live_stop_tokens
|
||||||
|
def test_llama_regression():
|
||||||
|
"""Ensure Llama still works (control)."""
|
||||||
|
runner = MLXRunner("mlx-community/Llama-3.2-3B-Instruct-4bit")
|
||||||
|
response = runner.generate_batch("Hi", max_tokens=20)
|
||||||
|
|
||||||
|
assert response # Should generate something
|
||||||
|
assert "<|eot_id|>" not in response # Stop token filtered
|
||||||
|
```
|
||||||
|
|
||||||
|
**Test Phases:**
|
||||||
|
1. **Baseline:** Document current behavior (with workarounds)
|
||||||
|
2. **Generic Fix:** Apply 2-LOC change, test all 3 models
|
||||||
|
3. **Cleanup:** Remove obsolete workarounds if tests pass
|
||||||
|
|
||||||
|
**See:** `docs/ADR/appendix/ADR-009-test-plan.md` for details
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- ✅ **State-of-the-Art:** Uses mlx-lm APIs (same as reference implementation)
|
||||||
|
- ✅ **Minimal Code Change:** 2 LOC fix (`==` → `in`, twice)
|
||||||
|
- ✅ **Scalable:** New models automatically supported if configs are correct
|
||||||
|
- ✅ **Pragmatic:** Model-specific code stays if needed (with clear comments)
|
||||||
|
- ✅ **Non-Breaking:** Existing models continue working
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- ⚠️ **Upstream Bugs Remain:** HuggingFace configs may be incomplete
|
||||||
|
- ⚠️ **Test Dependency:** Requires real models (~3GB download for CI)
|
||||||
|
- ⚠️ **Partial Coverage:** Only focus models validated, not all models
|
||||||
|
|
||||||
|
### Risks & Mitigation
|
||||||
|
|
||||||
|
| Risk | Mitigation |
|
||||||
|
|------|------------|
|
||||||
|
| Generic approach breaks MXFP4 | Keep workaround if tests fail |
|
||||||
|
| Unknown models have issues | Users report, we fix incrementally |
|
||||||
|
| CI becomes slow (model downloads) | Use cached models, mark tests as slow |
|
||||||
|
|
||||||
|
### Trade-offs
|
||||||
|
|
||||||
|
**Accepted:**
|
||||||
|
- Not testing all models (focus on priority models only)
|
||||||
|
- Keeping some workarounds if needed (pragmatism over purity)
|
||||||
|
- Incremental improvement (not perfect, but better than status quo)
|
||||||
|
|
||||||
|
**Rejected:**
|
||||||
|
- Testing all models (unrealistic)
|
||||||
|
- Removing all workarounds blindly (risky)
|
||||||
|
- Waiting for upstream fixes (blocks progress)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
**Priority:** CRITICAL (Issue #32 open since September)
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
1. ✅ Research findings documented (this ADR)
|
||||||
|
2. ⏳ Implement real-model test suite (`test_stop_tokens_live.py`)
|
||||||
|
3. ⏳ Baseline measurement (document current behavior with all 3 models)
|
||||||
|
4. ⏳ Apply 2-LOC fix (runner/__init__.py:468, 589)
|
||||||
|
5. ⏳ Re-test & evaluate (compare before/after behavior)
|
||||||
|
6. ⏳ Conditional: Implement `add_eos_token()` integration (ONLY if tests fail)
|
||||||
|
7. ⏳ Conditional: Remove obsolete workarounds (ONLY if tests pass without them)
|
||||||
|
8. ⏳ Update TESTING.md + CHANGELOG.md + close Issue #32
|
||||||
|
|
||||||
|
**Estimated Effort:** 2-3 sessions (test suite implementation is non-trivial)
|
||||||
|
**Blocker for:** 2.0.0 stable release
|
||||||
|
|
||||||
|
**Key Decision Gate:** Step 5 → Step 6 (empirical testing determines if `add_eos_token()` is needed)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
### mlx-lm APIs Used
|
||||||
|
|
||||||
|
**TokenizerWrapper** (returned by `mlx_lm.load()`):
|
||||||
|
```python
|
||||||
|
# Property
|
||||||
|
tokenizer.eos_token_ids -> set[int] # All EOS token IDs
|
||||||
|
|
||||||
|
# Method
|
||||||
|
tokenizer.add_eos_token(token: str) -> None # Add token to EOS set
|
||||||
|
```
|
||||||
|
|
||||||
|
**Source:**
|
||||||
|
- `mlx_lm/tokenizer_utils.py:254` (TokenizerWrapper class)
|
||||||
|
- `mlx_lm/generate.py:701` (usage example: `if token in tokenizer.eos_token_ids`)
|
||||||
|
|
||||||
|
### Internal Documents
|
||||||
|
|
||||||
|
- **Research Findings (historical background):** `docs/ADR/appendix/ADR-009-research-findings.md`
|
||||||
|
- **Live Test Plan (authoritative):** `docs/ADR/appendix/ADR-009-test-plan.md`
|
||||||
|
- **Historical Transcripts:** `docs/ADR/appendix/ADR-009-gpt-oss-interview.md`, `docs/ADR/appendix/ADR-009-september-reasoning-discussion.md`
|
||||||
|
|
||||||
|
### External References
|
||||||
|
|
||||||
|
- **mlx-lm Source:** https://github.com/ml-explore/mlx-lm
|
||||||
|
- **HuggingFace MXFP4:** https://huggingface.co/mlx-community/gpt-oss-20b-MXFP4-Q8
|
||||||
|
|
||||||
|
### Related Issues
|
||||||
|
|
||||||
|
- **GitHub Issue #32:** Replace custom chat format with native Chat Templates
|
||||||
|
- **Issue #20:** End-Token filtering (defense-in-depth)
|
||||||
|
- **ADR-010:** Reasoning Content API (Phase 2)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Next Review:** After test suite implementation
|
||||||
|
**Decision Makers:** Project maintainer
|
||||||
|
**Stakeholders:** Beta.6 testers, downstream users
|
||||||
@@ -0,0 +1,249 @@
|
|||||||
|
# ADR-011: E2E Live Test Architecture
|
||||||
|
|
||||||
|
**Status:** Proposed (Planned for Post-Beta.6 / Stable 2.0)
|
||||||
|
**Date:** 2025-10-21
|
||||||
|
**Supersedes:** 1.1.1 `test_end_token_issue.py` comprehensive testing
|
||||||
|
**Affects:** Test Suite (Stable 2.0)
|
||||||
|
**Related:** ADR-009 (Stop Token Detection - provides Portfolio Discovery infrastructure)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### Problem Statement
|
||||||
|
|
||||||
|
**1.1.1 Had Comprehensive E2E Testing:**
|
||||||
|
- `test_end_token_issue.py` validated full model portfolio
|
||||||
|
- Server/HTTP API endpoints tested
|
||||||
|
- Streaming vs. Non-Streaming parity (Issue #20)
|
||||||
|
- CLI integration (`run`, `show`)
|
||||||
|
- RAM-aware portfolio testing
|
||||||
|
|
||||||
|
**2.0 Beta Gaps:**
|
||||||
|
- 95%+ unit tests with mocks/stubs
|
||||||
|
- <5% live tests (3 hard-coded models in `test_stop_tokens_live.py`)
|
||||||
|
- No E2E validation for Server/HTTP/CLI paths
|
||||||
|
- No systematic portfolio coverage beyond stop tokens
|
||||||
|
|
||||||
|
**Risk:**
|
||||||
|
Without E2E live tests, we cannot validate production behavior before Stable release:
|
||||||
|
- Server API correctness across model portfolio
|
||||||
|
- Streaming vs. batch parity (Issue #20 regression)
|
||||||
|
- CLI integration with real models
|
||||||
|
- Real-world usage patterns
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### E2E Live Test Suite for Stable 2.0
|
||||||
|
|
||||||
|
**Reuses ADR-009 Infrastructure:**
|
||||||
|
- Portfolio discovery (`discover_mlx_models_in_cache()`)
|
||||||
|
- RAM gating (`get_safe_ram_budget_gb()`, `should_skip_model()`)
|
||||||
|
- Test fixtures (`_use_real_mlx_modules`, `requires_hf_home`)
|
||||||
|
|
||||||
|
**New Test Areas:**
|
||||||
|
|
||||||
|
#### 1. Server/HTTP API Validation
|
||||||
|
```python
|
||||||
|
# tests_2.0/live/test_server_e2e.py
|
||||||
|
|
||||||
|
@pytest.mark.live_e2e
|
||||||
|
def test_server_streaming_portfolio():
|
||||||
|
"""Validate /v1/chat/completions SSE streaming over portfolio."""
|
||||||
|
for model in discover_portfolio():
|
||||||
|
with LocalServer(model) as server:
|
||||||
|
response = requests.post(f"{server.url}/v1/chat/completions",
|
||||||
|
json={"stream": True, ...})
|
||||||
|
# Validate SSE format, stop tokens, no visible EOS
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Streaming vs. Non-Streaming Parity (Issue #20)
|
||||||
|
```python
|
||||||
|
# tests_2.0/live/test_streaming_parity.py
|
||||||
|
|
||||||
|
@pytest.mark.live_e2e
|
||||||
|
def test_streaming_nonstreaming_parity_portfolio():
|
||||||
|
"""Validate streaming and non-streaming produce identical output (Issue #20)."""
|
||||||
|
for model in discover_portfolio():
|
||||||
|
runner = MLXRunner(model)
|
||||||
|
batch_output = runner.generate_batch(prompt, max_tokens=50)
|
||||||
|
stream_output = "".join(runner.generate_streaming(prompt, max_tokens=50))
|
||||||
|
|
||||||
|
# Issue #20: non-streaming previously had visible stop tokens
|
||||||
|
assert batch_output == stream_output
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. CLI Integration
|
||||||
|
```python
|
||||||
|
# tests_2.0/live/test_cli_e2e.py
|
||||||
|
|
||||||
|
@pytest.mark.live_e2e
|
||||||
|
def test_run_command_portfolio():
|
||||||
|
"""Validate mlxk run across portfolio."""
|
||||||
|
for model in discover_portfolio():
|
||||||
|
result = subprocess.run(
|
||||||
|
["mlxk", "run", model.id, "--prompt", "Test"],
|
||||||
|
capture_output=True, text=True
|
||||||
|
)
|
||||||
|
assert result.returncode == 0
|
||||||
|
assert "<|end|>" not in result.stdout
|
||||||
|
```
|
||||||
|
|
||||||
|
### Safety Requirements
|
||||||
|
|
||||||
|
**Read-Only Cache Access:**
|
||||||
|
- No pull/rm operations during tests
|
||||||
|
- Sentinel protection (`TEST-CACHE-SENTINEL` abort)
|
||||||
|
- Reuses ADR-007 CoW constraints
|
||||||
|
|
||||||
|
**RAM Gating:**
|
||||||
|
- Progressive budget (40%-70%, as specified in ADR-009; RAM gating is still pending together with portfolio discovery — see Implementation Plan, task 1)
|
||||||
|
- Auto-skip models exceeding available RAM
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
**Requires ADR-009 (Beta.6):**
|
||||||
|
- Portfolio discovery infrastructure
|
||||||
|
- RAM gating logic
|
||||||
|
- Test fixtures
|
||||||
|
|
||||||
|
**Relationship:**
|
||||||
|
- **ADR-009:** Develops portfolio infrastructure, tests Runner-level stop tokens
|
||||||
|
- **ADR-011:** Reuses portfolio infrastructure, tests E2E APIs
|
||||||
|
|
||||||
|
**No overlap:** ADR-009 = Runner tests, ADR-011 = E2E tests
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
**Priority:** HIGH (Required for Stable 2.0)
|
||||||
|
**Timeline:** Post-Beta.6, before Stable release
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
1. ⏳ **Implement ADR-009 Portfolio Discovery** (prerequisite for E2E tests)
|
||||||
|
- `discover_mlx_models_in_cache()` helper
|
||||||
|
- RAM gating logic (`should_skip_model()`)
|
||||||
|
2. ⏳ **Server E2E Tests** (`test_server_e2e.py`)
|
||||||
|
- HTTP API validation
|
||||||
|
- SSE streaming format
|
||||||
|
3. ⏳ **Streaming Parity Tests** (`test_streaming_parity.py`)
|
||||||
|
- Issue #20 regression protection
|
||||||
|
4. ⏳ **CLI Integration Tests** (`test_cli_e2e.py`)
|
||||||
|
- `mlxk run` validation
|
||||||
|
- Exit codes, error messages
|
||||||
|
5. ⏳ **Documentation Updates**
|
||||||
|
- TESTING.md: E2E test coverage section
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Status (2025-10-21)
|
||||||
|
|
||||||
|
**Status: NOT STARTED**
|
||||||
|
|
||||||
|
All tasks above are pending. This ADR documents the **planned architecture** for E2E tests.
|
||||||
|
|
||||||
|
**Current Reality:**
|
||||||
|
- No E2E test suite exists (`tests_2.0/live/test_server_e2e.py` etc. not created)
|
||||||
|
- Portfolio discovery not implemented (hard-coded 3 models in `test_stop_tokens_live.py:174`)
|
||||||
|
- ADR-009 provides **test plan** for portfolio discovery, but implementation deferred
|
||||||
|
|
||||||
|
**Blocker:**
|
||||||
|
- Requires Portfolio Discovery implementation (ADR-009 Step 1, currently incomplete)
|
||||||
|
|
||||||
|
**Next Steps:**
|
||||||
|
1. Complete ADR-009 Portfolio Discovery (Beta.6 scope)
|
||||||
|
2. Implement E2E test suite (Post-Beta.6, pre-Stable 2.0)
|
||||||
|
|
||||||
|
**Estimated Effort:** 2-3 sessions (reuses ADR-009 infrastructure)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Organization
|
||||||
|
|
||||||
|
**File Structure:**
|
||||||
|
```
|
||||||
|
tests_2.0/
|
||||||
|
├── test_stop_tokens_live.py # ADR-009: Runner stop tokens + portfolio
|
||||||
|
├── live/
|
||||||
|
│ ├── test_server_e2e.py # ADR-011: Server/HTTP
|
||||||
|
│ ├── test_streaming_parity.py # ADR-011: Issue #20
|
||||||
|
│ └── test_cli_e2e.py # ADR-011: CLI
|
||||||
|
```
|
||||||
|
|
||||||
|
**Markers:**
|
||||||
|
```python
|
||||||
|
@pytest.mark.live_e2e # E2E tests (ADR-011)
|
||||||
|
@pytest.mark.live_stop_tokens # Stop token tests (ADR-009)
|
||||||
|
@pytest.mark.slow # Both
|
||||||
|
```
|
||||||
|
|
||||||
|
**Run Strategy** (see TESTING.md for details):
|
||||||
|
```bash
|
||||||
|
pytest -m live_stop_tokens # ADR-009 only
|
||||||
|
pytest -m live_e2e # ADR-011 only
|
||||||
|
pytest # Unit tests (skips all live)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
- ✅ Production confidence before Stable release
|
||||||
|
- ✅ 1.1.1 test parity restored
|
||||||
|
- ✅ Issue #20/#32 regression protection
|
||||||
|
- ✅ Portfolio coverage (not limited to 3 models)
|
||||||
|
- ✅ Reusable infrastructure from ADR-009
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
- ⚠️ Portfolio tests may take 10-30 minutes (10-50 models)
|
||||||
|
- ⚠️ Maintenance overhead if Server API changes
|
||||||
|
|
||||||
|
### Trade-offs
|
||||||
|
|
||||||
|
**Accepted:**
|
||||||
|
- Live tests remain opt-in (see TESTING.md)
|
||||||
|
- Portfolio limited to user's cache (not all HF models)
|
||||||
|
|
||||||
|
**Rejected:**
|
||||||
|
- Testing all HuggingFace Hub models (unrealistic)
|
||||||
|
- Hard-coding model lists (not scalable)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
### Related Issues
|
||||||
|
- Issue #20: End token filtering (streaming vs. non-streaming)
|
||||||
|
- Issue #32: Stop token detection (ADR-009)
|
||||||
|
|
||||||
|
### Related ADRs
|
||||||
|
- ADR-009: Stop Token Detection Fix (provides portfolio infrastructure)
|
||||||
|
- ADR-007: Clone Implementation (CoW constraints)
|
||||||
|
- ADR-004: Enhanced Error Handling (error envelope validation)
|
||||||
|
|
||||||
|
### 1.1.1 Test Suite
|
||||||
|
- `test_end_token_issue.py`: Original comprehensive test (reference)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
**Beta.6 → Stable Transition** (target criteria — ✅ marks a required outcome, not a completed one; see Implementation Status):
|
||||||
|
1. ✅ ADR-009 portfolio discovery implemented
|
||||||
|
2. ✅ Server E2E tests cover ≥3 models (MXFP4, Qwen, Llama)
|
||||||
|
3. ✅ Streaming parity validated (Issue #20)
|
||||||
|
4. ✅ CLI integration tested
|
||||||
|
5. ✅ Documentation updated
|
||||||
|
|
||||||
|
**Definition of Done:**
|
||||||
|
```bash
|
||||||
|
pytest -m live_e2e -v # All tests pass or skip gracefully
|
||||||
|
```
|
||||||
|
|
||||||
|
No failures - only passes or skips (RAM/availability).
|
||||||
@@ -15,6 +15,10 @@ This directory contains Architecture Decision Records (ADRs) that document signi
|
|||||||
| [ADR-005](ADR-005-Clone-Implementation-Beta3.md) | Clone Implementation Beta3 | Superseded by ADR-007 | 2025-09-18 |
|
| [ADR-005](ADR-005-Clone-Implementation-Beta3.md) | Clone Implementation Beta3 | Superseded by ADR-007 | 2025-09-18 |
|
||||||
| [ADR-006](ADR-006-Clone-Implementation-Revised.md) | Clone Implementation Revised | Superseded by ADR-007 | 2025-09-18 |
|
| [ADR-006](ADR-006-Clone-Implementation-Revised.md) | Clone Implementation Revised | Superseded by ADR-007 | 2025-09-18 |
|
||||||
| [ADR-007](ADR-007-Clone-Implementation-Fixed.md) | Clone Implementation Fixed Strategy | Accepted | 2025-09-18 |
|
| [ADR-007](ADR-007-Clone-Implementation-Fixed.md) | Clone Implementation Fixed Strategy | Accepted | 2025-09-18 |
|
||||||
|
| [ADR-008](ADR-008-MLXModel-Package-Format.md) | MLXModel Package Format | Accepted | 2025-10-17 |
|
||||||
|
| [ADR-009](ADR-009-Stop-Token-Detection-Fix.md) | Stop Token Detection Fix | Accepted | 2025-10-21 |
|
||||||
|
| [ADR-010](ADR-010-Reasoning-Content-API.md) | Reasoning Content API | Draft | 2025-10-21 |
|
||||||
|
| [ADR-011](ADR-011-E2E-Live-Test-Architecture.md) | E2E Live Test Architecture | Proposed | 2025-10-21 |
|
||||||
|
|
||||||
## ADR Format
|
## ADR Format
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,239 @@
|
|||||||
|
# ADR-009 Appendix: GPT-OSS Interview (Historical Transcript)
|
||||||
|
|
||||||
|
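**Status:** Archived discovery log – kept for provenance; not part of the current implementation scope. The model's answer below is reproduced verbatim and is largely hallucinated: MLX is Apple's framework (not Meta's), and the `mlx.chat` / `ChatTemplate` / role-keyed `eos_token_ids[...]` APIs it describes do not exist in MLX or mlx-lm. Do not treat it as API documentation; see the ADR-009 references for the actual `TokenizerWrapper.eos_token_ids` API.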
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mlxk run gpt-oss-20b-MXFP4-Q8
|
||||||
|
```
|
||||||
|
# The Chat Asking gpt-oss :D
|
||||||
|
|
||||||
|
Starting interactive chat. Type 'exit' or 'quit' to end.
|
||||||
|
|
||||||
|
**You:** What is a Chat Template and how to use it with MLX/chat to retrieve the proper end-token for a chat bot ?
|
||||||
|
|
||||||
|
**Assistant:** <|channel|>analysis<|message|>We need to explain what a Chat Template is, likely referring to chat templates in the context of MLX/chat? Maybe MLX refers to a library for machine learning e.g., "mlx" by Meta? Wait, there is "mlx" open-source library for machine learning from Meta. Or "MLX chat" might refer to a specific chat interface built on top of MLX? Alternatively, Chat Template could be in the context of LLMs and tokenization: It's a template that defines how to format a conversation, including role tags like system, user, assistant, and maybe includes end tokens. The question: "What is a Chat Template and how to use it with MLX/chat to retrieve the proper end-token for a chat bot?" So we need to explain that Chat Template is like prompt template specifying the format of conversation, used by tokenization and decoding to know which tokens signal end-of-message. In MLX/chat, you would use the ChatTemplate class to format input and get end token ID. Provide code example.
|
||||||
|
|
||||||
|
Let's recall MLX chat: I think there's a library "mlx.chat" that provides utilities for LLM chat. The ChatTemplate might be a class to format messages into tokens, with placeholders like {role} and {content}. You can retrieve the end token via tokenizer.eos_token_id or chat_template.get_end_token().
|
||||||
|
|
||||||
|
Let's search memory: In Hugging Face tokenizers, you have .eos_token_id. For chat templates, you might use "get_end_token" method.
|
||||||
|
|
||||||
|
We can propose:
|
||||||
|
|
||||||
|
- ChatTemplate is a tool that maps the chat history into token IDs, respecting role-specific prefixes and suffixes defined by the underlying model's tokenizer.
|
||||||
|
|
||||||
|
- It ensures that you use the correct end-of-message token (e.g., for Llama2 it's 128001 or something). The chat template might provide a property `eos_token_id`.
|
||||||
|
|
||||||
|
- In MLX/chat, you instantiate a ChatTemplate with the tokenizer and optionally a template string. Then you can call `chat_template.encode(messages)` to get tokens, and `chat_template.eos_token` gives the token ID for message termination.
|
||||||
|
|
||||||
|
- Example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlx import chat
|
||||||
|
|
||||||
|
tokenizer = chat.load_tokenizer('meta/llama2-7b')
|
||||||
|
chat_template = chat.ChatTemplate(tokenizer)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content":"You are a helpful assistant."},
|
||||||
|
{"role": "user", "content":"Hello!"}
|
||||||
|
]
|
||||||
|
|
||||||
|
input_ids = chat_template.encode(messages)
|
||||||
|
eos_id = chat_template.eos_token # or tokenizer.eos_token_id
|
||||||
|
|
||||||
|
output_ids = model.generate(input_ids, eos_token_id=eos_id)
|
||||||
|
```
|
||||||
|
|
||||||
|
- The proper end-token ensures that the model stops generating when it reaches the token that signals message termination, preventing truncation or over-generation.
|
||||||
|
|
||||||
|
Also mention that some models use separate end-of-message tokens per role; the template may provide mapping like `message_end_ids` for user, assistant.
|
||||||
|
|
||||||
|
Explain usage: ChatTemplate can be used to format prompts and retrieve end token. It may provide `ChatTemplate.get_end_token(role)`.
|
||||||
|
|
||||||
|
Let's craft a detailed answer: definition, purpose, how to use with MLX/chat. Provide steps, code.
|
||||||
|
|
||||||
|
Also mention difference between end-of-sequence (EOS) and message-end token. In some models like Llama 2, there's a separate "end of user message" (e.g., 128001) and "assistant message" (128000). ChatTemplate ensures correct usage.
|
||||||
|
|
||||||
|
Let's produce final answer.<|end|><|start|>assistant<|channel|>final<|message|>## 1. What is a **Chat Template**?
|
||||||
|
|
||||||
|
A *chat template* is a small piece of logic that knows how to turn a list of “messages” (system, user, assistant) into the token stream that an LLM expects **and** knows which token(s) signal the end of a message.
|
||||||
|
|
||||||
|
*Why it matters*
|
||||||
|
|
||||||
|
| What you feed the model | Why it needs a template |
|
||||||
|
|------------------------|-------------------------|
|
||||||
|
| `["Hello"]` (plain text) | The model doesn’t know where the user’s turn ends. |
|
||||||
|
| `{"role":"user","content":"Hello"}` | The model needs the exact string that marks *the end of a user turn* (e.g., `������` or `<|assistant|>`). |
|
||||||
|
| Multiple turns | The model needs to keep track of *who* is speaking so it can insert the right role‑prefix and end‑token. |
|
||||||
|
|
||||||
|
A chat template does two things:
|
||||||
|
|
||||||
|
1. **Formats** the conversation into a sequence of tokens that respects the model’s required role prefixes and separators.
|
||||||
|
2. **Provides** the *end‑token id(s)* that you should pass to the generation routine so the model stops at the correct boundary.
|
||||||
|
|
||||||
|
### Typical components of a chat template
|
||||||
|
|
||||||
|
| Component | What it looks like in code | Example for Llama‑2 |
|
||||||
|
|-----------|---------------------------|---------------------|
|
||||||
|
| `role_prefix` | `"<|assistant|>"`, `"<|user|>"` | `"<s>"` or `<|assistant|>` |
|
||||||
|
| `role_suffix` | `"��<|assistant|>"`, `"<|assistant|>"` | `<eos>` or `<|end_of_text|>` |
|
||||||
|
| `eos_token_id` | Integer ID of the EOS token | 128001 for user, 128000 for assistant |
|
||||||
|
| `system_prompt` | Optional system text that is prepended once | `"You are a helpful assistant."` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Using the Chat Template in **MLX/chat**
|
||||||
|
|
||||||
|
Meta’s `mlx` library ships a lightweight chat helper that abstracts away these details. Below is a step‑by‑step walkthrough of how to:
|
||||||
|
|
||||||
|
1. Load a tokenizer and model.
|
||||||
|
2. Build the chat template (or load an existing one).
|
||||||
|
3. Encode a conversation to get input ids.
|
||||||
|
4. Retrieve the proper *end‑token* id for generation.
|
||||||
|
|
||||||
|
> **Note**: The examples below assume you have the `mlx` package installed (`pip install mlx`). Replace `"meta/llama2-7b"` with the model you actually want.
|
||||||
|
|
||||||
|
### 2.1 Import and load
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlx import chat, model, tokenizer # MLX’s sub‑packages
|
||||||
|
|
||||||
|
# Load a pretrained tokenizer
|
||||||
|
tok = tokenizer.load("meta/llama2-7b")
|
||||||
|
|
||||||
|
# Load the model (weights are loaded lazily on first use)
|
||||||
|
llama = model.load("meta/llama2-7b")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 Create a ChatTemplate
|
||||||
|
|
||||||
|
You can let MLX pick the *canonical* template for the model, or you can hand‑craft one.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Option 1: Let MLX infer the default template for this tokenizer/model
|
||||||
|
chat_tmpl = chat.ChatTemplate(tok)
|
||||||
|
|
||||||
|
# Option 2: Provide a custom template string (rarely needed)
|
||||||
|
custom_tmpl_str = (
|
||||||
|
"<s>"
|
||||||
|
"### System:\n{system}\n"
|
||||||
|
"### User:\n{user}"
|
||||||
|
"\n### Assistant: "
|
||||||
|
)
|
||||||
|
chat_tmpl = chat.ChatTemplate(tok, template=custom_tmpl_str)
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Tip**: `chat.ChatTemplate` automatically detects the role‑prefixes/suffixes defined in the tokenizer’s `token_to_id` mapping, so you usually just need to pass the tokenizer.
|
||||||
|
|
||||||
|
### 2.3 Prepare a conversation
|
||||||
|
|
||||||
|
```python
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
|
{"role": "user", "content": "How do I bake a cake?"},
|
||||||
|
# The assistant will reply next
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.4 Encode the conversation
|
||||||
|
|
||||||
|
```python
|
||||||
|
input_ids = chat_tmpl.encode(messages)
|
||||||
|
print("Input token ids:", input_ids)
|
||||||
|
```
|
||||||
|
|
||||||
|
`encode()` internally:
|
||||||
|
|
||||||
|
1. Inserts role prefixes (`<|assistant|>`, etc.).
|
||||||
|
2. Adds the *role‑specific* end‑token after each message.
|
||||||
|
3. Returns a `numpy.ndarray` or `torch.Tensor` of ids.
|
||||||
|
|
||||||
|
### 2.5 Retrieve the *proper* end‑token id
|
||||||
|
|
||||||
|
The chat template knows which token ends a message of a given role. You can fetch it like this:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# For the assistant turn – the token that marks *end of assistant message*
|
||||||
|
assistant_eos_id = chat_tmpl.eos_token_ids["assistant"]
|
||||||
|
print("Assistant EOS token id:", assistant_eos_id)
|
||||||
|
|
||||||
|
# For the user turn – if you ever need to truncate or stop on a user message
|
||||||
|
user_eos_id = chat_tmpl.eos_token_ids["user"]
|
||||||
|
```
|
||||||
|
|
||||||
|
If you just need the *generic* EOS token (the one that terminates a sequence), most models expose it as:
|
||||||
|
|
||||||
|
```python
|
||||||
|
generic_eos_id = tok.eos_token_id # e.g., 128001 for Llama‑2
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.6 Generate a response
|
||||||
|
|
||||||
|
Now feed everything into the model’s generate routine, making sure to pass the correct `eos_token_id`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Append a placeholder for the assistant reply; some libraries do this automatically
|
||||||
|
input_ids = input_ids.tolist() # convert to Python list if needed
|
||||||
|
|
||||||
|
# Generate tokens until the assistant EOS token appears
|
||||||
|
generated_ids = llama.generate(
|
||||||
|
input_ids, # initial conversation
|
||||||
|
max_new_tokens=200,
|
||||||
|
eos_token_id=assistant_eos_id, # <-- this is the key
|
||||||
|
)
|
||||||
|
|
||||||
|
# Decode the full output (conversation + assistant reply)
|
||||||
|
full_output = tok.decode(generated_ids, skip_special_tokens=False)
|
||||||
|
print(full_output)
|
||||||
|
```
|
||||||
|
|
||||||
|
Because you passed the **assistant‑specific** EOS token, generation stops *exactly* when the model emits that token – you won’t get a partial answer or an extra “\<eos\>” that belongs to the user turn.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Why “proper end‑token” matters
|
||||||
|
|
||||||
|
| Scenario | What can go wrong if you use the wrong EOS token? |
|
||||||
|
|----------|---------------------------------------------------|
|
||||||
|
| **You pass the generic EOS (`<eos>`)** | The model may stop too early (after a user message) or generate garbage after the assistant reply. |
|
||||||
|
| **You pass the user EOS** | The model will stop *before* it finishes its answer, cutting off sentences. |
|
||||||
|
| **You don’t pass any EOS** | The model might keep generating indefinitely until a hard max‑token limit is hit. |
|
||||||
|
|
||||||
|
The chat template abstracts this logic so you never have to remember which integer corresponds to which role.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Quick‑Reference Cheat Sheet
|
||||||
|
|
||||||
|
| Task | Code |
|
||||||
|
|------|------|
|
||||||
|
| Load tokenizer & model | `tok = tokenizer.load("meta/llama2-7b")`<br>`llm = model.load("meta/llama2-7b")` |
|
||||||
|
| Build template | `tmpl = chat.ChatTemplate(tok)` |
|
||||||
|
| Encode messages | `ids = tmpl.encode(messages)` |
|
||||||
|
| Get assistant EOS id | `assistant_eos = tmpl.eos_token_ids["assistant"]` |
|
||||||
|
| Generate reply | `out = llm.generate(ids, eos_token_id=assistant_eos)` |
|
||||||
|
| Decode output | `text = tok.decode(out)` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Common Pitfalls & FAQs
|
||||||
|
|
||||||
|
| Question | Answer |
|
||||||
|
|----------|--------|
|
||||||
|
| *Does every LLM have a separate assistant‑EOS token?* | Not always. Models like GPT‑3 use the same EOS for all roles, but Llama‑2, Claude, and others have role‑specific tokens (e.g., `128001` for user, `128000` for assistant). |
|
||||||
|
| *What if my tokenizer doesn’t expose `eos_token_id`?* | Use the chat template’s mapping: `tmpl.eos_token_ids[role]`. |
|
||||||
|
| *Can I change the role prefixes?* | Yes – pass a custom template string to `ChatTemplate`. |
|
||||||
|
| *Why does the assistant reply sometimes start with “<s>”?* | `<s>` is the “start‑of‑sequence” token that some models require before every generation. The template automatically inserts it. |
|
||||||
|
| *Do I need to truncate the user’s message?* | The template already adds the correct EOS after each user turn. Just keep feeding the full conversation to `generate`. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. TL;DR
|
||||||
|
|
||||||
|
- **Chat Template** = a helper that formats conversations into the exact token stream your LLM expects and tells you which *end‑token* marks the end of a particular role’s turn.
|
||||||
|
- In **MLX/chat** you create one with `chat.ChatTemplate(tokenizer)`.
|
||||||
|
- Encode your conversation: `ids = tmpl.encode(messages)`.
|
||||||
|
- Retrieve the proper EOS id for the role you’re generating (assistant): `eos_id = tmpl.eos_token_ids["assistant"]`.
|
||||||
|
- Pass that to the model’s `generate`: `llm.generate(ids, eos_token_id=eos_id)`.
|
||||||
|
|
||||||
|
That’s it—now your chat bot will stop generating *exactly* when the model signals that its turn is finished. Happy chatting!
|
||||||
@@ -0,0 +1,295 @@
|
|||||||
|
# Chat Templates, Stop Tokens & Reasoning: Konsolidierte Erkenntnisse
|
||||||
|
|
||||||
|
**Status:** Historical background – Stop-token findings inform ADR-009; reasoning/API notes preserved for future ADRs
|
||||||
|
**Dokumentiert:** 2025-09-08 (Initial), 2025-10-21 (Konsolidierung)
|
||||||
|
**Related:** Issue #32, ADR-004
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
**Problem:** Models generieren andere Stop-Tokens als konfiguriert, Reasoning-Content nicht API-konform
|
||||||
|
**Root Cause (gefunden 2025-10-21):** HuggingFace tokenizer configs unvollständig + unser Code nutzt falsche API
|
||||||
|
**Lösung:** 2-Phasen Approach (Beta.6: Stop-Token-Fix, 2.0.1+: Reasoning-API)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Die Kernfrage (September 2025)
|
||||||
|
**Welches End-Token gilt für welches Modell?**
|
||||||
|
|
||||||
|
## Was wir gelernt haben (September → Oktober)
|
||||||
|
|
||||||
|
### 1. Chat Templates sind NICHT Protokolle
|
||||||
|
- Chat Templates sind **Jinja2-Formatierungsanweisungen**
|
||||||
|
- Sie konvertieren strukturierte Messages zu Token-Sequenzen
|
||||||
|
- Sie replizieren das Format aus dem Training
|
||||||
|
- Sie definieren NICHT das Stop-Verhalten
|
||||||
|
|
||||||
|
### 2. End-Token Verwirrung
|
||||||
|
|
||||||
|
#### MXFP4 Modell Beispiel:
|
||||||
|
- **EOS Token**: `<|return|>` (tokenizer config)
|
||||||
|
- **Generiert aber**: `<|end|>` nach Messages
|
||||||
|
- **Problem**: `<|end|>` wird nicht als Stop-Token erkannt
|
||||||
|
- **Test erwartet**: `<|end|>` sollte gefiltert werden
|
||||||
|
|
||||||
|
#### Token-Typen:
|
||||||
|
1. **Control Tokens** (aus Training):
|
||||||
|
- `<|end|>` - Message-Ende Marker (MXFP4)
|
||||||
|
- `<|im_end|>` - Message-Ende (Qwen)
|
||||||
|
|
||||||
|
2. **Stop Tokens** (Generation beenden):
|
||||||
|
- `<|return|>` (MXFP4)
|
||||||
|
- `</s>` (Llama)
|
||||||
|
- `<|endoftext|>` (GPT)
|
||||||
|
|
||||||
|
3. **Template Tokens** (nur Formatierung):
|
||||||
|
- `<|start|>`, `<|message|>` etc.
|
||||||
|
|
||||||
|
### 3. Das eigentliche Problem
|
||||||
|
|
||||||
|
Modelle generieren verschiedene Tokens als "ich bin fertig":
|
||||||
|
- Manche nutzen ihr definiertes EOS Token
|
||||||
|
- Manche nutzen gelernte Pattern aus dem Training
|
||||||
|
- Manche nutzen beides
|
||||||
|
|
||||||
|
**MLX Knife muss wissen**:
|
||||||
|
- Was ist das offizielle EOS Token? (aus tokenizer config)
|
||||||
|
- Was generiert das Modell tatsächlich? (empirisch)
|
||||||
|
- Was sollte gefiltert werden? (beide?)
|
||||||
|
|
||||||
|
### 4. Unsere bisherige Implementierung
|
||||||
|
|
||||||
|
```text
|
||||||
|
# Aktuell in mlx_runner.py:
|
||||||
|
- Extrahiert EOS aus tokenizer
|
||||||
|
- Sucht nach "end"-ähnlichen Tokens
|
||||||
|
- ABER: Verpasst modell-spezifische Patterns wie <|end|>
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Server-Test Failures
|
||||||
|
|
||||||
|
- **MXFP4**: Generiert `<|end|>`, wird nicht gefiltert → Test fail
|
||||||
|
- **Qwen3**: Self-conversation (vermutlich andere Ursache)
|
||||||
|
|
||||||
|
## Offene Fragen
|
||||||
|
|
||||||
|
1. Sollten wir ALLE "end-like" Tokens aus dem Training als Stop-Tokens behandeln?
|
||||||
|
2. Oder nur die explizit als EOS definierten?
|
||||||
|
3. Wie gehen andere Implementierungen (Ollama, vLLM) damit um?
|
||||||
|
4. Brauchen wir modell-spezifische Stop-Token Listen?
|
||||||
|
5. **Legacy-Modelle**: Was ist mit alten Modellen ohne Chat Templates?
|
||||||
|
- Sind sie mit der neuen Implementation kompatibel?
|
||||||
|
- Brauchen wir einen Fallback auf Human:/Assistant:?
|
||||||
|
- Oder verweigern wir Support für template-lose Modelle?
|
||||||
|
|
||||||
|
## Legacy-Modell Kompatibilität
|
||||||
|
|
||||||
|
### Aktuelle Implementation
|
||||||
|
```python
|
||||||
|
# mlx_runner.py _format_conversation():
|
||||||
|
if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template:
|
||||||
|
# Use chat template
|
||||||
|
else:
|
||||||
|
# Fallback to _legacy_format_conversation (Human:/Assistant:)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Fragen zur Klärung:
|
||||||
|
- Gibt es überhaupt MLX-Modelle ohne Chat Templates?
|
||||||
|
- Wenn ja, funktioniert Human:/Assistant: für diese?
|
||||||
|
- Sollten wir sie überhaupt unterstützen?
|
||||||
|
|
||||||
|
## Nächste Schritte
|
||||||
|
|
||||||
|
1. **Inventur**: Welche Modelle haben keine Chat Templates?
|
||||||
|
2. **Empirisch testen**: Welche Tokens generieren die Modelle tatsächlich?
|
||||||
|
3. **Stop-Token Strategie**: Klare Regeln definieren
|
||||||
|
4. **Legacy-Strategie**: Fallback oder Deprecation?
|
||||||
|
5. **Implementation**: Robuste Token-Erkennung
|
||||||
|
6. **Tests anpassen**: Realistische Erwartungen
|
||||||
|
|
||||||
|
## Neue Erkenntnisse (Oktober 2025)
|
||||||
|
|
||||||
|
### Root Cause gefunden: HuggingFace + mlx_knife Code Bugs
|
||||||
|
|
||||||
|
**MXFP4 Tokenizer Config (HuggingFace):**
|
||||||
|
```jsonc
|
||||||
|
{
|
||||||
|
"eos_token": "<|return|>", // ID 200002
|
||||||
|
"eos_token_id": 200002, // SINGLE ID (falsch!)
|
||||||
|
"extra_special_tokens": {} // Leer!
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Was richtig wäre (wie Llama 3):**
|
||||||
|
```jsonc
|
||||||
|
{
|
||||||
|
"eos_token_id": [200002, 200007] // ARRAY: <|return|> UND <|end|>
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Unser Code Bug:**
|
||||||
|
```python
|
||||||
|
# mlxk2/core/runner/__init__.py:468, 589
|
||||||
|
if token_id == self.tokenizer.eos_token_id: # SINGULAR (falsch!)
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
**mlx-lm macht es richtig:**
|
||||||
|
```python
|
||||||
|
# mlx_lm/generate.py:stream_generate()
|
||||||
|
if token in tokenizer.eos_token_ids: # SET (korrekt!)
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
### mlx-lm Architektur-Analyse
|
||||||
|
|
||||||
|
**Pattern:** Keine model-spezifischen Workarounds in `mlx_lm/models/*.py`
|
||||||
|
- `gpt_oss.py`, `qwen2.py`, `llama.py` - Reine Architektur (forward pass)
|
||||||
|
- Stop-Token Handling: Nur in `generate.py` (generisch via tokenizer metadata)
|
||||||
|
- API: `tokenizer.add_eos_token(token)` für Runtime-Additions
|
||||||
|
|
||||||
|
**Erkenntnis:** mlx-lm vertraut auf korrekte HuggingFace configs. Broken configs → broken generation.
|
||||||
|
|
||||||
|
### Reasoning-Token Analyse
|
||||||
|
|
||||||
|
**OpenAI o1 / Responses API:**
|
||||||
|
- Reasoning bleibt **hidden** (nur token count sichtbar)
|
||||||
|
- Reasoning summaries via `reasoning.summary: "auto"`
|
||||||
|
- Keine `reasoning_content` im Chat Completions API
|
||||||
|
|
||||||
|
**DeepSeek R1 API:**
|
||||||
|
```python
|
||||||
|
response.choices[0].message.reasoning_content # Separates Feld!
|
||||||
|
response.choices[0].message.content # Final answer
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status Quo (mlx_knife):**
|
||||||
|
- Inline filtering via `StreamingReasoningParser`
|
||||||
|
- `hide_reasoning` Parameter (bereits vorhanden)
|
||||||
|
- Marker-basiert: `<|channel|>analysis<|message|>...` → entfernt
|
||||||
|
|
||||||
|
**Problem:** Nicht API-standard-konform, Client kann Reasoning nicht separat rendern
|
||||||
|
|
||||||
|
## Roadmap: 2-Phasen Approach
|
||||||
|
|
||||||
|
### Phase 1: Beta.6 - Stop Token Fix (BLOCKER)
|
||||||
|
|
||||||
|
**Scope:** Generische Mechanismen implementieren (KEIN Workaround-Gefrickel)
|
||||||
|
|
||||||
|
**Changes:**
|
||||||
|
1. ✅ **Fix Runner Stop-Check:**
|
||||||
|
```python
|
||||||
|
# Vorher (broken):
|
||||||
|
if token_id == self.tokenizer.eos_token_id:
|
||||||
|
|
||||||
|
# Nachher (correct):
|
||||||
|
if token_id in self.tokenizer.eos_token_ids:
|
||||||
|
```
|
||||||
|
|
||||||
|
2. ✅ **Add Stop Tokens via API:**
|
||||||
|
```python
|
||||||
|
# In _extract_stop_tokens():
|
||||||
|
for stop_token in self._stop_tokens:
|
||||||
|
self.tokenizer.add_eos_token(stop_token)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. ✅ **Defense-in-Depth behalten:**
|
||||||
|
- String-based filtering (Issue #20) bleibt als Fallback
|
||||||
|
- Reasoning parser bleibt wie ist
|
||||||
|
|
||||||
|
**Non-Scope (Beta.6):**
|
||||||
|
- ❌ KEINE Reasoning-API Changes (breaking)
|
||||||
|
- ❌ KEINE HuggingFace Issues melden (noch nicht)
|
||||||
|
- ❌ KEINE model-spezifischen Workarounds (erst nach Real-Model Tests)
|
||||||
|
|
||||||
|
**Test Strategy:**
|
||||||
|
- Real-Model Test Suite (MXFP4, Qwen3, Llama3.2)
|
||||||
|
- Validate stop token detection
|
||||||
|
- Measure before/after behavior
|
||||||
|
|
||||||
|
### Phase 2: 2.0.1+ - Reasoning API (Enhancement)
|
||||||
|
|
||||||
|
**Goal:** API-standard-konforme Reasoning-Unterstützung
|
||||||
|
|
||||||
|
**Design:** DeepSeek-Style (Option B)
|
||||||
|
```python
|
||||||
|
# Response structure:
|
||||||
|
{
|
||||||
|
"choices": [{
|
||||||
|
"message": {
|
||||||
|
"content": "Final answer", # Existing
|
||||||
|
"reasoning_content": "CoT...", # NEW
|
||||||
|
"role": "assistant"
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Streaming:**
|
||||||
|
```python
|
||||||
|
# SSE chunks:
|
||||||
|
data: {"choices":[{"delta":{"content":"Hello"}}]}
|
||||||
|
data: {"choices":[{"delta":{"reasoning_content":"step 1..."}}]}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Client Benefits:**
|
||||||
|
- Web UI kann Reasoning optional einblenden (wie GPT-5 chat)
|
||||||
|
- Lokale Clients haben klare API-Struktur
|
||||||
|
- Runner code als Vorlage für broke cluster
|
||||||
|
|
||||||
|
**Implementation Tasks:**
|
||||||
|
1. Extend `ChatCompletionResponse` model
|
||||||
|
2. Modify `StreamingReasoningParser` → separate output streams
|
||||||
|
3. Add `include_reasoning` request parameter
|
||||||
|
4. Update server endpoints
|
||||||
|
5. Write API docs + examples
|
||||||
|
|
||||||
|
**Breaking Changes:**
|
||||||
|
- Opt-in: Default `include_reasoning=false` (backward compat)
|
||||||
|
- Existing clients funktionieren weiter
|
||||||
|
|
||||||
|
## Issue #32 Status Update
|
||||||
|
|
||||||
|
**Original Problem (September):** Hardcodiertes Human:/Assistant: Format
|
||||||
|
- ✅ **Gelöst:** Chat Templates werden verwendet
|
||||||
|
|
||||||
|
**Problem 1 (Oktober):** Stop-Token Detection
|
||||||
|
- 🔄 **Beta.6:** Generischer Fix (eos_token_ids Set)
|
||||||
|
- 📅 **Status:** Implementierung anstehend
|
||||||
|
|
||||||
|
**Problem 2 (Future):** Reasoning API
|
||||||
|
- 📋 **2.0.1+:** Separate `reasoning_content` field
|
||||||
|
- 📅 **Status:** Konzept definiert, Implementation später
|
||||||
|
|
||||||
|
## Offene Fragen (für später)
|
||||||
|
|
||||||
|
1. **HuggingFace Issues melden?**
|
||||||
|
- MXFP4 tokenizer config fix (`eos_token_id` → array)
|
||||||
|
- Erst nach Validation mit Real-Model Tests
|
||||||
|
|
||||||
|
2. **mlx-lm Enhancement vorschlagen?**
|
||||||
|
- Warning wenn chat_template tokens nicht in `eos_token_ids`
|
||||||
|
- Bessere Docs für `--extra-eos-token`
|
||||||
|
- Erst nach Beta.6 Validation
|
||||||
|
|
||||||
|
3. **Legacy-Modelle ohne Chat Templates?**
|
||||||
|
- Inventur durchführen (gibt es überhaupt welche?)
|
||||||
|
- Fallback behalten oder deprecaten?
|
||||||
|
|
||||||
|
## Referenzen
|
||||||
|
|
||||||
|
- **September Research:** docs/GPT5_CHAT_REASONING_FINDINGS.md
|
||||||
|
- **Model Interview:** docs/GPT-OSS_EXPLAINS_CHAT-TEMPLATES.md (Appendix)
|
||||||
|
- **Upstream Findings:** docs/ISSUES/ISSUE-32-UPSTREAM-FINDINGS.md
|
||||||
|
- **mlx-lm Source:** https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/generate.py
|
||||||
|
- **DeepSeek API:** https://api-docs.deepseek.com/guides/reasoning_model
|
||||||
|
- **OpenAI Responses API:** https://cookbook.openai.com/examples/responses_api/reasoning_items
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Next Session:**
|
||||||
|
- [ ] Implement stop token fix (Phase 1)
|
||||||
|
- [ ] Run Real-Model Test Suite (validation)
|
||||||
|
- [ ] Create Issue for Phase 2 (Reasoning API)
|
||||||
|
- [ ] Consider upstream issue reports (after validation)
|
||||||
@@ -0,0 +1,209 @@
|
|||||||
|
## ADR-009 Appendix: Reasoning Scope Notes (September 2025)
|
||||||
|
|
||||||
|
**Status:** Archived exploration – captures early reasoning API considerations now owned by future ADRs
|
||||||
|
|
||||||
|
## Meine Fragen im ChatGPT-Dialog (GPT5-Thinking):
|
||||||
|
|
||||||
|
```plaintext
|
||||||
|
wie sollte ein OpenAI kompatibler Server optimal mit gpt-oss umgehen?
|
||||||
|
...
|
||||||
|
ich frage, wie das mlx-knife handhaben soll - eine MLX version ist verfügbar mit MXFP4 unterstützung.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Antwort
|
||||||
|
|
||||||
|
**Kurz gesagt:**
|
||||||
|
|
||||||
|
Erwarte Reasoning nie „heimlich“ (keine impliziten Chain-of-Thought-Leaks). Erzeuge Reasoning explizit durch das Modell (structured output / function call / separate pass) und mache die Freigabe/Anzeige kontrollierbar (flags, audit, encrypt). Diese Herangehensweise ist kompatibel mit dem, was etablierte Anbieter empfehlen (Responses API → reasoning summaries), und lässt sich lokal für MLX-Modelle sauber umsetzen.  
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
1 — Designprinzipien (high level)
|
||||||
|
1. Explizit statt implizit: Reasoning ist nur dann Teil der Ausgabe, wenn du das Modell explizit so aufforderst (Prompt oder separate Reasoning-Pass). Default: kein CoT.
|
||||||
|
2. Strukturierte Antwort bevorzugen: Fordere das Modell auf, eine JSON-Struktur zu liefern: { "answer": "...", "reasoning": "...", "score": ... }. Server parst und trennt sicher.
|
||||||
|
3. Feinsteuerung per Flag: Client gibt include_reasoning=none|summary|full (oder reasoning=summary) an. Server liefert nur das, wozu Client berechtigt/angefragt hat.
|
||||||
|
4. Streaming mit Kanal-Metadaten: Beim Streamen markiere Reasoning-Deltas ausdrücklich (z. B. delta.reasoning vs delta.content), statt sie als normalen Text zu mischen.
|
||||||
|
5. Validierung & Audit: Prüfe/validiere JSON (jsonschema). Aufbewahrung von kompletten Reasoning-Traces optional & verschlüsselt (audit).
|
||||||
|
6. Fallbacks: Wenn kein strukturiertes Format kommt, suche nach Marker-Patterns (z. B. <<REASONING>>...<</REASONING>>) oder versuche post-hoc einen reasoning_summary via zweitem Prompt zu erzeugen.
|
||||||
|
|
||||||
|
(Praktisch relevant für MLX-Modelle / MXFP4 — mlx-knife kann Tokenizer/chat_template aus dem Modell verwenden, siehe HF-Modelcards und MLX-ecosystem).  
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
2 — API-Contract (proposal)
|
||||||
|
|
||||||
|
/v1/chat/completions (OpenAI-compatible) — zusätzliche, optionale Felder:
|
||||||
|
|
||||||
|
Request (relevante Felder)
|
||||||
|
|
||||||
|
{
|
||||||
|
"model":"mlx-community/gpt-oss-20b-MXFP4-Q4",
|
||||||
|
"messages":[ ... ],
|
||||||
|
"stream": true,
|
||||||
|
"reasoning": "none" | "summary" | "full", // NEW: server honoriert das
|
||||||
|
"reasoning_encrypt": true | false, // optional: server speichert verschlüsselt
|
||||||
|
"response_schema": { "type":"json", "schema": { ... } } // optional zwingt JSON-Ausgabe
|
||||||
|
}
|
||||||
|
|
||||||
|
Response (streamed SSE chunks): klassische chat.completion.chunk mit erweitertem Delta-Shape:
|
||||||
|
|
||||||
|
// Normaler answer-delta
|
||||||
|
{
|
||||||
|
"object":"chat.completion.chunk",
|
||||||
|
"model":"...",
|
||||||
|
"choices":[{"index":0,"delta":{"content":"Hello, "},"finish_reason":null}]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reasoning-delta (klar gekennzeichnet)
|
||||||
|
{
|
||||||
|
"object":"chat.completion.chunk",
|
||||||
|
"model":"...",
|
||||||
|
"choices":[{"index":0,"delta":{"reasoning":"step1: consider X..."},"finish_reason":null}]
|
||||||
|
}
|
||||||
|
|
||||||
|
Am Stream-Ende wie gewohnt data: [DONE].
|
||||||
|
(Alternative OpenAI-kompatible Variante: benutze choices[].delta mit delta.content und delta.meta.reasoning=true — aber ein dedicated delta.reasoning Feld ist klarer für Clients.)
|
||||||
|
|
||||||
|
Wichtig: Wenn reasoning=none, streamt der Server ausschließlich delta.content. Wenn reasoning=summary, erzeugt der Server (während oder nach der Antwort) eine kompakte natural-language reasoning_summary und streamt diese in einem eigenen delta.reasoning_summary-Event oder liefert sie als separate API-Antwort. 
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
3 — Prompting / Tokenizer-Handling (praktisch für MLX)
|
||||||
|
• System prompt steuert Erzeugung: wenn du response_schema verlangst, erzeugt dein prompt explizit die JSON-Ausgabe. Nutze HF-chat_template und tokenizer.apply_chat_template(...) wie gehabt (wichtig für MLX-models).  
|
||||||
|
• Beispiel-System-Prompt (verkürzt):
|
||||||
|
|
||||||
|
System: You must return a JSON object with keys "answer" (string) and "reasoning" (string).
|
||||||
|
If the client requested "reasoning": "none", put reasoning = "".
|
||||||
|
Do not include extra text outside the JSON.
|
||||||
|
|
||||||
|
• Wenn Streaming: instruct model to emit JSON in streamable chunks (choose safe delimiters) or use incremental JSON tokenization conventions (clients must reassemble). Consider response_schema to force well-formed JSON.
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
4 — Streaming: technische Details & client expectations
|
||||||
|
• Server-side: empfange backend-tokenstream; klassifiziere jedes Token/chunk als answer vs reasoning basierend auf:
|
||||||
|
1. explicit JSON keys (best), oder
|
||||||
|
2. markers (<<REASONING>>), oder
|
||||||
|
3. function-call events (siehe unten).
|
||||||
|
• Client: erwartet SSE events mit delta.content (answer) und optional delta.reasoning (reasoning). Client UI zeigt standardmäßig answer inkrementell. reasoning wird verdeckt/optional angezeigt (z. B. “Show reasoning” button) oder in dev/debug mode automatisch expanded.
|
||||||
|
|
||||||
|
Beispiel SSE event payload:
|
||||||
|
|
||||||
|
data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"The result is 42."}}]}
|
||||||
|
|
||||||
|
data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"reasoning":"I computed 6*7 because..."} } ]}
|
||||||
|
|
||||||
|
data: [DONE]
|
||||||
|
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
5 — Drei praktikable Implementationsmuster (mit Vor-/Nachteilen)
|
||||||
|
|
||||||
|
A) Structured JSON output (empfohlen)
|
||||||
|
• Wie: Prompt zwingt JSON {answer, reasoning}. Server json.loads() und trennt Felder.
|
||||||
|
• Pro: Robust, leicht zu validieren, kein Heuristik-Chaos.
|
||||||
|
• Contra: JSON-Stream-Parsing kann knifflig sein; erfordert gutes Prompt-Engineering.
|
||||||
|
|
||||||
|
B) Function-Call / Tool pattern
|
||||||
|
• Wie: Fordere das Modell auf, eine Pseudo-Funktion report_reasoning({ ... }) aufzurufen (ähnlich OpenAI function call). Der Server fängt den function_call ab und behandelt dessen arguments als reasoning.
|
||||||
|
• Pro: Natürliche Trennung; streaming-fähig (arguments können über Chunks kommen).
|
||||||
|
• Contra: Erfordert funktionale Unterstützung im Backend (MLX wrappers können das aber handhaben). 
|
||||||
|
|
||||||
|
C) Markers in plain text (Fallback)
|
||||||
|
• Wie: Model schreibt <<REASONING>> ... <</REASONING>> vor/zwischen den Antworten. Server sucht Marker.
|
||||||
|
• Pro: Einfach umzusetzen bei Models ohne JSON-Disziplin.
|
||||||
|
• Contra: Brüchig (Modelle vergessen Marker).
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
6 — Security / Policy / UX Regeln (wichtig)
|
||||||
|
• Default: reasoning=none. Reasoning nur auf explizite Nachfrage. Logge/versichere Consent, wenn Reasoning gespeichert wird (es kann sensible Interna enthalten).
|
||||||
|
• Biete reasoning_summary (automatisch generiert) statt voller CoT als Standard; das ist oft genug für debug/trace und weniger riskant. OpenAI empfiehlt genau so eine Trennung (Responses API bietet reasoning summaries).  
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
7 — Fallbacks & Heuristics (wenn Modell nicht kooperiert)
|
||||||
|
1. Try parse JSON → success → separate.
|
||||||
|
2. Else: look for markers <<REASONING>> → split.
|
||||||
|
3. Else: run post-hoc prompt: Given the answer above, summarize the reasoning steps that justify it. → attach reasoning_summary. (Das ist sicherer als rohe CoT-Leaks.)
|
||||||
|
4. Wenn client requested full CoT and model refuses or output undecodable → return warning + reasoning_unavailable flag.
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
8 — Mini-FastAPI-Sketch (Streaming + JSON-separation)
|
||||||
|
|
||||||
|
Nachfolgend ein kurzes Beispiel wie mlx-knife serve das trennen könnte — konzeptionell, anpassbar an Deine Backend-Adapter (MLX/mlx_lm).
|
||||||
|
|
||||||
|
# server_reasoning.py (sketch)
|
||||||
|
from fastapi import FastAPI, Request
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
import json, time
|
||||||
|
|
||||||
|
app = FastAPI()
|
||||||
|
|
||||||
|
# backend stream generator -> yields raw token chunks (strings)
|
||||||
|
def backend_stream_generate(prompt, model_id):
|
||||||
|
# >>> adapt to mlx_lm streaming API
|
||||||
|
for token in ["{","\"answer\":","\"42\"",",","\"reasoning\":\"","I multiplied...","\"","}"]:
|
||||||
|
yield token
|
||||||
|
time.sleep(0.01)
|
||||||
|
|
||||||
|
def assemble_stream(prompt, model_id, reasoning_mode):
|
||||||
|
"""
|
||||||
|
Reads backend token stream and yields SSE events with typed deltas:
|
||||||
|
- delta.content for answer
|
||||||
|
- delta.reasoning for reasoning
|
||||||
|
Uses simple JSON reassembly for demo.
|
||||||
|
"""
|
||||||
|
buf = ""
|
||||||
|
for tok in backend_stream_generate(prompt, model_id):
|
||||||
|
buf += tok
|
||||||
|
# try to parse JSON once complete-ish
|
||||||
|
try:
|
||||||
|
obj = json.loads(buf)
|
||||||
|
# we assume model returned full JSON obj
|
||||||
|
if "answer" in obj:
|
||||||
|
# stream the answer as one chunk (or split further)
|
||||||
|
yield json.dumps({"choices":[{"index":0,"delta":{"content":obj["answer"]}}]})
|
||||||
|
if reasoning_mode != "none" and "reasoning" in obj:
|
||||||
|
yield json.dumps({"choices":[{"index":0,"delta":{"reasoning": obj["reasoning"]}}]})
|
||||||
|
buf = "" # reset
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
# not yet complete: do nothing (or stream tokens as best_effort)
|
||||||
|
continue
|
||||||
|
# finish
|
||||||
|
yield "[DONE]"
|
||||||
|
|
||||||
|
@app.post("/v1/chat/completions")
|
||||||
|
async def chat(req: Request):
|
||||||
|
body = await req.json()
|
||||||
|
reasoning = body.get("reasoning","none") # none|summary|full
|
||||||
|
prompt = "<built-from-messages>" # build via HF tokenizer in real impl
|
||||||
|
model_id = body.get("model")
|
||||||
|
def sse():
|
||||||
|
for event in assemble_stream(prompt, model_id, reasoning):
|
||||||
|
if event == "[DONE]":
|
||||||
|
yield "data: [DONE]\n\n"
|
||||||
|
else:
|
||||||
|
yield f"data: {event}\n\n"
|
||||||
|
return StreamingResponse(sse(), media_type="text/event-stream")
|
||||||
|
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
9 — Concrete recommendations für mlx-knife (konkret & priorisiert)
|
||||||
|
1. Implementiere --reasoning flag für mlx-knife run / serve (none/summary/full). Default none.
|
||||||
|
2. Support response_schema/json output im run path — benutze HF chat_template + system prompt, parse JSON, separate fields. (Das ist die robusteste Methode für MLX runs — siehe HF model pages wie mlx-community/gpt-oss-20b-MXFP4-Q4 die tokenizer.chat_template beschreiben). 
|
||||||
|
3. Streaming: add field types to SSE (delta.reasoning), so UIs can decide whether to render reasoning inline.
|
||||||
|
4. Tool/Function approach: wenn dein backend (mlx_lm) das kann, support function-call style report_reasoning(...) so you can get reasoning as a function result.
|
||||||
|
5. Post-hoc summary: wenn full CoT not available/undesired, always offer --reasoning summary that triggers a small second pass to produce a concise reasoning summary. This is lightweight & safe. 
|
||||||
|
6. Logging & audit: store full CoT only with explicit opt-in and encryption.
|
||||||
|
|
||||||
|
⸻
|
||||||
|
|
||||||
|
Quellen / Weiterlesen (relevant)
|
||||||
|
• MLX model example (MXFP4 model card, shows mlx-lm usage & chat_template). 
|
||||||
|
• MLX / mlx-lm docs (how to load/generate on Apple silicon). 
|
||||||
|
• OpenAI Responses API / Reasoning summaries (official guidance why reasoning should be a separate capability).  
|
||||||
|
• Community guidance re: how to handle raw CoT in Chat Completions (advice and conventions). 
|
||||||
@@ -0,0 +1,134 @@
|
|||||||
|
# ADR-009 Appendix: Test Plan
|
||||||
|
|
||||||
|
**Status:** Active – authoritative live-test blueprint for ADR-009
|
||||||
|
**Related:** ADR-009 Stop Token Detection Fix
|
||||||
|
**Purpose:** Real-model validation strategy for Beta.6
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Models
|
||||||
|
|
||||||
|
### Representative Models (Initial Validation)
|
||||||
|
|
||||||
|
| Model | ID | Expected Issue | Purpose |
|
||||||
|
|-------|----|----|---------|
|
||||||
|
| **MXFP4** | `mlx-community/gpt-oss-20b-MXFP4-Q8` | `<\|end\|>` visible in output | Validate stop token fix |
|
||||||
|
| **Qwen 2.5** | `mlx-community/Qwen2.5-0.5B-Instruct-4bit` | Self-conversation (?) | Validate chat template handling |
|
||||||
|
| **Llama 3.2** | `mlx-community/Llama-3.2-3B-Instruct-4bit` | None (control) | Regression testing |
|
||||||
|
|
||||||
|
**Note:** These 3 models serve as initial validation. Full portfolio testing (below) extends coverage to all MLX models in user cache.
|
||||||
|
|
||||||
|
### Portfolio Discovery (Production Validation)
|
||||||
|
|
||||||
|
Instead of hard-coded models, iterate over all MLX-compatible models in user cache:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def discover_mlx_models_in_cache(hf_home: str) -> List[ModelInfo]:
|
||||||
|
"""Scan HF_HOME/hub/models--*/snapshots/* for MLX models.
|
||||||
|
|
||||||
|
Filters:
|
||||||
|
- MLX-compatible: Has safetensors + config.json
|
||||||
|
- RAM-aware: Estimates model size, skips if exceeds budget
|
||||||
|
|
||||||
|
Returns: List of discovered models with metadata
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
**RAM Gating** (already implemented in `test_stop_tokens_live.py`):
|
||||||
|
- Progressive budget: 40% (16GB), 50% (32GB), 60% (64GB), 70% (96GB+)
|
||||||
|
- Auto-skip models exceeding available RAM
|
||||||
|
- See `get_safe_ram_budget_gb()`, `should_skip_model()` helpers
|
||||||
|
|
||||||
|
**Safety:**
|
||||||
|
- Read-only cache access (no pull/rm)
|
||||||
|
- Sentinel protection (`TEST-CACHE-SENTINEL`)
|
||||||
|
- See ADR-007 for CoW constraints
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Phases
|
||||||
|
|
||||||
|
### Phase 1: Baseline Measurement
|
||||||
|
|
||||||
|
**Goal:** Document current broken behavior
|
||||||
|
|
||||||
|
**Test Case:**
|
||||||
|
```python
|
||||||
|
prompt = "Write one sentence about cats."
|
||||||
|
output = runner.generate_streaming(prompt, max_tokens=50)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Collect:**
|
||||||
|
- Full generated text
|
||||||
|
- Token IDs (if accessible)
|
||||||
|
- Stop condition (why stopped?)
|
||||||
|
- Visible stop tokens
|
||||||
|
|
||||||
|
**Expected Baseline Results:**
|
||||||
|
- MXFP4: `<|end|>` appears in output ✗
|
||||||
|
- Qwen: TBD (may self-converse) ?
|
||||||
|
- Llama: Clean output ✓
|
||||||
|
|
||||||
|
### Phase 2: Fix Validation
|
||||||
|
|
||||||
|
**After implementing fix, same test case**
|
||||||
|
|
||||||
|
**Expected After-Fix Results:**
|
||||||
|
- MXFP4: No stop tokens visible ✓
|
||||||
|
- Qwen: No self-conversation ✓
|
||||||
|
- Llama: Still works (no regression) ✓
|
||||||
|
|
||||||
|
### Phase 3: Empirical Mapping
|
||||||
|
|
||||||
|
**Document tokenizer configs:**
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"model": "gpt-oss",
|
||||||
|
"configured_eos": ["<|return|>"], # From tokenizer
|
||||||
|
"generated_tokens": ["<|end|>", ...], # Empirically observed
|
||||||
|
"workaround_needed": True/False
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Implementation
|
||||||
|
|
||||||
|
**File:** `tests_2.0/test_stop_tokens_live.py`
|
||||||
|
|
||||||
|
**Markers:**
|
||||||
|
```python
|
||||||
|
@pytest.mark.live_stop_tokens # Requires models downloaded
|
||||||
|
@pytest.mark.slow # >1 min per model
|
||||||
|
```
|
||||||
|
|
||||||
|
**Run:**
|
||||||
|
```bash
|
||||||
|
# Baseline
|
||||||
|
pytest tests_2.0/test_stop_tokens_live.py::test_baseline -v -m live_stop_tokens
|
||||||
|
|
||||||
|
# After fix
|
||||||
|
pytest tests_2.0/test_stop_tokens_live.py::test_validation -v -m live_stop_tokens
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
**Initial Validation (3 Models):**
|
||||||
|
✅ **Phase 1 Complete:** Baseline measurements documented
|
||||||
|
✅ **Phase 2 Complete:** All 3 models pass validation tests
|
||||||
|
✅ **Phase 3 Complete:** Empirical mapping generated (test artifact: `stop_token_config_report.json`)
|
||||||
|
|
||||||
|
**Portfolio Validation (All Models in Cache):**
|
||||||
|
⏳ **Portfolio Discovery:** Planned (currently hard-coded 3-model `TEST_MODELS` dict)
|
||||||
|
⏳ **Cache Iterator:** Planned (`discover_mlx_models_in_cache()` not yet implemented)
|
||||||
|
⏳ **Dynamic Validation:** Planned (scale to all models in user cache, not just 3)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- **ADR-009 Main:** Implementation details, 2-LOC fix, `add_eos_token()` fallback
|
||||||
|
- **ADR-011:** E2E Live Test Architecture (Server/HTTP/CLI validation, reuses portfolio discovery)
|
||||||
|
- **TESTING.md:** Live test execution, markers, environment setup
|
||||||
+1
-1
@@ -7,4 +7,4 @@ import warnings
|
|||||||
# Issue parity with 1.1.0 (Issue #22)
|
# Issue parity with 1.1.0 (Issue #22)
|
||||||
warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+')
|
warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+')
|
||||||
|
|
||||||
__version__ = "2.0.0b5"
|
__version__ = "2.0.0b6"
|
||||||
|
|||||||
@@ -464,8 +464,8 @@ class MLXRunner:
|
|||||||
yield new_text
|
yield new_text
|
||||||
tokens_generated += 1
|
tokens_generated += 1
|
||||||
|
|
||||||
# Check for EOS token
|
# Check for EOS token (ADR-009: use eos_token_ids Set for multi-EOS models)
|
||||||
if token_id == self.tokenizer.eos_token_id:
|
if token_id in self.tokenizer.eos_token_ids:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Finalize reasoning parser if used
|
# Finalize reasoning parser if used
|
||||||
@@ -586,7 +586,8 @@ class MLXRunner:
|
|||||||
generated_tokens.append(token_id)
|
generated_tokens.append(token_id)
|
||||||
all_tokens.append(token_id)
|
all_tokens.append(token_id)
|
||||||
|
|
||||||
if token_id == self.tokenizer.eos_token_id:
|
# Check for EOS token (ADR-009: use eos_token_ids Set for multi-EOS models)
|
||||||
|
if token_id in self.tokenizer.eos_token_ids:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Decode full response
|
# Decode full response
|
||||||
|
|||||||
@@ -47,6 +47,12 @@ def extract_stop_tokens(tokenizer: Any, verbose: bool = False) -> StopTokenInfo:
|
|||||||
if isinstance(token_content, str) and token_content:
|
if isinstance(token_content, str) and token_content:
|
||||||
token_lower = token_content.lower()
|
token_lower = token_content.lower()
|
||||||
if token_content == '<|end|>':
|
if token_content == '<|end|>':
|
||||||
|
add_eos_token = getattr(tokenizer, 'add_eos_token', None)
|
||||||
|
if callable(add_eos_token):
|
||||||
|
try:
|
||||||
|
add_eos_token(token_content)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
continue
|
continue
|
||||||
end_patterns = ['stop', 'eot', 'return', 'finish', 'done', 'im_end']
|
end_patterns = ['stop', 'eot', 'return', 'finish', 'done', 'im_end']
|
||||||
if any(pattern in token_lower for pattern in end_patterns):
|
if any(pattern in token_lower for pattern in end_patterns):
|
||||||
@@ -115,4 +121,3 @@ def extract_stop_tokens(tokenizer: Any, verbose: bool = False) -> StopTokenInfo:
|
|||||||
reasoning_end=reasoning_end,
|
reasoning_end=reasoning_end,
|
||||||
final_start=final_start,
|
final_start=final_start,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
+16
-11
@@ -9,7 +9,7 @@ from ..core.runner import MLXRunner
|
|||||||
from ..core.cache import get_current_model_cache, hf_to_cache_dir
|
from ..core.cache import get_current_model_cache, hf_to_cache_dir
|
||||||
from ..core.model_resolution import resolve_model_for_operation
|
from ..core.model_resolution import resolve_model_for_operation
|
||||||
from ..operations.health import check_runtime_compatibility
|
from ..operations.health import check_runtime_compatibility
|
||||||
from ..operations.common import detect_framework
|
from ..operations.common import detect_framework, read_front_matter
|
||||||
|
|
||||||
|
|
||||||
def run_model(
|
def run_model(
|
||||||
@@ -63,6 +63,8 @@ def run_model(
|
|||||||
if model_cache_dir.exists():
|
if model_cache_dir.exists():
|
||||||
snapshots_dir = model_cache_dir / "snapshots"
|
snapshots_dir = model_cache_dir / "snapshots"
|
||||||
if snapshots_dir.exists():
|
if snapshots_dir.exists():
|
||||||
|
# Resolve snapshot path (commit-pinned or latest)
|
||||||
|
model_path = None
|
||||||
if commit_hash:
|
if commit_hash:
|
||||||
model_path = snapshots_dir / commit_hash
|
model_path = snapshots_dir / commit_hash
|
||||||
else:
|
else:
|
||||||
@@ -70,17 +72,20 @@ def run_model(
|
|||||||
if snapshots:
|
if snapshots:
|
||||||
model_path = max(snapshots, key=lambda x: x.stat().st_mtime)
|
model_path = max(snapshots, key=lambda x: x.stat().st_mtime)
|
||||||
|
|
||||||
# Check runtime compatibility
|
# Check runtime compatibility for both pinned and unpinned models
|
||||||
framework = detect_framework(resolved_name, model_path)
|
if model_path and model_path.exists():
|
||||||
compatible, reason = check_runtime_compatibility(model_path, framework)
|
# Read README front-matter for framework hints (e.g., private MLX models)
|
||||||
|
fm = read_front_matter(model_path)
|
||||||
|
framework = detect_framework(resolved_name, model_cache_dir, selected_path=model_path, fm=fm)
|
||||||
|
compatible, reason = check_runtime_compatibility(model_path, framework)
|
||||||
|
|
||||||
if not compatible:
|
if not compatible:
|
||||||
error_msg = f"Model '{resolved_name}' is not compatible: {reason}"
|
error_msg = f"Model '{resolved_name}' is not compatible: {reason}"
|
||||||
if json_output:
|
if json_output:
|
||||||
return f"Error: {error_msg}"
|
return f"Error: {error_msg}"
|
||||||
else:
|
else:
|
||||||
print(f"Error: {error_msg}")
|
print(f"Error: {error_msg}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Pre-flight check failed - let the runner handle it
|
# Pre-flight check failed - let the runner handle it
|
||||||
|
|||||||
+11
-2
@@ -28,6 +28,13 @@ classifiers = [
|
|||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"huggingface-hub>=0.34.0",
|
"huggingface-hub>=0.34.0",
|
||||||
|
"requests>=2.32.0",
|
||||||
|
"mlx-lm>=0.28.3",
|
||||||
|
"mlx>=0.29.0",
|
||||||
|
"fastapi>=0.116.0",
|
||||||
|
"uvicorn>=0.35.0",
|
||||||
|
"pydantic>=2.11.0",
|
||||||
|
"httpx>=0.27.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
@@ -50,8 +57,10 @@ version = {attr = "mlxk2.__version__"}
|
|||||||
test = [
|
test = [
|
||||||
"pytest>=7",
|
"pytest>=7",
|
||||||
"jsonschema>=4.20",
|
"jsonschema>=4.20",
|
||||||
"httpx>=0.27.0",
|
]
|
||||||
"fastapi>=0.116.0",
|
dev = [
|
||||||
|
"ruff>=0.1.0",
|
||||||
|
"mypy>=1.5.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
|
|||||||
@@ -9,6 +9,9 @@ markers =
|
|||||||
live_push: Alias for wet; push live tests (require env)
|
live_push: Alias for wet; push live tests (require env)
|
||||||
live_list: Alias for wet; list human live tests (require env)
|
live_list: Alias for wet; list human live tests (require env)
|
||||||
live_clone: Alias for wet; clone live tests (require env, ADR-007 Phase 1)
|
live_clone: Alias for wet; clone live tests (require env, ADR-007 Phase 1)
|
||||||
|
live_run: Opt-in run command tests with real models (require user cache model)
|
||||||
|
live_stop_tokens: Opt-in stop token tests with real models (Issue #32, ADR-009)
|
||||||
issue27: Real-model health policy tests (opt-in; read-only user cache)
|
issue27: Real-model health policy tests (opt-in; read-only user cache)
|
||||||
|
slow: Tests that take >1 minute to run
|
||||||
filterwarnings =
|
filterwarnings =
|
||||||
ignore::urllib3.exceptions.NotOpenSSLWarning
|
ignore::urllib3.exceptions.NotOpenSSLWarning
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ def mock_mlx_runner_environment(temp_cache_dir, model_name="test-model", context
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.pad_token = None
|
mock_tokenizer.pad_token = None
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
@@ -79,4 +80,4 @@ def mock_mlx_runner_environment(temp_cache_dir, model_name="test-model", context
|
|||||||
def mock_runner_env(temp_cache_dir):
|
def mock_runner_env(temp_cache_dir):
|
||||||
"""Fixture version of mock_mlx_runner_environment."""
|
"""Fixture version of mock_mlx_runner_environment."""
|
||||||
with mock_mlx_runner_environment(temp_cache_dir) as env:
|
with mock_mlx_runner_environment(temp_cache_dir) as env:
|
||||||
yield env
|
yield env
|
||||||
|
|||||||
@@ -86,6 +86,7 @@ class TestMLXRunnerInterruption:
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_tokenizer.encode.return_value = [1, 2, 3]
|
mock_tokenizer.encode.return_value = [1, 2, 3]
|
||||||
@@ -127,6 +128,7 @@ class TestMLXRunnerInterruption:
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_tokenizer.encode.return_value = [1, 2, 3]
|
mock_tokenizer.encode.return_value = [1, 2, 3]
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ class TestInterruptionRecovery:
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_tokenizer.encode.return_value = [1, 2, 3]
|
mock_tokenizer.encode.return_value = [1, 2, 3]
|
||||||
@@ -63,6 +64,7 @@ class TestInterruptionRecovery:
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_tokenizer.encode.return_value = [1, 2, 3]
|
mock_tokenizer.encode.return_value = [1, 2, 3]
|
||||||
@@ -206,4 +208,4 @@ class TestInterruptionRecovery:
|
|||||||
assert len(conversation_calls[1]) == 3
|
assert len(conversation_calls[1]) == 3
|
||||||
assert conversation_calls[1][0]["content"] == "first prompt"
|
assert conversation_calls[1][0]["content"] == "first prompt"
|
||||||
assert conversation_calls[1][1]["content"] == "[Generation interrupted by user]"
|
assert conversation_calls[1][1]["content"] == "[Generation interrupted by user]"
|
||||||
assert conversation_calls[1][2]["content"] == "second prompt"
|
assert conversation_calls[1][2]["content"] == "second prompt"
|
||||||
|
|||||||
@@ -0,0 +1,170 @@
|
|||||||
|
"""Regression test for Issue #37 P0: Private/org MLX models rejected in run command.
|
||||||
|
|
||||||
|
Beta.5 introduced runtime compatibility pre-flight check in run_model() that incorrectly
|
||||||
|
passed snapshot path instead of cache root to detect_framework(), causing all non-mlx-community
|
||||||
|
models to be detected as "Unknown framework" and rejected.
|
||||||
|
|
||||||
|
This test verifies the fix by simulating a private-org MLX model (renamed from mlx-community/Phi-3).
|
||||||
|
|
||||||
|
Opt-in via: pytest -m live_run
|
||||||
|
Requires: mlx-community/Phi-3-mini-4k-instruct-4bit in user cache (MLXK2_USER_HF_HOME)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from mlxk2.operations.run import run_model
|
||||||
|
from mlxk2.core.cache import hf_to_cache_dir
|
||||||
|
|
||||||
|
# Opt-in marker: only run with pytest -m live_run
|
||||||
|
pytestmark = [pytest.mark.live_run]
|
||||||
|
|
||||||
|
# Skip if MLXK2_USER_HF_HOME not set (prevents running in standard pytest)
|
||||||
|
_USER_CACHE_ROOT = os.environ.get("MLXK2_USER_HF_HOME") or os.environ.get("HF_HOME")
|
||||||
|
requires_user_cache = pytest.mark.skipif(
|
||||||
|
not _USER_CACHE_ROOT,
|
||||||
|
reason="requires MLXK2_USER_HF_HOME or HF_HOME (opt-in via pytest -m live_run)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@requires_user_cache
def test_private_org_mlx_model_runs_without_unknown_framework_error(
    copy_user_model_to_isolated, isolated_cache
):
    """Run a renamed (private-org) MLX model and check it is not rejected.

    Steps:
    1. Copy mlx-community/Phi-3-mini-4k-instruct-4bit into the isolated cache.
    2. Move its cache directory to models--test-org--phi3-mlx-instruct so it
       no longer lives under the mlx-community namespace.
    3. Write a README.md with MLX front-matter into every snapshot.
    4. Call run_model() with a short prompt.
    5. If a string comes back, it must not contain "Unknown framework" or
       "Incompatible: PyTorch".

    Requires the Phi-3 model in the user cache; run with: pytest -m live_run
    """
    # Front-matter a real private MLX model would ship with.
    readme_text = (
        "---\n"
        "tags: [mlx, chat]\n"
        "library_name: mlx\n"
        "---\n"
        "# Test Org Phi-3 MLX Model\n"
        "\n"
        "This is a test private-org MLX model for regression testing.\n"
    )

    # Copy from the user cache, then rename:
    #   models--mlx-community--Phi-3-mini-4k-instruct-4bit
    #   -> models--test-org--phi3-mlx-instruct
    copied_dir = copy_user_model_to_isolated("mlx-community/Phi-3-mini-4k-instruct-4bit")
    private_org_dir = isolated_cache / "models--test-org--phi3-mlx-instruct"
    shutil.move(str(copied_dir), str(private_org_dir))

    assert private_org_dir.exists(), "Private org model directory should exist after rename"
    snapshots = private_org_dir / "snapshots"
    assert snapshots.exists(), "Snapshots directory should exist"

    snapshot_dirs = [entry for entry in snapshots.iterdir() if entry.is_dir()]
    assert len(snapshot_dirs) > 0, "Should have at least one snapshot"
    for snapshot_dir in snapshot_dirs:
        (snapshot_dir / "README.md").write_text(readme_text)

    # json_output=True makes run_model() return errors as a string.
    result = run_model(
        model_spec="test-org/phi3-mlx-instruct",
        prompt="Hello",
        json_output=True,
        stream=False,
        max_tokens=5,  # Keep it short for speed
        verbose=False,
    )

    # Only framework detection is under test here: other error strings
    # (e.g. mlx_lm import errors) are tolerated, because they mean the model
    # was recognised as MLX and the pre-flight check passed.
    if isinstance(result, str):
        assert "Unknown framework" not in result, (
            f"Private-org MLX model should not be rejected as 'Unknown framework'. "
            f"Got result: {result}"
        )
        assert "Incompatible: PyTorch" not in result, (
            f"Private-org MLX model should not be detected as PyTorch. "
            f"Got result: {result}"
        )

    print(f"✓ Private-org MLX model 'test-org/phi3-mlx-instruct' runs successfully")
|
||||||
|
|
||||||
|
|
||||||
|
@requires_user_cache
def test_framework_detection_for_renamed_mlx_community_model(
    copy_user_model_to_isolated, isolated_cache
):
    """Check detect_framework() on a renamed MLX model when given the cache root.

    Calls detect_framework() directly with the argument shape the fixed
    run_model() uses (cache root as model_root, snapshot as selected_path,
    README front-matter as fm) and expects "MLX". It then calls it the way
    Beta.5 did (snapshot passed as model_root, no front-matter) and prints
    the result for reference; that second result is not asserted.
    """
    from mlxk2.operations.common import detect_framework, read_front_matter

    # Copy and rename model so it is outside the mlx-community namespace.
    src_model_dir = copy_user_model_to_isolated("mlx-community/Phi-3-mini-4k-instruct-4bit")
    private_org_dir = isolated_cache / "models--acme--mlx-chat-model"
    shutil.move(str(src_model_dir), str(private_org_dir))

    snapshots = private_org_dir / "snapshots"
    snapshot_dirs = [d for d in snapshots.iterdir() if d.is_dir()]
    assert len(snapshot_dirs) > 0
    snapshot_path = snapshot_dirs[0]

    # Add MLX tags to README
    (snapshot_path / "README.md").write_text(
        "---\n"
        "tags: [mlx]\n"
        "library_name: mlx\n"
        "---\n"
        "# Acme MLX Model\n"
    )

    # Test framework detection with CORRECT parameters (cache root + selected_path + fm)
    fm = read_front_matter(snapshot_path)  # Read the README we just wrote
    framework = detect_framework(
        hf_name="acme/mlx-chat-model",
        model_root=private_org_dir,   # Cache root (models--acme--mlx-chat-model)
        selected_path=snapshot_path,  # Snapshot path (snapshots/abc123...)
        fm=fm,                        # Front-matter with MLX tags
    )
    assert framework == "MLX", (
        f"Framework should be detected as MLX from README tags. Got: {framework}"
    )

    # Test with INCORRECT parameters (what Beta.5 bug did)
    framework_buggy = detect_framework(
        hf_name="acme/mlx-chat-model",
        model_root=snapshot_path,  # BUG: Passing snapshot as root
        selected_path=None,
    )

    # With the bug, it would fall through to "Unknown" because:
    # - Not mlx-community/* -> no early return
    # - README not in snapshot_path / "snapshots" (doesn't exist)
    # - No GGUF/PyTorch detected
    # Printed (not asserted) to document the buggy behavior for reference.
    print(f"Buggy detection result: {framework_buggy} (should be Unknown without fix)")
|
||||||
@@ -371,7 +371,93 @@ class TestStreamingVsBatch:
|
|||||||
# Output should be equivalent (modulo formatting)
|
# Output should be equivalent (modulo formatting)
|
||||||
stream_output = stream_out.getvalue().strip()
|
stream_output = stream_out.getvalue().strip()
|
||||||
batch_output = batch_out.getvalue().strip()
|
batch_output = batch_out.getvalue().strip()
|
||||||
|
|
||||||
# Both should contain the core content
|
# Both should contain the core content
|
||||||
assert "Hello world" in stream_output
|
assert "Hello world" in stream_output
|
||||||
assert "Hello world" in batch_output
|
assert "Hello world" in batch_output
|
||||||
|
|
||||||
|
|
||||||
|
class TestPreflightCompatibilityCheck:
    """Test runtime compatibility preflight checks in run command."""

    @staticmethod
    def _write_pytorch_snapshot(cache_root, model_name, snapshot_name, config, weights):
        """Create a fake PyTorch model snapshot under *cache_root*.

        Writes config.json (so the model looks healthy) and a
        pytorch_model.bin file (the incompatible framework marker) into
        models--<org>--<name>/snapshots/<snapshot_name>.
        """
        import json

        cache_dir = cache_root / f"models--{model_name.replace('/', '--')}"
        snapshot_dir = cache_dir / "snapshots" / snapshot_name
        snapshot_dir.mkdir(parents=True)
        (snapshot_dir / "config.json").write_text(json.dumps(config))
        (snapshot_dir / "pytorch_model.bin").write_bytes(weights)

    def test_commit_pinned_incompatible_model_blocked(self, isolated_cache):
        """Commit-pinned models must also pass compatibility check (regression test).

        Regression: Beta.5 introduced preflight compatibility checks, but
        commit-pinned models bypassed the check due to incorrect if/else scoping.

        Verifies that `mlxk run org/model@commit_hash` validates framework
        compatibility before attempting to load the model.
        """
        from unittest.mock import patch

        pinned_commit = "abc123def456"
        repo_id = "test-org/pytorch-model"
        self._write_pytorch_snapshot(
            isolated_cache,
            repo_id,
            pinned_commit,
            {"model_type": "bert", "architectures": ["BertForSequenceClassification"]},
            b"fake_pytorch_weights" * 100,
        )

        # Resolve to our commit hash and point the cache lookup at the isolated cache.
        with patch(
            'mlxk2.operations.run.resolve_model_for_operation',
            return_value=(repo_id, pinned_commit, None),
        ), patch(
            'mlxk2.operations.run.get_current_model_cache',
            return_value=isolated_cache,
        ):
            outcome = run_model(
                model_spec=f"{repo_id}@{pinned_commit}",
                prompt="test prompt",
                json_output=True
            )

        # Should be blocked by preflight check
        assert outcome is not None
        assert "Error:" in outcome
        assert "not compatible" in outcome or "Incompatible" in outcome

    def test_latest_snapshot_incompatible_model_blocked(self, isolated_cache):
        """Non-pinned models should also be blocked by compatibility check."""
        from unittest.mock import patch

        repo_id = "test-org/another-pytorch"
        self._write_pytorch_snapshot(
            isolated_cache,
            repo_id,
            "latest_snapshot",
            {"model_type": "gpt2", "architectures": ["GPT2LMHeadModel"]},
            b"fake_weights" * 100,
        )

        # No commit hash: run_model() has to pick the latest snapshot itself.
        with patch(
            'mlxk2.operations.run.resolve_model_for_operation',
            return_value=(repo_id, None, None),
        ), patch(
            'mlxk2.operations.run.get_current_model_cache',
            return_value=isolated_cache,
        ):
            outcome = run_model(
                model_spec=repo_id,
                prompt="test prompt",
                json_output=True
            )

        # Should be blocked by preflight check
        assert outcome is not None
        assert "Error:" in outcome
        assert "not compatible" in outcome or "Incompatible" in outcome
|
||||||
@@ -37,6 +37,7 @@ def mock_runner_environment(temp_cache_dir, model_name="test-model"):
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.pad_token = None
|
mock_tokenizer.pad_token = None
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
@@ -98,6 +99,7 @@ class TestMLXRunnerBasic:
|
|||||||
# Mock tokenizer methods
|
# Mock tokenizer methods
|
||||||
mocks['mock_tokenizer'].encode.return_value = [100, 101] # Prompt tokens
|
mocks['mock_tokenizer'].encode.return_value = [100, 101] # Prompt tokens
|
||||||
mocks['mock_tokenizer'].eos_token_id = 999 # Don't trigger EOS
|
mocks['mock_tokenizer'].eos_token_id = 999 # Don't trigger EOS
|
||||||
|
mocks['mock_tokenizer'].eos_token_ids = {mocks['mock_tokenizer'].eos_token_id}
|
||||||
mocks['mock_tokenizer'].chat_template = None # Disable chat template
|
mocks['mock_tokenizer'].chat_template = None # Disable chat template
|
||||||
|
|
||||||
# Mock decode to return consistent strings based on token list length/content
|
# Mock decode to return consistent strings based on token list length/content
|
||||||
@@ -136,6 +138,7 @@ class TestMLXRunnerBasic:
|
|||||||
mocks['mock_tokenizer'].encode.return_value = [100, 101] # Prompt
|
mocks['mock_tokenizer'].encode.return_value = [100, 101] # Prompt
|
||||||
mocks['mock_tokenizer'].decode.side_effect = lambda tokens: " ".join([f"token{t}" for t in tokens])
|
mocks['mock_tokenizer'].decode.side_effect = lambda tokens: " ".join([f"token{t}" for t in tokens])
|
||||||
mocks['mock_tokenizer'].eos_token_id = 999 # Don't trigger EOS
|
mocks['mock_tokenizer'].eos_token_id = 999 # Don't trigger EOS
|
||||||
|
mocks['mock_tokenizer'].eos_token_ids = {mocks['mock_tokenizer'].eos_token_id}
|
||||||
mocks['mock_tokenizer'].chat_template = None
|
mocks['mock_tokenizer'].chat_template = None
|
||||||
|
|
||||||
with MLXRunner(model_name) as runner:
|
with MLXRunner(model_name) as runner:
|
||||||
@@ -278,7 +281,15 @@ class TestMLXRunnerMemorySafety:
|
|||||||
model_name = "test-model"
|
model_name = "test-model"
|
||||||
|
|
||||||
with patch('mlxk2.core.runner.load') as mock_load:
|
with patch('mlxk2.core.runner.load') as mock_load:
|
||||||
mock_load.return_value = (Mock(), Mock())
|
mock_model = Mock()
|
||||||
|
mock_tokenizer = Mock()
|
||||||
|
mock_tokenizer.encode.return_value = [1]
|
||||||
|
mock_tokenizer.decode.return_value = "ok"
|
||||||
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
|
mock_tokenizer.additional_special_tokens = []
|
||||||
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
|
mock_load.return_value = (mock_model, mock_tokenizer)
|
||||||
|
|
||||||
# First runner
|
# First runner
|
||||||
with MLXRunner(model_name) as runner1:
|
with MLXRunner(model_name) as runner1:
|
||||||
@@ -317,12 +328,20 @@ class TestMLXRunnerDynamicTokens:
|
|||||||
model_name = "test-model"
|
model_name = "test-model"
|
||||||
|
|
||||||
with patch('mlxk2.core.runner.load') as mock_load:
|
with patch('mlxk2.core.runner.load') as mock_load:
|
||||||
mock_load.return_value = (Mock(), Mock())
|
mock_model = Mock()
|
||||||
|
mock_tokenizer = Mock()
|
||||||
|
mock_tokenizer.encode.return_value = [1]
|
||||||
|
mock_tokenizer.decode.return_value = "ok"
|
||||||
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
|
mock_tokenizer.additional_special_tokens = []
|
||||||
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
|
mock_load.return_value = (mock_model, mock_tokenizer)
|
||||||
|
|
||||||
with MLXRunner(model_name) as runner:
|
with MLXRunner(model_name) as runner:
|
||||||
# When max_tokens is explicitly set, should respect it
|
# When max_tokens is explicitly set, should respect it
|
||||||
with patch('mlxk2.core.runner.generate_step') as mock_gen:
|
with patch('mlxk2.core.runner.generate_step') as mock_gen:
|
||||||
mock_gen.return_value = ([1], mx.zeros(1))
|
mock_gen.return_value = iter([(mx.array([1]), mx.zeros(1))])
|
||||||
|
|
||||||
# Mock to check that max_tokens is passed through
|
# Mock to check that max_tokens is passed through
|
||||||
result = runner.generate_batch("test", max_tokens=100)
|
result = runner.generate_batch("test", max_tokens=100)
|
||||||
@@ -355,6 +374,7 @@ class TestMLXRunnerErrorHandling:
|
|||||||
mock_tokenizer.encode.return_value = [1]
|
mock_tokenizer.encode.return_value = [1]
|
||||||
mock_tokenizer.decode.return_value = "ok"
|
mock_tokenizer.decode.return_value = "ok"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_load.return_value = (mock_model, mock_tokenizer)
|
mock_load.return_value = (mock_model, mock_tokenizer)
|
||||||
|
|||||||
@@ -0,0 +1,467 @@
|
|||||||
|
"""Real-model stop token detection tests for Issue #32 (ADR-009).
|
||||||
|
|
||||||
|
This test suite validates stop token handling with real models that exhibit
|
||||||
|
known issues:
|
||||||
|
- MXFP4: Visible `<|end|>` tokens in output
|
||||||
|
- Qwen 2.5: Self-conversation (chat template role markers)
|
||||||
|
- Llama 3.2: Control baseline (should work correctly)
|
||||||
|
|
||||||
|
Test Strategy (ADR-009):
|
||||||
|
1. Phase 1: Baseline measurement (document broken behavior)
|
||||||
|
2. Phase 2: Fix validation (verify 2-LOC fix works)
|
||||||
|
3. Phase 3: Empirical mapping (document tokenizer configs)
|
||||||
|
|
||||||
|
Opt-in via: pytest -m live_stop_tokens
|
||||||
|
Requires: HF_HOME set to SSD cache (CoW same-volume requirement, ADR-007)
|
||||||
|
|
||||||
|
RAM Safety:
|
||||||
|
- Tests automatically skip models that exceed available RAM
|
||||||
|
- Progressive budget scaling: 40% (16GB), 50% (32GB), 60% (64GB), 70% (96GB+)
|
||||||
|
- Larger systems have lower relative overhead, enabling better RAM utilization
|
||||||
|
- See TESTING.md: "RAM-Aware Model Selection Strategy"
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pytest
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
import importlib
|
||||||
|
import importlib.util
|
||||||
|
|
||||||
|
# Opt-in marker for live tests
|
||||||
|
pytestmark = [pytest.mark.live_stop_tokens, pytest.mark.slow]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module", autouse=True)
def _use_real_mlx_modules():
    """Ensure live tests use real mlx / mlx-lm without polluting the rest of the suite.

    For the duration of this module the fixture:
    - removes the ``stubs`` directory (next to this file) from ``sys.path``;
    - removes from ``sys.modules`` every module loaded from that directory,
      plus any ``huggingface_hub`` / ``huggingface_hub.*`` entries.

    It then requires that ``mlx.core`` and ``mlx_lm`` can be found and that
    ``huggingface_hub`` imports and exposes ``snapshot_download``; otherwise
    the displaced entries are put back and the tests are skipped.

    On every exit path (skip or teardown) the displaced modules are written
    back into ``sys.modules`` and the stub path is re-inserted at the front
    of ``sys.path`` if it had been removed.
    """
    stub_path_str = str(Path(__file__).parent / "stubs")

    # Remove stub path from sys.path (if present) and remember to restore it later
    path_removed = stub_path_str in sys.path
    if path_removed:
        sys.path = [p for p in sys.path if p != stub_path_str]

    # Modules taken out of sys.modules, in removal order, so they can be put back.
    displaced: Dict[str, Any] = {}

    # Remove stub modules from sys.modules so real modules can be imported
    for module_name, module in list(sys.modules.items()):
        module_file = getattr(module, "__file__", "") or ""
        if module_file and stub_path_str in module_file:
            displaced[module_name] = module
            sys.modules.pop(module_name, None)

    # Also clear any previously installed huggingface_hub shims
    for module_name, module in list(sys.modules.items()):
        if module_name == "huggingface_hub" or module_name.startswith("huggingface_hub."):
            displaced[module_name] = module
            sys.modules.pop(module_name, None)

    def restore_stub_state() -> None:
        """Put the displaced modules and the stub path back in place."""
        # Overwrite on purpose: anything imported meanwhile under the same
        # name must not shadow the stubs for the rest of the suite.
        sys.modules.update(displaced)
        if path_removed and stub_path_str not in sys.path:
            sys.path.insert(0, stub_path_str)

    # Require real mlx / mlx-lm; skip if not available.
    # find_spec() on a dotted name imports the parent package first and raises
    # (instead of returning None) when that parent is missing or broken, so
    # treat an import failure as "runtime not available".
    try:
        runtime_found = (
            importlib.util.find_spec("mlx.core") is not None
            and importlib.util.find_spec("mlx_lm") is not None
        )
    except (ImportError, ValueError):
        runtime_found = False

    skip_reason = None
    if not runtime_found:
        skip_reason = "requires mlx / mlx-lm native runtime (Apple Silicon)"
    else:
        try:
            huggingface_hub = importlib.import_module("huggingface_hub")
        except ImportError:
            skip_reason = "requires mlx / mlx-lm native runtime (Apple Silicon)"
        else:
            if not hasattr(huggingface_hub, "snapshot_download"):
                skip_reason = (
                    "requires huggingface_hub.snapshot_download (install latest huggingface-hub)"
                )

    if skip_reason is not None:
        # Restore previous state before skipping so rest of suite still uses stubs
        restore_stub_state()
        pytest.skip(skip_reason, allow_module_level=True)

    try:
        yield
    finally:
        # Restore stub modules and stub path for the remainder of the test run
        restore_stub_state()
|
||||||
|
|
||||||
|
# Skip if HF_HOME not set (required for CoW same-volume, ADR-007)
|
||||||
|
_HF_HOME = os.environ.get("HF_HOME")
|
||||||
|
|
||||||
|
requires_hf_home = pytest.mark.skipif(
|
||||||
|
not _HF_HOME,
|
||||||
|
reason="requires HF_HOME set to SSD cache for CoW same-volume (ADR-007)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_system_ram_gb() -> float:
    """Return the system RAM in GB as reported by ``sysctl hw.memsize``.

    Returns:
        The byte count printed by ``sysctl`` divided by 1024**3. If the
        command cannot be run, exits non-zero, or prints something that cannot
        be parsed, 16.0 is returned instead (assumed minimum safe config).
    """
    bytes_per_gb = 1024**3
    try:
        completed = subprocess.run(
            ["sysctl", "hw.memsize"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        # Expected output: "hw.memsize: 68719476736" -> the byte count is the
        # second colon-separated field.
        fields = completed.stdout.strip().split(":")
        return int(fields[1].strip()) / bytes_per_gb
    except Exception:
        # Fallback: assume minimum safe config (16GB)
        return 16.0
|
||||||
|
|
||||||
|
|
||||||
|
def get_safe_ram_budget_gb(system_ram: "float | None" = None) -> float:
    """Get safe RAM budget for model loading (progressive scaling).

    Progressive budget strategy (relative overhead decreases with larger systems):

    - below 32GB (e.g. 16GB): 40% budget (6.4GB)   - high relative OS overhead
    - 32GB and up:            50% budget (16GB)    - moderate overhead
    - 64GB and up:            60% budget (38.4GB)  - low overhead
    - 96GB and up:            70% budget (67GB+)   - minimal overhead

    Rationale:

    - OS/System baseline overhead is ~4-6GB (relatively constant)
    - Larger systems have more headroom after OS overhead
    - Progressive scaling allows better utilization of high-RAM systems

    Args:
        system_ram: Total system RAM in GB. When omitted (the default), the
            value is detected with ``get_system_ram_gb()``.

    Returns:
        ``system_ram`` multiplied by the ratio of the tier it falls into.
    """
    if system_ram is None:
        system_ram = get_system_ram_gb()

    # (minimum system RAM in GB, budget ratio), checked from the largest tier down
    tiers = (
        (96, 0.70),
        (64, 0.60),
        (32, 0.50),
    )
    for min_ram_gb, budget_ratio in tiers:
        if system_ram >= min_ram_gb:
            return system_ram * budget_ratio

    # Anything smaller than the lowest tier gets the conservative 40% budget
    return system_ram * 0.40
|
||||||
|
|
||||||
|
|
||||||
|
# Models exercised by the ADR-009 stop-token tests, keyed by a short name.
# "ram_needed_gb" follows the estimates in TESTING.md
# ("RAM-Aware Model Selection Strategy").
TEST_MODELS = {
    # 20B MXFP4 (~12GB empirical)
    "mxfp4": {
        "id": "mlx-community/gpt-oss-20b-MXFP4-Q8",
        "expected_issue": "visible_end_token",
        "description": "MXFP4 format with visible <|end|> in output",
        "ram_needed_gb": 12.0,
    },
    # 0.5B 4-bit (~1GB)
    "qwen25": {
        "id": "mlx-community/Qwen2.5-0.5B-Instruct-4bit",
        "expected_issue": "self_conversation",
        "description": "Qwen 2.5 generates chat template markers",
        "ram_needed_gb": 1.0,
    },
    # 3B 4-bit (~4GB)
    "llama32": {
        "id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
        "expected_issue": None,
        "description": "Control baseline (should work correctly)",
        "ram_needed_gb": 4.0,
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
def should_skip_model(model_key: str) -> tuple[bool, str]:
    """Check if model should be skipped due to insufficient RAM.

    Args:
        model_key: Key into ``TEST_MODELS``.

    Returns:
        ``(should_skip, reason)``. ``should_skip`` is True when the model's
        ``ram_needed_gb`` exceeds the budget from ``get_safe_ram_budget_gb()``;
        ``reason`` is then a human-readable explanation, otherwise ``""``.

    Raises:
        KeyError: If ``model_key`` is not in ``TEST_MODELS``.
    """
    ram_needed = TEST_MODELS[model_key]["ram_needed_gb"]
    ram_budget = get_safe_ram_budget_gb()

    if ram_needed <= ram_budget:
        return (False, "")

    # Total RAM is only needed for the message, so it is probed on this path only.
    system_ram = get_system_ram_gb()
    # round(), not int(): the ratio is reconstructed from floats and can land
    # just below the intended value (a 70% budget may come out as
    # 69.99999999999999, which int() would report as 69%).
    budget_pct = round(ram_budget / system_ram * 100) if system_ram > 0 else 40
    return (
        True,
        f"Model requires {ram_needed}GB but only {ram_budget:.1f}GB available "
        f"({budget_pct}% of {system_ram:.0f}GB system RAM). See TESTING.md RAM-Aware Model Selection."
    )
|
||||||
|
|
||||||
|
# Inputs shared by every inference call in this module: one simple,
# predictable prompt and the max_tokens value passed to the runner.
TEST_PROMPT = "Write one sentence about cats."
MAX_TOKENS = 50
|
||||||
|
|
||||||
|
|
||||||
|
class TestStopTokensValidation:
    """Validation: Verify stop token handling works correctly (Issue #32, ADR-009).

    Every test runs one real inference through ``MLXRunner`` with
    ``TEST_PROMPT`` / ``MAX_TOKENS`` and asserts that neither stop tokens nor
    chat-template role markers appear in the returned text. The shared
    RAM check, inference call and banner output live in
    ``_generate_and_report``.
    """

    # Role markers that count as self-conversation for any model family
    _GENERIC_CHAT_MARKERS = ("\nUser:", "\nAssistant:", "\nHuman:", "\nAI:")

    @staticmethod
    def _generate_and_report(model_key: str, label: str) -> str:
        """Run one inference for ``TEST_MODELS[model_key]`` and print a banner.

        Args:
            model_key: Key into ``TEST_MODELS``.
            label: Human-readable model name shown in the "VALIDATION:" banner.

        Returns:
            The text returned by ``runner.generate_batch``.

        The calling test is skipped via ``pytest.skip`` when
        ``should_skip_model`` reports that the model exceeds the RAM budget.
        """
        # RAM Safety Check
        should_skip, reason = should_skip_model(model_key)
        if should_skip:
            pytest.skip(reason)

        # Deferred import: only reached when the test is actually going to run
        from mlxk2.core.runner import MLXRunner

        model_id = TEST_MODELS[model_key]["id"]

        with MLXRunner(model_id) as runner:
            output = runner.generate_batch(
                prompt=TEST_PROMPT,
                max_tokens=MAX_TOKENS,
            )

        banner = "=" * 60
        print(f"\n{banner}")
        print(f"VALIDATION: {label}")
        print(banner)
        print(f"Model: {model_id}")
        print(f"Prompt: {TEST_PROMPT}")
        print(f"Output: {output!r}")
        return output

    @requires_hf_home
    def test_mxfp4_stop_token_filtering(self):
        """MXFP4: Stop tokens should be filtered correctly.

        After ADR-009 2-LOC fix (eos_token_id → eos_token_ids):
        - Model should stop cleanly without visible stop tokens
        - No `<|end|>` or `<|return|>` in output

        Background (Issue #32):
        - MXFP4 previously showed visible `<|end|>` tokens
        - Root cause: Runner only checked singular eos_token_id
        - Fix: Use eos_token_ids Set to handle multiple EOS tokens
        """
        output = self._generate_and_report("mxfp4", "MXFP4")

        # Assert no visible stop tokens
        assert "<|end|>" not in output, "MXFP4 should filter <|end|> token"
        assert "<|return|>" not in output, "MXFP4 should filter <|return|> token"

        print("✓ MXFP4: Stop tokens correctly filtered")

    @requires_hf_home
    def test_qwen25_no_self_conversation(self):
        """Qwen 2.5: Should not generate chat template role markers (self-conversation).

        Self-Conversation Definition (ADR-009):
        - Model generates chat template role markers (User:, Assistant:, etc.)
        - Generic patterns: a new line starting with User:, Assistant:, Human: or AI:
        - Specific to Qwen: `<|im_start|>` / `<|im_end|>` markers

        Expected Behavior:
        - Model stops cleanly after its response
        - No chat template markers in output
        """
        output = self._generate_and_report("qwen25", "Qwen 2.5")

        # Check for self-conversation patterns
        qwen_markers = ["<|im_start|>user", "<|im_start|>assistant", "<|im_start|>", "<|im_end|>"]

        found_generic = [m for m in self._GENERIC_CHAT_MARKERS if m in output]
        found_qwen = [m for m in qwen_markers if m in output]

        print(f"Generic markers found: {found_generic}")
        print(f"Qwen markers found: {found_qwen}")

        # Assert no self-conversation
        assert not found_generic, f"Qwen 2.5 should not generate generic chat markers. Found: {found_generic}"
        assert not found_qwen, f"Qwen 2.5 should not generate Qwen-specific markers. Found: {found_qwen}"

        print("✓ Qwen 2.5: No self-conversation")

    @requires_hf_home
    def test_llama32_regression_control(self):
        """Llama 3.2: Regression control (should work correctly).

        Llama 3.2 has 3 eos_token_ids: [128008, 128001, 128009]
        This validates that the 2-LOC fix correctly handles multi-EOS models.

        Expected Behavior:
        - Clean output without visible stop tokens
        - No self-conversation
        - Serves as regression baseline
        """
        output = self._generate_and_report("llama32", "Llama 3.2 (Regression Control)")

        # Llama 3.2 stop tokens
        llama_stop_tokens = ["<|eot_id|>", "</s>", "<|end_of_text|>"]
        found_stop = [t for t in llama_stop_tokens if t in output]
        assert not found_stop, f"Llama 3.2 should filter stop tokens. Found: {found_stop}"

        # No generic chat markers
        found_markers = [m for m in self._GENERIC_CHAT_MARKERS if m in output]
        assert not found_markers, f"Llama 3.2 should not self-converse. Found: {found_markers}"

        print("✓ Llama 3.2: Clean output (regression control passed)")
|
||||||
|
|
||||||
|
|
||||||
|
class TestStopTokensEmpiricalMapping:
    """Phase 3: Empirical mapping - document tokenizer configs and observed tokens."""

    @requires_hf_home
    def test_empirical_mapping_all_models(self):
        """Record each model's configured EOS tokens next to what it actually emits.

        Writes ``stop_token_config_report.json`` to the current working
        directory and prints the same JSON. The report contains:

        - ``_system_info``: ``system_ram_gb``, ``ram_budget_gb``, ``budget_ratio``
        - one entry per key of ``TEST_MODELS``, either
          ``{"model_id", "skipped": True, "skip_reason"}`` when the model
          exceeds the RAM budget, or
          ``{"model_id", "configured_eos_token", "configured_eos_token_id",
          "configured_eos_token_ids", "generated_output",
          "visible_stop_tokens", "workaround_needed"}``.

        ``workaround_needed`` is True when any known stop-token string shows up
        in the generated text. The test makes no assertions; it only documents.
        """
        from mlxk2.core.runner import MLXRunner

        total_ram = get_system_ram_gb()
        budget = get_safe_ram_budget_gb()

        # Ratio that was effectively applied (0.40 when RAM is reported as 0)
        if total_ram > 0:
            ratio = budget / total_ram
        else:
            ratio = 0.40

        report = {
            "_system_info": {
                "system_ram_gb": round(total_ram, 1),
                "ram_budget_gb": round(budget, 1),
                "budget_ratio": round(ratio, 2),
            }
        }

        # Stop-token strings that must never be visible in generated text
        candidate_markers = ["<|end|>", "<|eot_id|>", "<|im_end|>", "<|endoftext|>"]

        for key, info in TEST_MODELS.items():
            model_id = info["id"]

            # Models over the RAM budget are recorded as skipped, not loaded
            skip, why = should_skip_model(key)
            if skip:
                print(f"\nSkipping {key}: {why}")
                report[key] = {
                    "model_id": model_id,
                    "skipped": True,
                    "skip_reason": why,
                }
                continue

            with MLXRunner(model_id) as runner:
                tok = runner.tokenizer

                # Configured stop tokens, as exposed by the tokenizer
                configured_token = getattr(tok, "eos_token", None)
                configured_id = getattr(tok, "eos_token_id", None)

                # eos_token_ids may be a Set or List; turn any iterable into a
                # list so it can be written as JSON
                configured_ids = getattr(tok, "eos_token_ids", None)
                if hasattr(configured_ids, "__iter__"):
                    configured_ids = list(configured_ids)

                # Observe what the model really produces
                text = runner.generate_batch(
                    prompt=TEST_PROMPT,
                    max_tokens=MAX_TOKENS,
                )

                leaked = [marker for marker in candidate_markers if marker in text]

                report[key] = {
                    "model_id": model_id,
                    "configured_eos_token": configured_token,
                    "configured_eos_token_id": configured_id,
                    "configured_eos_token_ids": configured_ids,
                    "generated_output": text[:100],  # First 100 chars for reference
                    "visible_stop_tokens": leaked,
                    "workaround_needed": bool(leaked),
                }

        # Write report
        rendered = json.dumps(report, indent=2)
        report_path = Path("stop_token_config_report.json")
        report_path.write_text(rendered)

        rule = "=" * 60
        print(f"\n{rule}")
        print("EMPIRICAL MAPPING REPORT")
        print(rule)
        print(rendered)
        print(f"\nReport saved to: {report_path.absolute()}")

        # Summary
        models_needing_fix = []
        for name, entry in report.items():
            if isinstance(entry, dict) and entry.get("workaround_needed"):
                models_needing_fix.append(name)
        print(f"\nModels needing fix: {models_needing_fix}")
|
||||||
@@ -138,6 +138,7 @@ class TestTokenLimitApplication:
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_tokenizer.encode.return_value = [1, 2, 3]
|
mock_tokenizer.encode.return_value = [1, 2, 3]
|
||||||
@@ -170,6 +171,7 @@ class TestTokenLimitApplication:
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_tokenizer.encode.return_value = [1, 2, 3]
|
mock_tokenizer.encode.return_value = [1, 2, 3]
|
||||||
@@ -202,6 +204,7 @@ class TestTokenLimitApplication:
|
|||||||
mock_tokenizer = Mock()
|
mock_tokenizer = Mock()
|
||||||
mock_tokenizer.eos_token = "</s>"
|
mock_tokenizer.eos_token = "</s>"
|
||||||
mock_tokenizer.eos_token_id = 2
|
mock_tokenizer.eos_token_id = 2
|
||||||
|
mock_tokenizer.eos_token_ids = {mock_tokenizer.eos_token_id}
|
||||||
mock_tokenizer.additional_special_tokens = []
|
mock_tokenizer.additional_special_tokens = []
|
||||||
mock_tokenizer.added_tokens_decoder = {}
|
mock_tokenizer.added_tokens_decoder = {}
|
||||||
mock_tokenizer.encode.return_value = [1, 2, 3]
|
mock_tokenizer.encode.return_value = [1, 2, 3]
|
||||||
@@ -384,4 +387,4 @@ class TestServerVsRunDifferences:
|
|||||||
server_policy = runner._calculate_dynamic_max_tokens(server_mode=True)
|
server_policy = runner._calculate_dynamic_max_tokens(server_mode=True)
|
||||||
|
|
||||||
assert run_policy > server_policy
|
assert run_policy > server_policy
|
||||||
assert run_policy / server_policy == 2.0 # Exactly 2x difference
|
assert run_policy / server_policy == 2.0 # Exactly 2x difference
|
||||||
|
|||||||
Reference in New Issue
Block a user