From 25609e4dcb239fab4013edbc61d9ef0a96762e0e Mon Sep 17 00:00:00 2001 From: The BROKE Cluster Team Date: Wed, 31 Dec 2025 16:05:18 +0100 Subject: [PATCH] Release 2.0.4-beta.5: Community repair tool + OS-agnostic benchmarking Closes #49 (Mistral Tokenizer Bug) Major features: - Workspace Infrastructure (ADR-018 Phase 0a): Managed workspace detection, provenance metadata, backward compatible with unmanaged workspaces - Convert Operation (ADR-018 Phase 1): `mlxk convert --repair-index` fixes mlx-vlm #624 affected models (7+ models including Qwen2.5-VL, gemma-3) - Resumable Pull: Auto-detect partial downloads with `--force-resume` - Wet Umbrella Test Integration: Single entry point for all real model tests Fixes: - #49: BPE space markers now correctly converted (Mistral-family models) - Vision Portfolio Discovery: Filter by capabilities instead of model_type - Memory Cleanup Hook: Triggers for both live_e2e and wet markers Test suite: 528 passed, 60 skipped (Python 3.9-3.14) --- .gitignore | 1 + CHANGELOG.md | 63 +++ README.md | 138 +++++- TESTING-DETAILS.md | 545 ++++++++++++++++++++---- TESTING.md | 53 ++- benchmarks/generate_benchmark_report.py | 47 +- benchmarks/tools/memmon.py | 124 +++--- docs/ADR/ADR-018-Convert-Operation.md | 501 ++++++++++++++++++++++ mlxk2/__init__.py | 2 +- mlxk2/cli.py | 81 +++- mlxk2/core/runner/__init__.py | 134 +++++- mlxk2/operations/clone.py | 62 ++- mlxk2/operations/convert.py | 355 +++++++++++++++ mlxk2/operations/health.py | 108 ++++- mlxk2/operations/pull.py | 40 +- mlxk2/operations/workspace.py | 142 ++++++ mlxk2/output/human.py | 61 ++- pytest.ini | 3 +- scripts/test-wet-memmom.sh | 36 ++ scripts/test-wet-umbrella.sh | 24 ++ tests_2.0/conftest.py | 355 ++++++++++++++- tests_2.0/live/conftest.py | 363 ++++------------ tests_2.0/test_clone_operation.py | 181 +++++++- tests_2.0/test_convert_repair_index.py | 303 +++++++++++++ tests_2.0/test_ctrl_c_handling.py | 50 ++- tests_2.0/test_human_output.py | 109 ++++- tests_2.0/test_interruption_recovery.py | 55 ++- tests_2.0/test_resumable_pull.py | 369 ++++++++++++++++ tests_2.0/test_runner_core.py | 52 ++- tests_2.0/test_stop_tokens_live.py | 284 +++++++++--- tests_2.0/test_workspace_sentinel.py | 304 +++++++++++++ 31 files changed, 4355 insertions(+), 590 deletions(-) create mode 100644 docs/ADR/ADR-018-Convert-Operation.md create mode 100644 mlxk2/operations/convert.py create mode 100644 mlxk2/operations/workspace.py create mode 100755 scripts/test-wet-memmom.sh create mode 100755 scripts/test-wet-umbrella.sh create mode 100644 tests_2.0/test_convert_repair_index.py create mode 100644 tests_2.0/test_resumable_pull.py create mode 100644 tests_2.0/test_workspace_sentinel.py diff --git a/.gitignore b/.gitignore index a33ade5..f410c73 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ mlx_knife/* .DS_Store .claude/ mymodel_test_workspace/ +live_clone_ws/ build/ dist/ *.egg-info/ diff --git a/CHANGELOG.md b/CHANGELOG.md index cdc0942..719c911 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,68 @@ # Changelog +## [2.0.4-beta.5] - 2025-12-31 + +### Added + +- **Workspace Infrastructure (ADR-018 Phase 0a):** + - Managed workspace detection via `.mlxk_workspace.json` sentinel + - Workspace health checks support managed and unmanaged workspaces + - All cloned workspaces now include provenance metadata + - Backward compatible with unmanaged workspaces (pre-2.0.4) + - See ADR-018.md Phase 0a for details + +- **Convert Operation (ADR-018 Phase 1):** + - `mlxk convert --repair-index` rebuilds safetensors index from shards + - Fixes mlx-vlm #624 affected models (7+ models: Qwen2.5-VL, gemma-3, Mistral-Small, etc.) + - Cache sanctity: Hard blocks writes to HF cache + - APFS CoW optimization for instant, space-efficient cloning + - See ADR-018.md Phase 1 for details + +- **Resumable Pull Support:** + - Auto-detect partial downloads with user confirmation prompts + - `--force-resume` flag for non-interactive automation + - Health-aware logic for damaged downloads + +- **Wet Umbrella Test Integration:** + - Single entry point: `scripts/test-wet-umbrella.sh` runs all real model tests + - Auto-marker assignment via pytest hook + - Memory-optimized options prevent pytest RAM spikes + - See TESTING-DETAILS.md for architecture + +- **Vision Model Benchmark Reporting:** + - Auto-reporting fixture adds Vision metadata to benchmark JSONL + - Schema v0.2.0 compliance for memplot integration + +### Changed + +- **Clone operation:** Now produces managed workspaces (sentinel written after APFS clone, enables provenance detection) + +### Fixed + +- **Vision Portfolio Discovery:** Fixed KeyError when discovering Vision models. Issue: Vision models share `model_type="chat"` with TEXT. Solution: Filter by capabilities instead. Result: 3 Vision models now detected. + +- **Mistral Tokenizer Bug (Issue #49):** Fixed BPE space markers `Ġ` (U+0120) appearing in output instead of spaces. Affected: 3+ Mistral-family models (DeepHermes, Mistral-Small-3.2, DeepSeek-R1). Root cause: Broken `tokenizer.json` PreTokenizer regex patterns. Solution: (1) Post-load tokenizer regex patching for encoding fix, (2) `_decode_tokens()` helper using `tokenizer.detokenizer` for proper BPE space marker conversion in both streaming and batch modes. Impact: Correct encoding/decoding, 15% context window waste eliminated. All generation modes fixed (streaming, batch, server). Related: HuggingFace discussion https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84 + +- **Memory Cleanup Hook:** Fixed 60GB swap during wet tests. Hook now triggers for both `live_e2e` and `wet` markers. + +- **Missing Model Skip:** Regression test now skips cleanly when hardcoded model not in cache. + +- **pytest CLI option location:** Moved `--report-output` hook from subdirectory to root conftest.py (pytest requirement). + +- **Cache resolution in pull:** Dynamic `get_current_model_cache()` instead of module-level import (fixes test isolation). + +- **Test Regression:** Fixed 4 unit tests after BPE detokenizer changes. Created `MockDetokenizer` class matching BPEStreamingDetokenizer API. + +### Testing + +- **Benchmark Quality Metrics (OS-agnostic):** Switched from swap-based to RAM-based quality flags. DeepHermes-3-24B steady-state baseline analysis revealed OS-specific memory behavior: Tahoe maintains ~24 GB free RAM with aggressive swap (1-55 GB), Sequoia maintains ~40 GB free with minimal swap (0 MB). Despite 16 GB RAM difference, both OSs achieve similar performance. Identified `ram_free < 5 GB` as universal degradation threshold (extreme memory pressure). Result: 97.8% clean on Tahoe, 96.9% clean on Sequoia. Report generator now recalculates quality flags from raw metrics, ignoring stored flags. Created memory profiling tooling (memmon.py, memplot.py). MLX 0.30.x cleanup verified functional. + +### Community + +- **Repair workflow:** 7+ mlx-community VLM models fixable with `mlxk clone ./ws && mlxk convert ./ws ./ws-fixed --repair-index`. Discord announcement pending mlx-vlm 0.3.10 release. + +--- + ## [2.0.4-beta.4] - 2025-12-25 ### Fixed diff --git a/README.md b/README.md index 91586a4..cfbf802 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,16 @@ MLX Knife Demo

-**Current Version: 2.0.4-beta.4** (Stable: 2.0.3) +**Current Version: 2.0.4-beta.5** (Stable: 2.0.3) -[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.4-blue.svg)](https://github.com/mzau/mlx-knife/releases) +[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.5-blue.svg)](https://github.com/mzau/mlx-knife/releases) [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/) [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-green.svg)](https://support.apple.com/en-us/HT211814) [![MLX](https://img.shields.io/badge/MLX-Latest-orange.svg)](https://github.com/ml-explore/mlx) +**Release Notes:** See [CHANGELOG.md](CHANGELOG.md) for detailed changes, fixes, and migration guides. + ## Features @@ -75,7 +77,7 @@ MLX Knife has been comprehensively tested and verified on: ## Installation -### Via PyPI (Recommended) +### Via PyPI (Stable) ```bash # Basic installation (Text models only, Python 3.9+) @@ -85,18 +87,32 @@ pip install mlx-knife pip install mlx-knife[vision] # Verify installation -mlxk --version # → mlxk 2.0.3 (stable) or 2.0.4-beta.4 (dev) +mlxk --version # → mlxk 2.0.3 (latest stable on PyPI) ``` **Python Requirements:** - **Text models:** Python 3.9-3.14 -- **Vision models:** Python 3.10-3.14 (requires mlx-vlm with Pixtral pad_token fix) +- **Vision models:** Python 3.10-3.14 + +**Note:** Version 2.0.4 is under development. Beta releases are available on GitHub only (see below). + +### Via GitHub (Latest Beta) -**Beta.4 note:** Uses mlx-vlm commit c536165df2b3b4aece3a795b2e414349f935e750 (includes Pixtral text-only fix). The `[vision]` extra automatically installs the correct version: ```bash -pip install mlx-knife[vision] # Installs mlx-vlm from git with fix +# Install 2.0.4-beta.5 (Community repair tools + BPE fix) +pip install "git+https://github.com/mzau/mlx-knife.git@v2.0.4-beta.5" + +# With Vision support (Python 3.10+ required) +pip install "git+https://github.com/mzau/mlx-knife.git@v2.0.4-beta.5#egg=mlx-knife[vision]" + +# Verify installation +mlxk --version # → mlxk 2.0.4b5 ``` +**Beta.5 note:** Uses mlx-vlm commit c536165df2b3b4aece3a795b2e414349f935e750 (includes Pixtral text-only fix). The `[vision]` extra automatically installs the correct version. + +**For production use:** Wait for 2.0.4 stable on PyPI (requires mlx-vlm 0.3.10 release). + ### Development Installation ```bash @@ -111,7 +127,7 @@ pip install -e ".[dev,test]" pip install -e ".[dev,test,vision]" # Verify installation -mlxk --version # → mlxk 2.0.4-beta.4 +mlxk --version # → mlxk 2.0.4b5 # Run tests and quality checks (before committing) pytest -v @@ -143,6 +159,9 @@ mlxk show "mlx-community/Phi-3-mini-4k-instruct-4bit" # Pull a model mlxk pull "mlx-community/Llama-3.2-3B-Instruct-4bit" +# Resume interrupted download (skip prompt) +mlxk pull "model-name" --force-resume + # Run interactive chat mlxk run "Phi-3-mini" -c @@ -184,6 +203,7 @@ open index.html | `rm` | Model deletion with lock cleanup and fuzzy matching | | 🔒 `push` | **Alpha feature** - Upload to HuggingFace Hub; requires `MLXK2_ENABLE_ALPHA_FEATURES=1` | | 🔒 `clone` | **Alpha feature** - Model workspace cloning; requires `MLXK2_ENABLE_ALPHA_FEATURES=1` | +| 🔒 `convert` | **Beta feature** - Workspace transformations (repair-index, quantize); `--repair-index` fixes mlx-vlm #624 models | | 🔒 `pipe mode` | **Beta feature** - Unix pipes with `mlxk run - ...`; requires `MLXK2_ENABLE_PIPES=1` | @@ -457,6 +477,30 @@ mlxk list mlxk list --health mlxk health mlxk show "mlx-community/Phi-3-mini-4k-instruct-4bit" +mlxk pull "mlx-community/Llama-3.2-3B-Instruct-4bit" +``` + +### Pull Command + +Download models from HuggingFace: + +```bash +mlxk pull "mlx-community/Phi-3-mini-4k-instruct-4bit" +``` + +**Interrupted downloads (2.0.4-beta.5+):** If a download fails (network issue, Ctrl-C), `mlxk pull` will detect this and prompt to resume: + +```bash +$ mlxk pull "model-name" +Model 'model-name' has partial download: + No model weights found. Use --force-resume to attempt resume or 'mlxk rm' to delete. +Resume download? [Y/n]: y +``` + +**Automation/scripting:** Use `--force-resume` to skip the prompt: + +```bash +mlxk pull "model-name" --force-resume ``` ### List Filters @@ -731,6 +775,45 @@ mlxk health --json | jq '.data.summary' ## Feature Gates: `clone`, `push` (Alpha), `pipe mode` (Beta) +### Workspace Structure + +A **workspace** is a self-contained directory containing model files in a flat structure (not the HuggingFace cache format). Workspaces are portable, editable, and can be health-checked standalone. + +**Structure:** +``` +workspace/ +├── config.json # Model configuration +├── tokenizer.json # Tokenizer definition +├── tokenizer_config.json # Tokenizer settings +├── model.safetensors # Weights (single file) +├── (or model-*.safetensors) # Weights (multi-shard) +└── README.md # Optional documentation +``` + +**Key characteristics:** + +| Aspect | **Workspace** | **HuggingFace Cache** | +|--------|--------------|----------------------| +| Structure | Flat, self-contained | Nested (hub/models--org--repo/snapshots/...) | +| Models | **Exactly one** model per workspace | Many models (models--org--repo1, models--org--repo2, ...) | +| Purpose | Portable working directory | Download cache (managed) | +| Health Check | Standalone (no cache needed) | Requires cache structure | +| Portability | **Goal:** USB stick, SMB share, any volume | Fixed location (HF_HOME) | +| Ownership | User owns files | Managed by HuggingFace Hub | +| Operations | `clone` (creates), `push` (uploads from) | `pull` (downloads to) | + +**Portability (Phase 1 limitation):** +- **Current:** Same APFS volume as cache (CoW optimization) +- **Community Goal:** Any location (USB stick, SMB share, different volumes) +- **Future:** Cross-volume support planned + +**Typical workflow:** +1. `mlxk pull org/model` → Downloads to cache +2. `mlxk clone org/model workspace/` → Creates editable workspace copy +3. Edit files in `workspace/` (modify config, quantize, etc.) +4. `mlxk push workspace/ org/new-model` → Upload modified version +5. (Optional) Copy workspace to USB stick for sharing + ### `clone` - Model Workspace Creation `mlxk clone` is a hidden alpha feature. Enable with `MLXK2_ENABLE_ALPHA_FEATURES=1`. It creates a local workspace from a cached model for modification and development. @@ -779,6 +862,43 @@ mlxk push --private ./workspace org/model --create --commit "init" These features are not final and may change or be removed in future releases. +### `convert` - Workspace Transformations (Beta) + +`mlxk convert` transforms workspaces (repair, quantize, etc.). The `--repair-index` mode is beta (feature complete) and fixes safetensors index/shard mismatches. + +**Use case:** Repair models affected by mlx-vlm #624 conversion bug (7+ mlx-community Vision models). + +**Workflow:** +```bash +# Enable alpha features (required for clone) +export MLXK2_ENABLE_ALPHA_FEATURES=1 + +# Clone affected model to workspace +mlxk clone mlx-community/Qwen2.5-VL-7B-Instruct-4bit ./ws-qwen + +# Repair safetensors index (no weights changed) +mlxk convert ./ws-qwen ./ws-qwen-fixed --repair-index + +# Verify health +mlxk health ./ws-qwen-fixed # Should report healthy +``` + +**Affected models (mlx-vlm #624):** +- Qwen2.5-VL-7B-Instruct-4bit +- gemma-3-27b-it-4bit +- Mistral-Small-3.1-24B-Instruct-2503-4bit +- DeepSeek-OCR-4bit +- Devstral-Small-2-24B-Instruct-2512-6bit +- (7+ models total) + +**Key features:** +- **Cache sanctity:** Hard blocks writes to HF cache (workspaces only) +- **Workspace-to-workspace:** Source can be managed or unmanaged, output always managed +- **Health check integration:** Automatic validation (skip with `--skip-health`) +- **APFS CoW:** Instant, space-efficient cloning via `cp -c` + +**Future modes:** `--quantize ` (text models), `--dequantize` (planned). + ### `pipe mode` - stdin for `run` (beta, `mlx-run` shorthand) Pipe mode is beta (feature complete) and requires `MLXK2_ENABLE_PIPES=1`. It lets `mlxk run` (and `mlx-run`) read stdin when you pass `-` as the prompt. @@ -883,7 +1003,7 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.

Made with ❤️ by The BROKE team BROKE Logo
- Version 2.0.4-beta.4 | December 2025
+ Version 2.0.4-beta.5 | December 2025
💬 Web UI: nChat - lightweight chat interface🔮 Multi-node: BROKE Cluster

diff --git a/TESTING-DETAILS.md b/TESTING-DETAILS.md index a2f5d7d..f886d4e 100644 --- a/TESTING-DETAILS.md +++ b/TESTING-DETAILS.md @@ -4,16 +4,16 @@ This document contains version-specific details, complete file listings, and imp ## Current Status -✅ **2.0.4-beta.1 (Release Candidate)** — Probe/Policy architecture complete; Vision support Phase 1-3 (CLI + Server); Pipes/Memory-Aware; EXIF metadata; **Test Portfolio Separation complete**. -✅ **Test Suite:** **476-485 unit tests passing** (Py3.9: 476 passed/65 skipped, Py3.10+: 485 passed/56 skipped, baseline without HF_HOME); Live E2E: 139 passed/21 skipped -✅ **Test environment:** macOS 14.x, M2 Max, Python 3.9-3.14 +✅ **2.0.4-beta.5 (WIP for stable)** — Probe/Policy architecture complete; Vision support Phase 1-3 (CLI + Server); Pipes/Memory-Aware; EXIF metadata; **Test Portfolio Separation complete**; Workspace Infrastructure (ADR-018 Phase 0a); Convert Operation (ADR-018 Phase 1). +✅ **Test Suite:** **528 unit tests passing** (Py3.10: 528 passed/60 skipped, baseline without HF_HOME); Live E2E: 144+ passed/21 skipped +✅ **Test environment:** macOS 26.2 (Tahoe), M2 Max, Python 3.9-3.14 ✅ **Production verified & reported:** M1, M1 Max, M2 Max in real-world use ✅ **License:** Apache 2.0 (was MIT in 1.x) ✅ **Isolated test system** - user cache stays pristine with temp cache isolation ✅ **3-category test strategy** - optimized for performance and safety ✅ **Portfolio Separation** - Text and Vision models tested independently with separate RAM formulas -### Skipped Tests Breakdown (56 total Python 3.10+, 65 total Python 3.9, standard run without HF_HOME) +### Skipped Tests Breakdown (60 total Python 3.10+, 69 total Python 3.9, standard run without HF_HOME) - **34 Live E2E tests** - Server/HTTP/CLI validation with real models (requires `pytest -m live_e2e`, ADR-011 + Portfolio Separation) - **23 Text model tests** - Parametrized across text_portfolio (chat completions batch/streaming) - **3 Vision model tests** - Parametrized across vision_portfolio (multimodal, SSE, text-on-vision) @@ -25,6 +25,7 @@ This document contains version-specific details, complete file listings, and imp - **2 Runtime Compatibility tests** - Reason chain validation (requires specific model types) - **1 Live List test** - Tests against user cache (requires HF_HOME with models) - **1 Live Push test** - Real HuggingFace push (requires `MLXK2_LIVE_PUSH=1`) +- **1 Resumable Pull test** - Real network download with controlled interruption (requires `MLXK2_TEST_RESUMABLE_DOWNLOAD=1`) - **2 Show Portfolio tests** - Display text/vision portfolios separately (requires HF_HOME) - **7 Issue #27 tests** - Real-model health validation (requires HF_HOME or MLXK2_USER_HF_HOME setup) @@ -78,6 +79,7 @@ For complete test file structure, see [Appendix](#complete-test-file-structure-2 | Live E2E (ADR-011) | `HF_HOME=/path/to/cache pytest -m live_e2e -v` | `live_e2e` (required) + Env: `HF_HOME` (optional, enables Portfolio Discovery); Requires: `httpx` installed | **✅ Working:** Server/HTTP/CLI validation with real models. Portfolio Discovery auto-discovers all MLX chat models via `mlxk list --json` (filter: MLX+healthy+runtime+chat), parametrized tests (one server per model), RAM-aware skip. | No (uses local cache) | | Vision CLI E2E (ADR-012) | `HF_HOME=/path/to/cache pytest -m live_e2e tests_2.0/live/test_vision_e2e_live.py -v` | `live_e2e` (required) + Env: `HF_HOME` (vision model in cache, e.g., pixtral-12b-8bit or Llama-3.2-Vision); Requires: `mlx-vlm` installed (Python 3.10+) | **✅ Working:** Deterministic vision queries validate actual image understanding (not hallucination). Tests: chess position reading (e6=black king), OCR text extraction (contract name), color recognition (blue mug), chart label reading (Y-axis), large image support (2.7MB). | No (uses local cache) | | Vision Server E2E (ADR-012 Phase 3) | `HF_HOME=/path/to/cache pytest -m live_e2e tests_2.0/live/test_vision_server_e2e.py -v` | `live_e2e` (required) + Env: `HF_HOME` (vision model in cache); Requires: `mlx-vlm` installed (Python 3.10+), `httpx` | **✅ Working:** Vision API over HTTP. Tests: Base64 image chat completion, streaming graceful degradation (SSE emulation), text request on vision model server. | No (uses local cache) | +| Resumable Pull | `MLXK2_TEST_RESUMABLE_DOWNLOAD=1 pytest -m live_resumable tests_2.0/test_resumable_pull.py -v` | `live_resumable` (required) + Env: `MLXK2_TEST_RESUMABLE_DOWNLOAD=1` (opt-in for network test) | **✅ Working:** Real network download with controlled interruption (45s timer). Tests unhealthy detection → `requires_confirmation` status → resume with `force_resume=True` → final health check. Validates resumable pull feature (interrupted downloads can be resumed). Uses isolated cache (no impact on user cache). | Yes (HuggingFace download) | | Show E2E portfolios | `HF_HOME=/path/to/cache python tests_2.0/show_portfolios.py` OR `pytest -m show_model_portfolio -s` | Env: `HF_HOME` | Displays TEXT and VISION portfolios separately. Shows model keys (text_XX, vision_XX), RAM requirements, and test/skip status. Diagnostic tool for understanding portfolio separation. Use script for detailed output, or pytest marker for quick check. | No (uses local cache) | | Manual debug mode | `mlxk run "test prompt" --verbose` | Manual CLI usage with `--verbose` flag | Shows token generation details including multiple EOS token warnings. Use this for manual debugging of model quality issues. Output includes `[DEBUG] Token generation analysis` and `⚠️ WARNING: Multiple EOS tokens detected` for broken models. | No (uses local cache) | | Issue #27 real-model | `pytest -m issue27 tests_2.0/test_issue_27.py -v` | Marker: `issue27`; Env (required): `MLXK2_USER_HF_HOME` or `HF_HOME` (user cache, read-only). Env (optional): `MLXK2_ISSUE27_MODEL`, `MLXK2_ISSUE27_INDEX_MODEL`, `MLXK2_SUBSET_COUNT=0`. | Copies real models from user cache into isolated test cache; validates strict health policy on index-based models (no network) | No (uses local cache) | @@ -115,6 +117,12 @@ HF_HOME=/path/to/user/cache pytest -m live_run -v # Live E2E only (ADR-011) HF_HOME=/path/to/user/cache pytest -m live_e2e -v # See model list: pytest tests_2.0/live/test_server_e2e.py::TestChatCompletionsBatch --collect-only -q +# Resumable Pull only (separate run - uses isolated cache) +MLXK2_TEST_RESUMABLE_DOWNLOAD=1 pytest -m live_resumable tests_2.0/test_resumable_pull.py -v + +# Empirical Mapping only (model benchmarking - excluded from wet due to RAM) +pytest -m live_stop_tokens tests_2.0/test_stop_tokens_live.py::TestStopTokensEmpiricalMapping -v + # Issue #27 only MLXK2_USER_HF_HOME=/path/to/user/cache pytest -m issue27 tests_2.0/test_issue_27.py -v @@ -124,6 +132,360 @@ MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m wet -v --- +## Test Directory Organization + +### Fundamental Definitions (Single Source of Truth) + +**User Cache (Singleton - ONE instance per system)** +``` +What: The HuggingFace cache directory owned by the user +Location: Defined by HF_HOME environment variable + Default: ~/.cache/huggingface + Example: /Volumes/ExternalSSD/huggingface/cache +Lifecycle: Permanent - survives test runs, system reboots +Ownership: Belongs to USER, NOT to tests +Instances: Exactly ONE per system/user +``` + +**Isolated Cache (Class with instances - MANY per test run)** +``` +What: Temporary cache directories created by isolated_cache fixture +Location: System temp OR APFS volume root (for CoW optimization) + Example: /Volumes/ExternalSSD/.mlxk2_test_isolation/mlxk2_test_xyz123/test_cache/hub +Lifecycle: Temporary - created before test, deleted after test (seconds) +Ownership: Belongs to SPECIFIC TEST, isolated from others +Instances: NEW instance PER test function (pytest scope="function") + 5 tests = 5 separate isolated cache instances +``` + +**Instance Model:** +- **User Cache:** Singleton pattern - shared resource, READ-only in tests +- **Isolated Cache:** Factory pattern - `isolated_cache()` fixture creates new instance each time + +**Lifecycle Diagram:** +``` +Test 1: def test_foo(isolated_cache): + [SETUP] Create /tmp/mlxk2_test_abc123/ ← Instance 1 + [TEST] Use isolated cache + [TEARDOWN] Delete /tmp/mlxk2_test_abc123/ ✓ Instance 1 destroyed + +Test 2: def test_bar(isolated_cache): + [SETUP] Create /tmp/mlxk2_test_xyz789/ ← Instance 2 (NEW, separate) + [TEST] Use isolated cache + [TEARDOWN] Delete /tmp/mlxk2_test_xyz789/ ✓ Instance 2 destroyed +``` + +**Sentinel Safety Mechanism:** + +Every isolated cache contains a sentinel model: `models--TEST-CACHE-SENTINEL--mlxk2-safety-check` + +```python +# Fixture setup (Line 464-468 conftest.py) +sentinel_dir = hub_path / TEST_SENTINEL +sentinel_snapshot = sentinel_dir / "snapshots" / "test123..." +sentinel_snapshot.mkdir(parents=True) +(sentinel_snapshot / "config.json").write_text('{"model_type": "test_sentinel", "test_cache": true}') + +# Fixture teardown (Line 498-500) +_safe_rmtree(temp_dir_path, signature_id) # ← Checks signature before delete +``` + +**How Sentinel protects User Cache:** +1. Test code tries to delete a directory +2. `_safe_rmtree()` checks: Does this directory contain TEST_SENTINEL? +3. **NO** → ❌ REFUSE deletion (could be User Cache!) +4. **YES** → ✅ OK to delete (is Test Cache) + +**What it prevents:** +- Accidental deletion if `HF_HOME` wrongly points to User Cache +- Bugs in test code using wrong paths +- Race conditions between tests +- Catastrophic data loss (User Cache = gigabytes of downloaded models) + +**Environment Variables during test:** +```python +# Before test: +HF_HOME = /Volumes/ExternalSSD/huggingface/cache # User Cache + +# During test (isolated_cache fixture): +HF_HOME = /tmp/mlxk2_test_abc123/test_cache # Isolated Cache (instance 1) +MLXK2_STRICT_TEST_DELETE = 1 # Enable safety checks + +# After test: +HF_HOME = /Volumes/ExternalSSD/huggingface/cache # Restored +MLXK2_STRICT_TEST_DELETE = # Restored +``` + +**Workspace (Separate Concept - NOT a Cache)** + +Workspace is semantically distinct from Cache - it's a **self-contained, portable** directory for Clone/Push operations. + +``` +What: Self-contained directory with model files (standalone health-check capable) +Purpose: Clone target (output) OR Push source (input) +Location: User-specified path (production) OR tmp_path (tests) + Goal: Any location (USB stick, SMB share, different volumes) + Phase 1: Same APFS volume as cache (CoW optimization) +Structure: Flat directory with config.json + weights (*.safetensors, *.gguf, etc.) + NOT HuggingFace cache structure (no hub/snapshots/blobs) +Lifecycle: Permanent (user owns it) OR temporary (tests use tmp_path) +Portable: Yes (conceptually) - can be copied, moved, shared via USB/SMB +``` + +**Workspace vs Cache:** +- **Cache**: HuggingFace internal structure (hub/models--org--repo/snapshots/...), **many models** +- **Workspace**: User-facing structure (config.json, model.safetensors, tokenizer.json, ...), **exactly one model** + +**Self-contained health check:** +- Workspace contains all files needed for validation +- Can be checked without HuggingFace cache +- `mlxk push --check-only workspace/` validates standalone + +**Workspace in production:** +- **Clone**: `mlxk clone org/model /path/to/workspace` → Creates workspace at user-specified path +- **Push**: `mlxk push /path/to/workspace org/model` → Uploads from user-specified path +- **Validation**: Clone requires empty directory, Push requires valid model structure +- **Portability**: Phase 1 requires same APFS volume (limitation), future: any location + +**Workspace in tests:** +- Uses pytest's `tmp_path` fixture (NOT `isolated_cache`) +- Pattern: `target_dir = str(tmp_path / "workspace")` +- Example: `test_clone_operation.py` line 489, `test_push_workspace_check.py` line 18 + +**Workspace safety (Clone operation):** +- Temp cache during clone: `.mlxk2_temp_cache_sentinel` (cleanup protection) +- Sentinel is in temp cache, NOT in final workspace +- Temp cache deleted after successful clone → workspace remains + +**Workspace dimension in truth table:** + +| Operation | Workspace Location | Cache Type | Allowed? | +|-----------|-------------------|------------|----------| +| **Clone (production)** | User-specified path | Isolated temp cache | ✅ | +| **Clone (tests)** | `tmp_path / "workspace"` | Isolated temp cache | ✅ | +| **Push (production)** | User-specified path | N/A (upload only) | ✅ | +| **Push (tests)** | `tmp_path / "ws"` | N/A (offline `--check-only`) | ✅ | +| **Clone/Push (NEVER)** | Inside User Cache | User Cache | ❌ FORBIDDEN | +| **Clone/Push (wrong)** | Inside Isolated Cache | Isolated Cache | ❌ Semantically wrong | + +**Why Workspace ≠ Cache:** +- Different structure (flat vs HF nested) +- Different ownership (user vs HF tooling) +- Different purpose (working directory vs download cache) +- Different lifecycle (permanent vs managed) + +--- + +**CRITICAL RULE:** ❌ **NEVER write to User Cache** ❌ All writes must go to isolated cache or external destinations (HuggingFace, workspace). + +### Truth Table: Cache Type × Operation + +| Cache Type | Read | Write | Allowed? | Example Tests | Directory | +|------------|------|-------|----------|---------------|-----------| +| **User Cache** | ✅ | ❌ | **READ ONLY** | Portfolio Discovery, E2E with real models | `tests_2.0/live/` | +| **Isolated Cache** | ✅ | ✅ | **Both allowed** | Mock models, fresh downloads, safety copies | `tests_2.0/`, `tests_2.0/spec/` | + +### Extended Truth Table: Portfolio Discovery Compatibility + +Portfolio Discovery fixtures (`vision_portfolio`, `text_portfolio`) use module scope and run subprocesses, creating import-state side-effects. Most tests are compatible, but some require clean import state. + +| Test Category | Cache Type | Fixtures Used | Portfolio Compatible? | Marker | Run Group | +|------------------------|------------|-------------------------|-----------------------|-----------------|-----------| +| Portfolio Discovery | User READ | vision/text_portfolio | ✅ (source) | live_e2e | wet | +| Stop Token Validation | User READ | vision/text_portfolio | ✅ (uses it) | live_stop_tokens| wet | +| User Cache Read | User READ | isolated_cache (copy) | ✅ (no conflict) | live_run/list | wet | +| Workspace Operations | N/A | tmp_path | ✅ (cache-agnostic) | live_push/clone | wet | +| Issue Reproduction | User→Iso | isolated_cache + copy | ✅ (no conflict) | issue27 | wet | +| Isolated Cache Write | Isolated | isolated_cache (fresh) | ❌ (import conflict) | live_resumable | separate | + +**Run Groups:** +- `wet`: Can run together in one pytest invocation +- `separate`: Must run in separate pytest process + +**User Experience:** +```bash +./scripts/test-wet-umbrella.sh # Runs both groups automatically +``` + +### Decision Tree: Categorizing New Tests + +When writing a new test, follow this decision tree: + +**Question 1:** Does your test use Portfolio Discovery fixtures (vision/text_portfolio)? +├─ YES → Marker: `live_e2e` or `live_stop_tokens` → Run group: **wet** ✅ +└─ NO → Continue to Question 2 + +**Question 2:** Does your test need fresh downloads (Isolated Cache WRITE)? +├─ YES → Marker: `live_resumable` → Run group: **separate** ⚠️ +└─ NO → Continue to Question 3 + +**Question 3:** What does your test use? +├─ User Cache (READ only) → Markers: `live_run`, `live_list`, `issue27` → Run group: **wet** ✅ +├─ Workspace (tmp_path) → Markers: `live_push`, `live_clone` → Run group: **wet** ✅ +└─ Isolated Cache (copy/mock) → Standard markers → Unit test (not live) + +### Compatibility Rule (Technical Background) + +**Why separate runs?** + +Portfolio Discovery fixtures manipulate import state via subprocesses: +- Subprocess runs: `mlxk list --json [--vision]` +- Side-effect: Imports `mlx_lm`, `mlx_vlm`, `transformers`, `huggingface_hub` into pytest process +- Module scope: Remains active across all tests in run + +**Compatible tests:** Can tolerate polluted import state +- Tests using Portfolio fixtures (they expect it) +- Tests with User Cache READ (no import-sensitive operations) +- Tests with Workspace (cache-agnostic) + +**Incompatible tests:** Require clean import state +- `live_resumable`: HuggingFace Hub needs clean imports for symlink creation +- Fresh downloads fail with polluted `sys.modules` + +**Solution:** Separate pytest runs ensure clean import state for incompatible tests. + +For pytest implementation details, see Appendix below. + +### conftest.py Scope Rules + +**Pytest conftest.py files form a hierarchy** - parent conftest applies to all children, but child conftest should ONLY apply to their directory. + +**Rule:** Subdirectory `conftest.py` files MUST limit their scope to their own directory to avoid interfering with sibling/parent tests. + +**Implementation pattern:** +```python +# tests_2.0/live/conftest.py + +@pytest.fixture(scope="function", autouse=True) +def _skip_unless_live_e2e_marker(request): + """Autouse fixture that ONLY applies to tests in live/ directory.""" + # CRITICAL: Early return for tests outside this directory + test_path = str(request.node.path) + if "/live/" not in test_path and "\\live\\" not in test_path: + return # Skip fixture for tests outside live/ directory + + # Rest of fixture logic... + +def pytest_generate_tests(metafunc): + """Hook that ONLY applies to tests in live/ directory.""" + # CRITICAL: Early return for tests outside this directory + test_path = str(metafunc.definition.path) + if "/live/" not in test_path and "\\live\\" not in test_path: + return # Skip hook for tests outside live/ directory + + # Rest of hook logic (Portfolio Discovery, parametrization, etc.) +``` + +**Rationale:** +- `tests_2.0/live/` contains User Cache tests (Portfolio Discovery) +- `tests_2.0/` contains Isolated Cache tests (fresh downloads, mocks) +- Portfolio Discovery hooks (`pytest_generate_tests`, `autouse` fixtures) should NOT apply to Isolated Cache tests +- Without scope limitation: `live/conftest.py` hooks interfere with `isolated_cache` fixture in parent tests + +**Test hierarchy:** +``` +tests_2.0/ +├── conftest.py # Global: applies to ALL tests (isolated_cache, assert_is_test_cache, etc.) +├── test_resumable_pull.py # Uses isolated_cache → must NOT be affected by live/conftest.py +├── live/ +│ ├── conftest.py # Local: MUST limit scope to /live/ only (Portfolio Discovery) +│ └── test_server_e2e.py # Uses Portfolio Discovery → affected by live/conftest.py +└── spec/ + ├── conftest.py # (if exists) Local: MUST limit scope to /spec/ only + └── test_*.py +``` + +**Verification:** +- Test in `tests_2.0/test_resumable_pull.py` with marker `live_resumable` should collect as 1 test (NOT parametrized) +- Test in `tests_2.0/live/test_server_e2e.py` with `text_model_key` should parametrize across portfolio + +### Test Categories by Cache Strategy + +**Category 1: User Cache READ (Portfolio Discovery)** +- **Location:** `tests_2.0/live/` +- **Cache:** Direct User Cache via `HF_HOME` environment +- **Operation:** READ only (via `mlxk list --json`, server load, etc.) +- **Examples:** + - `test_server_e2e.py` - Parametrized tests across text_portfolio (23 models) + - `test_vision_e2e_live.py` - Vision CLI with real models from cache + - `test_stop_tokens_live.py` - Stop token validation with discovered models +- **Why in live/:** Portfolio Discovery hooks (`pytest_generate_tests`) auto-discover models from User Cache +- **Count:** 10+ tests + +**Category 2: Isolated Cache WRITE (Fresh State)** +- **Location:** `tests_2.0/` +- **Cache:** Isolated temp cache (empty at start) +- **Operation:** WRITE (download, create mock models) +- **Examples:** + - `test_resumable_pull.py` - Downloads fresh from HuggingFace into isolated cache + - `test_robustness.py` - Creates mock model files for testing + - `test_integration.py` - Synthetic model creation for integration tests +- **Why in parent:** Needs clean pytest environment without Portfolio Discovery hooks +- **Count:** 15+ tests + +**Category 3: Isolated Cache READ (Safety Copies)** +- **Location:** `tests_2.0/` +- **Cache:** Isolated temp cache (copied from User Cache) +- **Operation:** WRITE (copy) then READ (test on copy) +- **Examples:** + - `test_issue_27.py` - Copies real models from User Cache, mutates copies, tests health + - `test_issue_37_private_org_regression.py` - Copies model, renames to simulate private repo +- **Why copy:** Protects User Cache from mutations (delete shards, truncate, inject LFS pointers) +- **Count:** 2 tests + +**Category 4: Spec/Schema Validation** +- **Location:** `tests_2.0/spec/` +- **Cache:** Isolated cache with minimal mocks +- **Operation:** WRITE (mock creation) + READ +- **Examples:** + - `test_cli_commands_json_flag.py` - JSON output format validation + - `test_code_outputs_validate_against_schema.py` - Schema compliance +- **Why isolated:** Fast, deterministic, no real models needed +- **Count:** 7 tests + +**Special Cases (External Writes):** +- `test_push_live.py` - Writes to **HuggingFace** (not local cache) ✅ Allowed +- `test_clone_live.py` - Writes to **workspace directory** (not cache) ✅ Allowed + +### Why Resumable Pull MUST stay in `tests_2.0/` + +**The conflict:** +- Resumable Pull needs **empty isolated cache** (Category 2) +- `tests_2.0/live/` expects **populated User Cache** (Category 1) +- Portfolio Discovery hooks (`pytest_generate_tests`) run during collection, expecting models in HF_HOME +- When test uses `isolated_cache` in `live/`, hooks interfere with cache isolation + +**Observed:** +- ✅ `tests_2.0/test_resumable_pull.py` → 2.15GB downloaded, PASS +- ❌ `tests_2.0/live/test_resumable_pull.py` → 0 bytes downloaded, FAIL + +### Decision Tree: Where does my test belong? + +**Ask yourself:** + +1. **Does my test READ from User Cache?** + - YES, and needs Portfolio Discovery (test many models) → `tests_2.0/live/` + - YES, but only one specific model → Copy to isolated cache, use `tests_2.0/` + - NO → Continue to question 2 + +2. **Does my test WRITE?** + - Write to **User Cache** → ❌ **FORBIDDEN** - redesign your test + - Write to **isolated cache** (fresh download, mock creation) → `tests_2.0/` + - Write to **external** (HuggingFace, workspace) → `tests_2.0/` or `tests_2.0/live/` depending on read source + +3. **Does my test use mock/stub models?** + - YES, schema validation only → `tests_2.0/spec/` + - YES, but needs integrated testing → `tests_2.0/` + - NO → See questions 1-2 + +**Summary:** +- User Cache READ + Many models → `tests_2.0/live/` (Portfolio Discovery) +- User Cache READ + Safety copies → `tests_2.0/` (copy_user_model_to_isolated) +- Isolated Cache WRITE → `tests_2.0/` (fresh state, no hooks) +- Schema/Mock only → `tests_2.0/spec/` (fast validation) + +--- + ## ⚠️ CRITICAL: Sequential Execution Required for E2E Tests **DO NOT use parallel execution with `-m live_e2e` tests:** @@ -170,15 +532,15 @@ HF_HOME=/path/to/cache pytest -m live_e2e -n auto # ← NEVER DO THIS! | Python Version | Status | Tests Passing | Skipped | Notes | |----------------|--------|---------------|---------|-------| -| 3.9.6 (macOS) | ✅ Verified | 476/541 | 65 | Vision tests auto-skip (mlx-vlm requires 3.10+) | -| 3.10.x | ✅ Verified | 485/541 | 56 | Full suite including vision tests | -| 3.11.x | ✅ Verified | 485/541 | 56 | Full suite including vision tests | -| 3.12.x | ✅ Verified | 485/541 | 56 | Full suite including vision tests | -| 3.13.x | ✅ Verified | 485/541 | 56 | Full suite including vision tests | -| 3.14.x | ✅ Verified | 485/541 | 56 | Full suite including vision tests | +| 3.9.6 (macOS) | ✅ Verified | 519/588 | 69 | Vision tests auto-skip (mlx-vlm requires 3.10+) | +| 3.10.x | ✅ Verified | 528/588 | 60 | Full suite including vision tests | +| 3.11.x | ✅ Verified | 528/588 | 60 | Full suite including vision tests | +| 3.12.x | ✅ Verified | 528/588 | 60 | Full suite including vision tests | +| 3.13.x | ✅ Verified | 528/588 | 60 | Full suite including vision tests | +| 3.14.x | ✅ Verified | 528/588 | 60 | Full suite including vision tests | -**Note:** 56 skipped tests (65 on Python 3.9) are opt-in (live tests, alpha features). Skipped count may vary by environment: -- Without `HF_HOME`: Standard 56 skipped (65 on Py3.9, live E2E tests use fallback parametrization) +**Note:** 60 skipped tests (69 on Python 3.9) are opt-in (live tests, alpha features). Skipped count may vary by environment: +- Without `HF_HOME`: Standard 60 skipped (69 on Py3.9, live E2E tests use fallback parametrization) - With `HF_HOME`: Live E2E tests run with discovered models across text_portfolio (23) and vision_portfolio (3) All versions tested with `isolated_cache` system and MLX stubs for fast execution without model downloads. @@ -798,11 +1160,90 @@ pytest -m live_e2e --collect-only # Should work without errors ## Appendix -### Test Environment Variables Reference +### A1. Portfolio Discovery Fixture Compatibility + +**Implementation:** `tests_2.0/conftest.py` + +The `pytest_collection_modifyitems` hook auto-assigns the `wet` marker based on test markers and location: + +```python +# Declarative compatibility set +LIVE_MARKERS_FOR_WET = { + # Portfolio Discovery users + "live_e2e", # Uses Portfolio fixtures + "live_stop_tokens", # Uses Portfolio fixtures + + # User Cache READ + "live_run", # User Cache READ only + "live_list", # User Cache READ only + "issue27", # User Cache READ + copy + + # Workspace operations + "live_push", # Workspace only (tmp_path) + "live_clone", # Workspace only (tmp_path) +} + +def pytest_collection_modifyitems(config, items): + """Auto-assign wet marker based on compatibility.""" + for item in items: + test_markers = {m.name for m in item.iter_markers()} + test_path = str(item.path) + is_in_live_dir = "/live/" in test_path or "\\live\\" in test_path + + # Wet marker for compatible tests + if (test_markers & LIVE_MARKERS_FOR_WET) or is_in_live_dir: + # EXCLUDE live_resumable (incompatible) + if "live_resumable" not in test_markers: + item.add_marker(pytest.mark.wet) +``` + +### A2. Why sys.modules Pollution Matters + +**pytest runs in single Python process:** +- All tests share: `sys.modules`, `sys.path` +- Module-scoped fixtures remain active across tests +- Import pollution persists until pytest exit + +**Portfolio Discovery side-effects:** +```python +# live/conftest.py:vision_portfolio (module scope) +def vision_portfolio(): + discover_vision_models() # Runs: mlxk list --json --vision + # Subprocess imports pollute pytest process sys.modules +``` + +**Why live_resumable breaks:** +```python +# test_resumable_pull.py:190 +pull_operation(model, force_resume=True) # In pytest process +# Imports huggingface_hub → finds polluted version in sys.modules +# Symlink creation fails +``` + +**Evidence:** +- Solo run: ✅ No Portfolio Discovery → clean imports → works +- Multi-test run: ❌ Portfolio Discovery → polluted imports → fails + +### A3. Why Not Function Scope for Portfolio? + +**Current (module scope):** +- Portfolio Discovery runs 1x per module +- 26 tests → 2x subprocess (text + vision) → fast + +**Alternative (function scope):** +- Portfolio Discovery per test +- 26 tests → 52x subprocess → 10-20x slower +- Subprocess overhead: Model enumeration + RAM calculation + +**Decision:** Module scope is performance optimization, not bug. + +--- + +### A4. Test Environment Variables Reference This section documents environment variables used exclusively for testing and development. For user-facing configuration, see the "Configuration Reference" section in [README.md](README.md). -#### Test Control Variables +#### A4.1 Test Control Variables These variables control test behavior and should only be set when running the test suite: @@ -812,7 +1253,7 @@ These variables control test behavior and should only be set when running the te | `MLXK2_STRICT_TEST_DELETE` | Enforce strict deletion checks in `rm` tests | Set to `1` in test suite to validate error handling | | `HF_HUB_DISABLE_PROGRESS_BARS` | Disable HuggingFace download progress bars | Set automatically by MLX Knife; don't set manually | -#### Live Test Environment Variables +#### A4.2 Live Test Environment Variables These variables enable optional live tests that interact with real models or external services: @@ -829,6 +1270,7 @@ These variables enable optional live tests that interact with real models or ext | `MLXK2_ISSUE27_INDEX_MODEL` | Index-based model for Issue #27 | `pytest -m issue27` | | `MLXK2_SUBSET_COUNT` | Limit Issue #27 test count | `pytest -m issue27` | | `MLXK2_BOOTSTRAP_INDEX` | Auto-download model for Issue #27 | `pytest -m issue27` | +| `MLXK2_TEST_RESUMABLE_DOWNLOAD` | Enable resumable pull tests (requires network) | `pytest -m live_resumable tests_2.0/test_resumable_pull.py` | **Example:** ```bash @@ -847,12 +1289,15 @@ MLXK2_LIVE_PUSH=1 \ --- -### Complete Test File Structure (2.0.4-beta.3) +### A5. Complete Test File Structure (2.0.4-beta.5) ``` +scripts/ +└── test-wet-umbrella.sh # Single entry point for all real tests (wet + resumable), memory-optimized (--tb=no --capture=sys) + tests_2.0/ ├── __init__.py -├── conftest.py # Isolated test cache (HF_HOME override), safety sentinel, core fixtures +├── conftest.py # Isolated test cache (HF_HOME override), safety sentinel, core fixtures, wet marker hook, memory cleanup (live_e2e+wet), pytest_addoption (--report-output) ├── conftest_runner.py # Runner-specific fixtures/mocks ├── show_portfolios.py # Diagnostic tool: Display text/vision portfolios with RAM estimates ├── stubs/ # Minimal mlx/mlx_lm stubs for unit/spec tests @@ -883,6 +1328,7 @@ tests_2.0/ │ ├── test_portfolio_fixtures.py # Portfolio separation validation tests (7 tests: fixture behavior, disjoint check) │ ├── test_push_live.py # Live push flow (requires MLXK2_LIVE_PUSH, HF_TOKEN) │ ├── test_server_e2e.py # Server E2E tests with TEXT models (ADR-011 + Portfolio Separation, parametrized: text_XX) +│ ├── test_show_portfolio.py # Portfolio display (marker: show_model_portfolio, requires HF_HOME) │ ├── test_streaming_parity.py # Streaming vs batch parity tests (Issue #20, ADR-011, parametrized) │ ├── test_vision_e2e_live.py # Vision CLI E2E tests with real models (ADR-012, 5 deterministic vision queries) │ ├── test_vision_server_e2e.py # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX) @@ -917,6 +1363,7 @@ tests_2.0/ ├── test_push_minimal.py # Minimal push scenarios (offline) ├── test_push_workspace_check.py # Push check-only: workspace validation without network ├── test_ram_calculation.py # RAM calculation unit tests (11 tests: text 1.2x, vision 0.70 threshold, system memory) +├── test_resumable_pull.py # Resumable download tests (real network download with controlled interruption) ├── test_robustness.py # Robustness for rm/pull/disk/timeout/concurrency ├── test_run_complete.py # End-to-end run command (stream/batch/params) ├── test_run_vision.py # Vision runner unit tests (ADR-012 Phase 1b, VisionRunner routing, default prompt) @@ -931,7 +1378,9 @@ tests_2.0/ ├── test_stop_tokens_live.py # Stop token validation with real models (marker: live_stop_tokens, ADR-009) ├── test_token_limits.py # Dynamic token calculation; server vs run policies ├── test_vision_adapter.py # Vision HTTP adapter unit tests (46 tests: Base64 decoding, OpenAI format parsing, sequential images, image ID persistence) -└── test_vision_exif.py # EXIF extraction tests (ADR-017 Phase 1, 8 tests: GPS, DateTime, Camera, collapsible table, privacy controls) +├── test_vision_exif.py # EXIF extraction tests (ADR-017 Phase 1, 8 tests: GPS, DateTime, Camera, collapsible table, privacy controls) +├── test_workspace_sentinel.py # Workspace infrastructure tests (ADR-018 Phase 0a, 20 tests: sentinel primitives, atomic write, managed/unmanaged detection, health checks, CLI integration) +└── test_convert_repair_index.py # Convert operation tests (ADR-018 Phase 1, 11 tests: rebuild_safetensors_index, cache sanctity, workspace sentinels, validation) ``` --- @@ -963,62 +1412,4 @@ mlxk run mlx-community/Phi-3-mini-4k-instruct-4bit "Write one sentence about cat --- -### Version History - -### 2.0.4-beta.1 (Release Candidate, 2025-12-16) -- ✅ **Vision Server integration (ADR-012 Phase 3, Session 24):** - - Backend-aware `get_or_load_model()`: Loads MLXRunner OR VisionRunner based on policy - - `ChatMessage.content` extended: `Union[str, List[Dict]]` for OpenAI Vision format - - `_handle_vision_chat_completion()`: Vision request handler with cache reuse - - `_handle_text_chat_completion()`: Supports both runner types - - Streaming rejection (HTTP 400) for vision models - - 17 new unit tests in `test_server_vision.py` - - 3 new E2E tests in `test_vision_server_e2e.py` -- ✅ **Test count:** 430 passed, 41 skipped (17 new tests from Session 24) -- ✅ **Live E2E:** 105 passed, 5 failed, 20 skipped - - **Known failures:** discovered_04/25 (Vision models) HTTP 404 in generic E2E tests (not vision-specific tests) - - **Fixed:** test_chess_position_e6 (increased max_tokens, relaxed assertion for "black" OR "king") - -### 2.0.3 (2025-11-17) -- ✅ **Test updates for stderr separation:** 4 test files modified to verify errors go to stderr (human mode) - - `test_interactive_mode.py`: 2 tests patching stderr for ERROR messages - - `test_run_complete.py`: 2 tests validating stderr error handling - - `test_cli_run_exit_codes.py`: 3 tests checking stdout/stderr separation in JSON mode - - `test_cli_push_args.py`: 2 tests verifying push stdout/stderr returns -- ✅ **Benchmark reporting infrastructure:** 4 live test files updated with benchmark fixtures - - `live/conftest.py`: +225 lines - `report_benchmark()` fixture, `_parse_model_family()` helper - - `live/test_cli_e2e.py`: Benchmark metadata reporting (family, variant, stop_tokens) - - `live/test_server_e2e.py`: Benchmark metadata + performance (usage) data - - `live/test_streaming_parity.py`: Portfolio Discovery refactoring (uses `mlxk list --json`) -- ✅ **Interactive mode reasoning control:** 2 new tests added (review_report.md) - - `test_interactive_mode.py`: 1 test for `hide_reasoning` parameter passing - - `test_run_complete.py`: 1 test for `TestRunReasoningControl` class -- ✅ **Test count:** 308 passed, 35 skipped (+2 tests from review_report.md fixes) - -### 2.0.2 (2025-11-14) -- ✅ Test infrastructure hardening (TOKENIZERS_PARALLELISM, active polling, gc.collect()) -- ✅ Portfolio Discovery validation complete (73/81 E2E tests, 17 models discovered) -- ✅ Sequential execution warning added (TESTING-DETAILS.md CRITICAL section) -- ✅ RAM logging infrastructure added (server_context.py, for future benchmark tooling) - -### 2.0.2-dev (2025-11-13) -- ✅ Stop token ordering bugs fixed (batch + streaming modes) -- ✅ E2E test suite completed (ADR-011) -- ✅ 72/80 E2E tests passing baseline (15 models tested) -- ✅ TESTING.md restructuring (timeless policy + version-specific details split) - -### 2.0.1 (2025-11-28) -- ✅ Portfolio Discovery (Issue #32, ADR-009) -- ✅ CLI exit code propagation (Issue #38) -- ✅ 306/306 tests passing - -### 2.0.0 (2025-11-06) -- ✅ Complete rewrite with Apache 2.0 license -- ✅ JSON-first architecture -- ✅ Isolated test system with sentinel protection -- ✅ MLX stubs for fast testing without model downloads -- ✅ 3-category test strategy - ---- - -*MLX-Knife 2.0.4-beta.1 Testing Details* +*MLX-Knife 2.0 Testing Details* diff --git a/TESTING.md b/TESTING.md index cb027c5..4f7a24a 100644 --- a/TESTING.md +++ b/TESTING.md @@ -12,7 +12,14 @@ For current test counts, version-specific details, and complete file listings, s - **Isolated by default** - User cache stays pristine with sentinel protection - **Opt-in live tests** - Network/model tests require explicit markers/environment - **Mock-heavy** - MLX stubs enable fast testing without model downloads -- **Fast feedback** - 300+ tests run in seconds on any Apple Silicon Mac +- **Fast feedback** - 500+ tests run in seconds on any Apple Silicon Mac + +**Cache Architecture:** +- **User Cache (Singleton):** ONE permanent cache per system - READ-ONLY in tests +- **Isolated Cache (Factory):** NEW temporary cache PER test - full read/write +- **Sentinel Safety:** Automatic protection prevents accidental User Cache deletion + +See [TESTING-DETAILS.md → Fundamental Definitions](TESTING-DETAILS.md#fundamental-definitions-single-source-of-truth) for complete cache architecture and safety mechanisms. **Safety First:** - Tests use temporary caches with `TEST_SENTINEL` protection @@ -45,6 +52,24 @@ ruff check mlxk2/ --fix && mypy mlxk2/ && pytest -v **That's it!** Default tests use isolated caches and MLX stubs - no model downloads required. +## Running All Real Tests + +**Single command (recommended):** +```bash +./scripts/test-wet-umbrella.sh +``` + +This runs all real tests in the correct order. For details on test categories, see [TESTING-DETAILS.md](TESTING-DETAILS.md). + +**Manual execution (advanced):** +```bash +# Portfolio-compatible tests +pytest -m wet -v + +# Isolated Cache WRITE tests +MLXK2_TEST_RESUMABLE_DOWNLOAD=1 pytest -m live_resumable -v +``` + ## Test Categories ### Category 1: Isolated Cache (Default) @@ -140,11 +165,25 @@ tests_2.0/ **Legend:** - `spec/` - API contract validation (stays in sync with `docs/schema`) -- `live/` - Opt-in tests requiring environment (markers: `live_*`) +- `live/` - **User Cache READ only** - Portfolio Discovery tests (parametrized across many models) - `stubs/` - Lightweight MLX replacements for unit tests - `conftest.py` - Isolated HF cache (temp), safety sentinel, fixtures + - Parent `conftest.py` applies globally + - Subdirectory `conftest.py` (live/, spec/) MUST limit scope to own directory only + - See [TESTING-DETAILS.md → conftest.py Scope Rules](TESTING-DETAILS.md#conftestpy-scope-rules) -See [TESTING-DETAILS.md](TESTING-DETAILS.md) for complete file listing with descriptions. +**CRITICAL RULE:** ❌ **NEVER write to User Cache** ❌ + +**Test organization by cache strategy:** +- **User Cache READ** → `tests_2.0/live/` (Portfolio Discovery with many models) +- **Isolated Cache WRITE** → `tests_2.0/` (fresh downloads, mock creation) +- **Isolated Cache READ** → `tests_2.0/` (safety copies from User Cache) +- **Schema validation** → `tests_2.0/spec/` (mocks, fast) +- **Workspace operations** → `tmp_path` fixture (Clone/Push tests, separate from cache) + +**Note:** Workspace is semantically distinct from Cache - see [TESTING-DETAILS.md → Workspace](TESTING-DETAILS.md#workspace-separate-concept---not-a-cache) for details. + +See [TESTING-DETAILS.md → Truth Table](TESTING-DETAILS.md#truth-table-cache-type--operation) for complete categorization and decision tree. ## MLX Stubs (Fast Testing Without Model Downloads) @@ -344,9 +383,9 @@ When submitting PRs with test changes, please include: 2. **Test results** (example): ``` - Platform: macOS 14.6, M2 Max - Python: 3.9.6 - Results: 476 passed, 65 skipped + Platform: macOS 26.2 (Tahoe), M2 Max + Python: 3.10.x + Results: 528 passed, 60 skipped ``` 3. **Any issues encountered** and resolutions @@ -373,7 +412,7 @@ ruff check mlxk2/ --fix && mypy mlxk2/ && pytest -v **MLX Knife Testing:** - ✅ **Isolated by default** - User cache stays pristine -- ✅ **Fast feedback** - 400+ tests run in seconds without model downloads +- ✅ **Fast feedback** - 500+ tests run in seconds without model downloads - ✅ **Low requirements** - 16GB RAM, ~20MB disk, no HF cache needed - ✅ **Opt-in live tests** - Real models/network when needed - ✅ **Multi-Python support** - Verified on Python 3.9-3.14 diff --git a/benchmarks/generate_benchmark_report.py b/benchmarks/generate_benchmark_report.py index e49d79f..bbed403 100644 --- a/benchmarks/generate_benchmark_report.py +++ b/benchmarks/generate_benchmark_report.py @@ -106,13 +106,40 @@ def calculate_statistics(data: List[dict]) -> Dict: for e in data: if "system_health" in e: - swap_values.append(e["system_health"].get("swap_used_mb", 0)) - ram_values.append(e["system_health"].get("ram_free_gb", 0)) - zombie_values.append(e["system_health"].get("zombie_processes", 0)) - quality_flags.append(e["system_health"].get("quality_flags", ["unknown"])) + swap_mb = e["system_health"].get("swap_used_mb", 0) + ram_gb = e["system_health"].get("ram_free_gb", 0) + zombies = e["system_health"].get("zombie_processes", 0) + + swap_values.append(swap_mb) + ram_values.append(ram_gb) + zombie_values.append(zombies) + + # Recalculate quality_flags from raw values (ignore stored flags) + # Rationale: Thresholds are experimental and OS-specific + # + # Session 61 Analysis (Sequoia vs Tahoe): + # - Sequoia: RAM free varies 10-27 GB, swap=0 + # - Tahoe: RAM free drops to 0-0.1 GB during load, recovers to ~24 GB between tests + # + # Steady-State Baseline (DeepHermes post-load relaxation): + # - Tahoe: ~24 GB free (1.2-1.4 min after first test) + # - Sequoia: ~40 GB free (similar pattern) + # + # Degraded Threshold: ram_free < 5 GB (extreme memory pressure) + # - Marks 0-0.1 GB minima as degraded ✅ + # - Normal tests (10-20 GB free) stay clean ✅ + flags = [] + if ram_gb < 5.0: # < 5 GB free = extreme memory pressure + flags.append("degraded_ram") + if zombies > 0: + flags.append("degraded_zombies") + if not flags: + flags.append("clean") + + quality_flags.append(flags) clean_count = sum(1 for flags in quality_flags if flags == ["clean"]) - degraded_swap = sum(1 for flags in quality_flags if "degraded_swap" in flags) + degraded_ram = sum(1 for flags in quality_flags if "degraded_ram" in flags) degraded_zombies = sum(1 for flags in quality_flags if "degraded_zombies" in flags) # Per-model statistics @@ -211,7 +238,7 @@ def calculate_statistics(data: List[dict]) -> Dict: }, "quality": { "clean": clean_count, - "degraded_swap": degraded_swap, + "degraded_ram": degraded_ram, "degraded_zombies": degraded_zombies, "clean_percent": 100 * clean_count / len(data) if data else 0, }, @@ -288,9 +315,9 @@ def generate_markdown(stats: Dict, input_file: Path, compare_file: Optional[Path quality_icon = "✅" if stats['quality']['clean_percent'] == 100 else "⚠️" md += f"{quality_icon} **System Health:** " if stats['quality']['clean_percent'] == 100: - md += "All tests clean (0 MB swap, 0 zombies)\n" + md += "All tests clean (RAM >5 GB free, 0 zombies)\n" else: - md += f"{stats['quality']['degraded_swap']} degraded (swap), {stats['quality']['degraded_zombies']} degraded (zombies)\n" + md += f"{stats['quality']['degraded_ram']} degraded (RAM <5 GB free), {stats['quality']['degraded_zombies']} degraded (zombies)\n" md += "\n---\n\n" @@ -315,9 +342,9 @@ Swap (MB): min={stats['swap']['min']}, max={stats['swap']['max']}, avg={ RAM free (GB): min={stats['ram']['min']:.1f}, max={stats['ram']['max']:.1f}, avg={stats['ram']['avg']:.1f} Zombies: min={stats['zombies']['min']}, max={stats['zombies']['max']} -Quality Flags: +Quality Flags (Thresholds: RAM <5 GB free, zombies >0): Clean: {stats['quality']['clean']}/{stats['total_tests']} ({stats['quality']['clean_percent']:.1f}%) - Degraded (swap): {stats['quality']['degraded_swap']} + Degraded (RAM): {stats['quality']['degraded_ram']} Degraded (zombies): {stats['quality']['degraded_zombies']} ``` diff --git a/benchmarks/tools/memmon.py b/benchmarks/tools/memmon.py index bf57c3e..a710c42 100644 --- a/benchmarks/tools/memmon.py +++ b/benchmarks/tools/memmon.py @@ -14,11 +14,15 @@ Usage: # Just monitor (no subprocess) python benchmarks/tools/memmon.py --duration 60 --output memory.jsonl +Platform: macOS + Apple Silicon (MLX requirement) +Dependencies: ZERO - uses native macOS tools (sysctl, vm_stat) + Future: Will be part of mlxk-benchmark kit. """ import argparse import json +import re import subprocess import sys import threading @@ -28,110 +32,84 @@ from pathlib import Path from typing import Optional +def parse_vm_stat_page_size(output: str) -> int: + """Extract vm_stat page size in bytes, falling back to 16384 (Apple Silicon default). + + Reuses proven logic from tests_2.0/conftest.py (ADR-013 Phase 0.5). + """ + match = re.search(r"page size of (\d+) bytes", output) + if match: + return int(match.group(1)) + return 16384 # Apple Silicon default + + def get_memory_sample() -> dict: - """Get current memory state using psutil.""" - try: - import psutil - import subprocess + """Get current memory state using native macOS tools. - # Get memory pressure from sysctl (macOS only) - # Values: 1=NORMAL (green), 2=WARN (yellow), 4=CRITICAL (red) - memory_pressure = 1 # Default to NORMAL - try: - result = subprocess.run( - ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"], - capture_output=True, text=True, timeout=1 - ) - memory_pressure = int(result.stdout.strip()) - except Exception: - pass + Platform: macOS only (MLX requirement) + Dependencies: ZERO - uses sysctl and vm_stat - vm = psutil.virtual_memory() - swap = psutil.swap_memory() - return { - "ram_free_gb": round(vm.available / 1e9, 2), - "ram_used_gb": round(vm.used / 1e9, 2), - "ram_percent": vm.percent, - "swap_used_mb": round(swap.used / 1e6, 1), - "swap_percent": swap.percent, - "memory_pressure": memory_pressure, - } - except ImportError: - # Fallback without psutil - return get_memory_sample_native() - - -def get_memory_sample_native() -> dict: - """Get memory state using native macOS commands (no psutil).""" - import subprocess + Reuses proven parsing logic from tests_2.0/conftest.py (_get_macos_system_health). + Previously used psutil.swap_memory() which was BROKEN (showed 0 MB during 65GB real usage). + """ + # Force C locale for consistent number formatting (avoid locale-specific decimal separators) + import os + env = os.environ.copy() + env["LC_ALL"] = "C" # Get memory pressure (1=NORMAL/green, 2=WARN/yellow, 4=CRITICAL/red) memory_pressure = 1 # Default to NORMAL try: result = subprocess.run( ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"], - capture_output=True, text=True, timeout=1 + capture_output=True, text=True, timeout=1, env=env ) memory_pressure = int(result.stdout.strip()) except Exception: pass - # Get swap usage + # Get swap usage via sysctl (proven working - same logic as conftest.py) swap_mb = 0 try: result = subprocess.run( - ["sysctl", "-n", "vm.swapusage"], - capture_output=True, text=True, timeout=1 + ["sysctl", "vm.swapusage"], + capture_output=True, text=True, timeout=1, env=env ) - # Parse: "total = 0.00M used = 0.00M free = 0.00M (encrypted)" - for part in result.stdout.split(): - if part.endswith("M") and "used" in result.stdout.split()[result.stdout.split().index(part)-2]: - swap_mb = float(part[:-1]) - break - # Simpler parsing - parts = result.stdout.replace("M", "").split() - for i, p in enumerate(parts): - if p == "used" and i + 2 < len(parts): - swap_mb = float(parts[i + 2]) - break + if result.returncode == 0: + # Parse: "vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)" + # LC_ALL=C ensures consistent dot decimal separator + parts = result.stdout.split("used = ") + if len(parts) > 1: + used_str = parts[1].split()[0] + # Parse size (can be M or G suffix) + if used_str.endswith("G"): + swap_mb = int(float(used_str[:-1]) * 1024) + elif used_str.endswith("M"): + swap_mb = int(float(used_str[:-1])) except Exception: pass - # Get RAM via vm_stat + # Get RAM via vm_stat (proven working - same logic as conftest.py) ram_free_gb = 0 try: result = subprocess.run( ["vm_stat"], - capture_output=True, text=True, timeout=1 + capture_output=True, text=True, timeout=1, env=env ) - # Parse page size and available pages - page_size = 16384 # Default for Apple Silicon - pages_free = 0 - pages_inactive = 0 - pages_purgeable = 0 - pages_speculative = 0 - - for line in result.stdout.splitlines(): - if "page size of" in line: - page_size = int(line.split()[-2]) - elif "Pages free:" in line: - pages_free = int(line.split()[-1].rstrip(".")) - elif "Pages inactive:" in line: - pages_inactive = int(line.split()[-1].rstrip(".")) - elif "Pages purgeable:" in line: - pages_purgeable = int(line.split()[-1].rstrip(".")) - elif "Pages speculative:" in line: - pages_speculative = int(line.split()[-1].rstrip(".")) - - # Total available = free + inactive + purgeable + speculative - total_available_pages = pages_free + pages_inactive + pages_purgeable + pages_speculative - ram_free_gb = round((total_available_pages * page_size) / 1e9, 2) + if result.returncode == 0: + page_size = parse_vm_stat_page_size(result.stdout) + # Parse "Pages free: 12345." + for line in result.stdout.splitlines(): + if "Pages free:" in line: + pages_free = int(line.split(":")[1].strip().rstrip(".")) + ram_free_gb = round(pages_free * page_size / (1024**3), 2) + break except Exception: pass return { "ram_free_gb": ram_free_gb, - "ram_used_gb": 0, # Not available without psutil + "ram_used_gb": 0, # Not available from vm_stat alone "ram_percent": 0, "swap_used_mb": swap_mb, "swap_percent": 0, diff --git a/docs/ADR/ADR-018-Convert-Operation.md b/docs/ADR/ADR-018-Convert-Operation.md new file mode 100644 index 0000000..fcc3323 --- /dev/null +++ b/docs/ADR/ADR-018-Convert-Operation.md @@ -0,0 +1,501 @@ +# ADR-018: Convert Operation + +**Status:** Partially Implemented +**Created:** 2025-12-18 +**Updated:** 2025-12-30 (Phase 0a+1 complete for 2.0.4-beta.5, Phase 0b+0c planned for beta.6) +**Context:** Users need to (a) quantize MLX workspaces locally without polluting the HF cache and (b) repair MLX/HF compliance issues (notably safetensors index/shard mismatches) in a deterministic way. + +**Phase Status:** +- **Phase 0a:** Workspace infrastructure — ✅ Implemented (2.0.4-beta.5) +- **Phase 0b:** Resumable clone — 🚧 Planned (2.0.4-beta.6) +- **Phase 0c:** Workspace run/show/server support — 🚧 Planned (2.0.4-beta.6) +- **Phase 1:** `--repair-index` — ✅ Implemented (2.0.4-beta.5) + +**Note:** Phase 0b+0c complete the workspace infrastructure before 2.0.4 stable release. This enables full `clone → convert → run` workflow with resume support and no HF push requirement. + +--- + +## Problem + +1. Pre-quantized models (notably VLM) may ship with a broken `model.safetensors.index.json` (index ↔ shard mismatch), e.g. due to the mlx-vlm overwrite regression (issue #624, fixed by PR #638; stable release pending). +2. Users want custom quantization (specific bit depths, mixed recipes). +3. Quantizing “in cache” pollutes operational runtime models used by `mlxk serve`. +4. Strict validators (mlx-knife health) block models that are “lenient-loader runnable” but formally inconsistent. + +--- + +## Decision + +### Convert is a Workspace-to-Workspace Transform + +`mlxk convert` operates on **existing MLX workspaces** (or 3rd-party model directories treated as *unmanaged workspaces*) and writes results to a **target workspace**. + +**Core modes (one per invocation):** +- `--quantize `: produce new quantized weights in target workspace and **always** write a correct `model.safetensors.index.json` for the output. +- `--repair-index`: **fix packaging only** (rebuild `model.safetensors.index.json` from existing shards) without changing weights / quantization. +- `--dequantize`: produce dequantized weights (future / optional). + +### Good citizenship rule + +mlx-knife must be able to **detect, explain, and repair** common MLX/HF compliance breakage where possible. +This does **not** imply silently mutating the HF cache; repair happens in a workspace. + +### Cache sanctity & CoW constraint (explicit) + +- The HF cache is the **production base** for inference and must be treated as **read-only** by `convert`. +- `convert` must **never** write into the HF cache, even if the workspace is on the same volume. +- Workspaces may live on the **same filesystem/volume** as the cache to benefit from CoW (`cp -c`), but they remain a distinct namespace. +- Safety rule: if `` or `` resolves inside the cache root, `convert` must fail with a hard error. + +### Managed vs unmanaged workspaces (sentinel contract) + +- A **managed workspace** contains a workspace sentinel/metadata file (e.g. `.mlxk_workspace.json`). +- `mlxk clone` and `mlxk convert` MUST produce **managed** workspaces (sentinel always written). +- 3rd-party directories may be used as a **source** without a sentinel (treated as *unmanaged*), but: + - they are treated read-only, + - `convert` always produces a managed target workspace. +- `mlxk health ` MUST support both managed and unmanaged directories, but reports `managed=false` when the sentinel is missing. + +### Atomic target initialization rule + +For any `mlxk convert SRC DST ...` operation: +- DST is created (or validated empty) and the **workspace sentinel is written first** (atomic write + rename) **before** any other processing (copy/CoW, repair, quantize). +- If sentinel creation fails, the operation aborts without modifying SRC or cache. + +--- + +## Non-Goals + +- Converting arbitrary PyTorch/Transformers checkpoints to MLX format. +- Implementing architecture-specific conversion pipelines beyond what mlx-lm / mlx-vlm provide. +- Auto-modifying downloaded artifacts in-place (cache) without explicit user action. + +--- + +## CLI + +```bash +mlxk convert [MODE] [options] +``` + +**Modes (exactly one required):** +- `--quantize ` Quantize to N bits (2, 3, 4, 6, 8) +- `--repair-index` Rebuild `model.safetensors.index.json` from existing `.safetensors` shards +- `--dequantize` Dequantize weights (optional / later) + +**Options:** +- `--q-group-size N` Group size (default: 64) +- `--mixed-recipe R` Mixed quantization recipe (future phase) +- `--dtype TYPE` Output dtype (float16, bfloat16) (where applicable) +- `--skip-health` Skip health check on output (debug/internal) +- *(compat alias, optional)* `--q-bits N` as alias for `--quantize N` (if desired) + +**Mode constraints:** +- `--quantize ` and `--repair-index` are **not** combined in one call. + Rationale: `--quantize` already guarantees a correct output index by design; `--repair-index` exists for “no quant change, just fix metadata”. + +--- + +## Workflows + +### A) Repair broken index without changing quantization + +```bash +# Clone a pre-quantized model to workspace (even if unhealthy) +mlxk clone mlx-community/DeepSeek-OCR-4bit ./ws-ocr + +# Repair index/shard metadata only (no weight changes) +mlxk convert ./ws-ocr ./ws-ocr-fixed --repair-index + +# Now it should validate and be runnable under strict tooling +mlxk health ./ws-ocr-fixed +``` + +### B) Quantize from a clean source workspace (bf16/fp16) to multiple outputs + +```bash +# Clone bf16 (or fp16) source to workspace +mlxk clone mistralai/Mistral-Small-3.1-24B ./ws-bf16 + +# Quantize to 4-bit workspace +mlxk convert ./ws-bf16 ./ws-4bit --quantize 4 + +# Optional: another quantization from the same source +mlxk convert ./ws-bf16 ./ws-6bit --quantize 6 + +# Upload result (still behind ALPHA gate if desired) +mlxk push ./ws-4bit myorg/Mistral-Small-4bit + +# Cleanup source workspace +rm -rf ./ws-bf16 +``` + +### C) Adopt a 3rd-party directory as a managed workspace (without changing weights) + +```bash +# Treat 3rd-party dir as unmanaged source; produce a managed target workspace +mlxk convert ./third-party-model ./ws-managed --repair-index + +mlxk health ./ws-managed +``` + +--- + +## Operation Matrix + +| Command | Source | Target | Transform | +|------------|-------------|-------------|----------| +| `pull` | HF | cache | none | +| `clone` | HF | workspace | none (CoW) | +| `convert` | workspace | workspace | `quantize` / `repair-index` / `dequantize` | +| `push` | workspace | HF | none | + +--- + +## Design Rationale + +### Why workspace → workspace (not cache → workspace)? +- Cache stays clean for operational runtime models (`mlxk serve`). +- Workspace is the explicit “working area” for transformations. +- Aligns with philosophy: clone creates a workspace to “do something” with it. + +### Why convert as a single command with modes? +- Keeps command surface small (Unix-friendly). +- Clear semantics: “transform workspace” with explicit mode flags. + +### Why `--repair-index` exists separately? +- Many issues are **metadata-only** (index mismatch) and should be fixable without re-quantization. +- Enables strict tooling to work with “lenient-runnable” repos by making them formally consistent. + +### Why enforce "sentinel first"? +- Ensures every `convert` result is safely identifiable as a mlxk-managed workspace. +- Supports safe cleanup and stricter gating (e.g. `push`) without relying on conventions. + +### Why split Phase 0 into 0a and 0b? + +**Rationale:** +- Phase 0a (workspace infrastructure) is **prerequisite** for convert operation +- Phase 0b (resumable clone) is **user convenience**, not blocking +- Splitting allows 2.0.4 to ship community repair tool without waiting for resume feature +- Clean dependency chain: 0a → Phase 1 (repair-index) → 2.0.4 stable +- Phase 0b can mature in 2.0.5-beta with more testing/validation + +**Timeline:** +- 2.0.4 stable blocked by mlx-vlm 0.3.10 PyPI release anyway +- Use waiting time to implement 0a + Phase 1 (community value) +- Phase 0b deferred to 2.0.5-beta (larger scope, more testing needed) + +--- + +## Implementation (POC-first) + +### Phase 0a: Workspace Infrastructure (2.0.4 stable) ✅ + +**Foundation for all workspace operations (clone, convert, future push).** + +#### 1. Workspace Sentinel Primitives + +**File:** `mlxk2/operations/workspace.py` (NEW) + +Functions: +- `write_workspace_sentinel(path, metadata)` - Atomic write with rename +- `is_managed_workspace(path) -> bool` - Check for valid sentinel +- `read_workspace_metadata(path) -> dict` - Read sentinel metadata + +Sentinel format (`.mlxk_workspace.json`): +```json +{ + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "source_repo": "mlx-community/Llama-3.2-3B", + "source_revision": "abc123def456", + "managed": true, + "operation": "clone" // or "convert" +} +``` + +#### 2. Workspace Health Check Extension + +**File:** `mlxk2/operations/health.py` (MODIFY) + +New function: +- `health_check_workspace(path) -> (bool, str, bool)` - Returns (healthy, reason, is_managed) + +Modified function: +- `health_from_cache(model_spec, cache_dir)` - Now detects workspace vs cache structure + +Logic: +- If path has `.mlxk_workspace.json` → treat as workspace, read metadata +- Otherwise → treat as cache, use existing logic +- Backward compatible: unmanaged workspaces still work, just flagged as unmanaged + +#### 3. Clone Integration + +**File:** `mlxk2/operations/clone.py` (MODIFY) + +Changes: +- Write workspace sentinel after APFS clone (before declaring success) +- Uses `health_check_workspace()` for final validation (optional enhancement) + +Benefits: +- All cloned workspaces are now managed (traceable) +- Foundation for convert, push operations +- Health checks work with workspace paths + +--- + +### Phase 0b: Resumable Clone (Deferred to 2.0.5-beta) + +**Not critical for 2.0.4 community repair workflow.** + +See PLAN-resumable-pull-clone.md for detailed design. + +--- + +### Phase 0c: Workspace Run/Show/Server Support (2.0.4-beta.6) 🚧 + +**Goal:** Complete local workflow without HF push requirement. + +**Problem:** +Current workflow incomplete for local testing: +```bash +mlxk clone model ./ws +mlxk convert ./ws ./ws-fixed --repair-index +mlxk health ./ws-fixed # ✅ Works (Session 59) +mlxk run ./ws-fixed "test" # ❌ Fails (needs HF model ID) +mlxk show ./ws-fixed # ❌ Fails (needs HF model ID) +``` + +Workaround requires HF push: +```bash +mlxk push ./ws-fixed user/model-fixed # Phase 2 (not implemented) +mlxk run user/model-fixed "test" # Now works +``` + +**Solution:** +Extend `run`, `show`, `server` to accept workspace paths (like `health` already does). + +**Design Principles:** +1. **Central implementation** - No operation-specific hacks +2. **Consistent behavior** - All operations use same resolution logic +3. **Zero breaking changes** - HF model IDs still work +4. **OpenAI compatible** - Server remains spec-compliant + +**Implementation:** + +1. **Core Layer** - `resolve_model_for_operation()` (model_resolution.py): + ```python + def resolve_model_for_operation(model_spec: str) -> Tuple[str, Optional[str], Optional[List[str]]]: + """Resolve model specification to (name, commit_hash, ambiguous_matches). + + New: Workspace path detection (analog to health.py:502-504) + """ + # NEW: Check if model_spec is workspace path + workspace_path = Path(model_spec) + if workspace_path.exists() and (workspace_path / "config.json").exists(): + # Return absolute path as resolved_name, skip cache logic + return (str(workspace_path.resolve()), None, None) + + # Existing cache resolution logic... + ``` + +2. **Runner Layer** - `MLXRunner` + `VisionRunner` (__init__.py, vision_runner.py): + ```python + def load_model(self): + resolved_name, commit_hash, ambiguous = resolve_model_for_operation(self.model_spec) + + # NEW: Path vs cache detection + if Path(resolved_name).exists(): + # Workspace path - use directly + model_path = Path(resolved_name) + else: + # Cache model - existing logic + model_cache_dir = cache_root / hf_to_cache_dir(resolved_name) + # ... existing snapshot resolution ... + + # Load from model_path (works for both) + ``` + +3. **Operation Layer** - `show_model_operation()` (show.py): + ```python + def show_model_operation(model_pattern: str, ...): + resolved_name, commit_hash, ambiguous = resolve_model_for_operation(model_pattern) + + # NEW: Direct path handling + if Path(resolved_name).exists(): + model_path = Path(resolved_name) + # build_model_object works on any path + model_obj = build_model_object(resolved_name, model_path.parent, model_path) + else: + # Existing cache logic... + ``` + +4. **Server Layer** - `/v1/models` endpoint (server_base.py): + ```python + @app.get("/v1/models") + async def list_models(): + model_list = [] + + # Existing: Scan cache for models... + + # NEW: Add preloaded workspace if present + global _preload_model + if _preload_model and Path(_preload_model).exists(): + model_list.append(ModelInfo( + id=_preload_model, # Original path string + object="model", + owned_by="workspace", # ✅ Distinguishable! + context_length=None + )) + + # Sort: preloaded first (cache or workspace), then alphabetical + ``` + +**Affected Operations:** +- ✅ `mlxk run ./workspace "prompt"` - Direct testing +- ✅ `mlxk show ./workspace` - Metadata inspection +- ✅ `mlxk server --model ./workspace` - Local dev/testing +- ✅ `mlxk health ./workspace` - Already works (Session 59) + +**OpenAI API Compatibility:** +- `/v1/models` response includes workspace with `"owned_by": "workspace"` +- Clients use the model ID from `/v1/models` (standard flow) +- Remote clients would see local path - acceptable for local dev servers + +**JSON API Schema:** +- ✅ No changes required (`model` field already accepts strings) +- ✅ Backward compatible (cache model IDs still work) + +**Files Modified:** +- `mlxk2/core/model_resolution.py` (~20 LOC) +- `mlxk2/core/runner/__init__.py` (~30 LOC) +- `mlxk2/core/vision_runner.py` (~20 LOC) +- `mlxk2/operations/show.py` (~15 LOC) +- `mlxk2/core/server_base.py` (~10 LOC) +- Tests: +10-12 tests (resolve: 3, MLXRunner: 3, show: 2, run E2E: 2, server: 2) + +**Effort:** ~1.5-2 sessions (~85 LOC, 10-12 tests) + +**Benefits:** +1. Complete local workflow: `clone → convert → run` (no HF push) +2. Faster iteration (test fixes immediately) +3. Privacy (no upload for local testing) +4. Consistent UX (all operations accept paths) + +**Limitations:** +- Server `--model ./workspace` only for local development +- Remote clients cannot resolve local file paths +- Production: Push workspace to HF first (`mlxk push` - Phase 2) + +**Documentation Note:** +> **Workspace paths:** Supported in `run`, `show`, `server` for local development/testing. For production servers, push workspace to HuggingFace first - remote clients cannot access local file paths. + +--- + +### Phase 1: repair-index (2.0.4 stable) ✅ + +**Community repair tool for mlx-vlm #624 affected models.** + +#### Dependencies +- Requires Phase 0a complete (workspace infrastructure) +- Uses safetensors library for header parsing +- Cache sanctity validation via workspace primitives + +#### Community Impact +Affected models can be repaired without reconversion: +- Qwen2.5-VL-7B-Instruct-4bit +- gemma-3-27b-it-4bit +- Mistral-Small-3.1-24B-Instruct-2503-4bit +- DeepSeek-OCR-4bit +- Devstral-Small-2-24B-Instruct-2512-6bit +- (7+ models total, see mlx-vlm issue #624) + +Workflow: +```bash +mlxk clone mlx-community/ ./ws +mlxk convert ./ws ./ws-fixed --repair-index +mlxk health ./ws-fixed # Should be healthy +# Optional: push back if maintainer +``` + +--- + +### Common prerequisites (Convert Operation) +1. Validate source path exists and contains model files (workspace-like structure). +2. Resolve real paths for source/target and **hard-block cache paths** for both. +3. Ensure target workspace path is new/empty (or explicitly allowed by policy). +4. Create target directory and **write workspace sentinel first** (atomic write + rename). +5. Copy required non-weight assets to target (config/tokenizer/etc.), using CoW where possible (`cp -c`). +6. Run the selected mode. +7. Run health check on output (unless `--skip-health`). + +### Primitive: `rebuild_safetensors_index()` +- Scan all `*.safetensors` shard files in the workspace. +- Read safetensors headers to enumerate tensor keys (no full tensor load). +- Build `weight_map: key -> shard_filename`. +- Write `model.safetensors.index.json` deterministically. + +### Mode: `--repair-index` +1. (After copy) run `rebuild_safetensors_index()` in target. +2. Run health check on output (unless `--skip-health`). + +### Mode: `--quantize ` +1. Load model from source workspace (text: mlx-lm; vision: mlx-vlm later). +2. Quantize into target workspace. +3. Ensure output index is correct by: + - either using the upstream “save weights” output **and verifying**, or + - always running `rebuild_safetensors_index()` as the final step (preferred for robustness). +4. Run health check on output (unless `--skip-health`). + +**Dependencies:** +- Phase 0/1 (POC): safetensors index rebuild primitive + mlx-lm (text quantize) +- Phase 3 (later): mlx-vlm (vision) once stable release includes the upstream fix + +--- + +## Status / Phases + +- [x] **Phase 0a (2.0.4-beta.5):** ✅ Workspace infrastructure foundation + - Workspace sentinel (`.mlxk_workspace.json`) - atomic write, managed/unmanaged detection + - Health checks support workspace directories + - Clone produces managed workspaces + - **Files:** `mlxk2/operations/workspace.py` (NEW), `health.py` (extended), `clone.py` (integrated) + - **Tests:** 20 new tests, all passing + +- [ ] **Phase 0b (2.0.4-beta.6):** 🚧 Resumable clone + - Temp cache reuse with user prompt (analog to resumable pull) + - Conditional cleanup based on workspace health + - UX parity with pull operation + - `--force-resume` flag for non-interactive use + - **Effort:** ~1 session + +- [ ] **Phase 0c (2.0.4-beta.6):** 🚧 Workspace run/show/server support + - Direct workspace execution: `mlxk run ./workspace "prompt"` + - Workspace inspection: `mlxk show ./workspace` + - Local dev server: `mlxk server --model ./workspace` + - Central implementation in `resolve_model_for_operation()` + runners + - Server: `/v1/models` shows workspace with `"owned_by": "workspace"` + - **Files:** `model_resolution.py`, `runner/__init__.py`, `vision_runner.py`, `show.py`, `server_base.py` (~85 LOC) + - **Tests:** +10-12 tests + - **Effort:** ~1 session + - **Benefit:** Complete local workflow without HF push + +- [x] **Phase 1 (2.0.4-beta.5):** ✅ `--repair-index` for safetensors index/shard mismatch + - `rebuild_safetensors_index()` primitive + - `mlxk convert --repair-index` command + - Cache sanctity enforcement (hard block) + - Fixes mlx-vlm #624 affected models (7+ models) + - **Files:** `mlxk2/operations/convert.py` (NEW), `cli.py` (convert subparser), `output/human.py` (render_convert) + - **Tests:** 11 new tests, all passing + +- [ ] **Phase 2 (future):** `--quantize ` for text models (mlx-lm) +- [ ] **Phase 3 (future):** Mixed recipes / advanced quant options +- [ ] **Phase 4 (future):** Vision model support (mlx-vlm) once stable and dependency policy allows + +--- + +## References + +- mlx-vlm issue #624 (index overwrite regression) +- mlx-vlm PR #638 (fix) +- ADR-007: Clone Implementation (workspace concept) \ No newline at end of file diff --git a/mlxk2/__init__.py b/mlxk2/__init__.py index 604a977..b04b84c 100644 --- a/mlxk2/__init__.py +++ b/mlxk2/__init__.py @@ -7,4 +7,4 @@ import warnings # Issue parity with 1.1.0 (Issue #22) warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+') -__version__ = "2.0.4b4" +__version__ = "2.0.4b5" diff --git a/mlxk2/cli.py b/mlxk2/cli.py index c07ade8..1fb7fe7 100644 --- a/mlxk2/cli.py +++ b/mlxk2/cli.py @@ -167,6 +167,7 @@ def main(): pull_parser = subparsers.add_parser("pull", help="Download a model") pull_parser.add_argument("model", help="Model name to download") pull_parser.add_argument("--json", action="store_true", help="Output in JSON format") + pull_parser.add_argument("--force-resume", action="store_true", help="Force resume of partial downloads without prompting") # Clone command (alpha) - only show if alpha features enabled if os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"): @@ -177,7 +178,28 @@ def main(): clone_parser.add_argument("--no-health-check", action="store_true", help="Skip health validation before copy") clone_parser.add_argument("--quiet", action="store_true", help="Suppress progress output") clone_parser.add_argument("--json", action="store_true", help="Output in JSON format") - + clone_parser.add_argument("--force-resume", action="store_true", help="Force resume of partial downloads without prompting") + + # Convert command (ADR-018 Phase 1) + convert_parser = subparsers.add_parser( + "convert", + help="Convert workspace to workspace with transformations", + description="Transform model workspaces (repair-index, quantize, etc.)" + ) + convert_parser.add_argument("source", help="Source workspace path") + convert_parser.add_argument("target", help="Target workspace path") + convert_parser.add_argument( + "--repair-index", + action="store_true", + help="Rebuild model.safetensors.index.json from shards (fixes mlx-vlm #624)" + ) + convert_parser.add_argument( + "--skip-health", + action="store_true", + help="Skip health check on output (debug only)" + ) + convert_parser.add_argument("--json", action="store_true", help="Output in JSON format") + # Remove command rm_parser = subparsers.add_parser("rm", help="Delete a model") rm_parser.add_argument("model", help="Model name to delete") @@ -297,6 +319,43 @@ def main(): print_result(result, render_show, args.json) elif args.command == "pull": result = pull_operation(args.model) + + # Handle resume confirmation + if result.get("data", {}).get("download_status") == "requires_confirmation": + # JSON mode: Return as-is, let caller decide + if args.json: + print_result(result, render_pull, True) + sys.exit(0) + + # --force-resume flag: Resume immediately (works in both interactive and non-interactive) + if getattr(args, "force_resume", False): + result = pull_operation(args.model, force_resume=True) + # Non-interactive without --force-resume: Fail with clear error + elif not sys.stdin.isatty(): + result["status"] = "error" + result["error"] = { + "type": "requires_confirmation", + "message": result["data"]["message"] + } + print_result(result, None, False) + sys.exit(1) + # Interactive: Prompt user + else: + model_name = result["data"]["model"] + message = result["data"]["message"] + + print(f"Model '{model_name}' has partial download:", file=sys.stderr) + print(f" {message}", file=sys.stderr) + + response = input("Resume download? [Y/n]: ").strip().lower() + if response not in ("", "y", "yes"): + print("Download cancelled. Partial download kept.", file=sys.stderr) + print(f"Use 'mlxk rm {model_name}' to delete if needed.", file=sys.stderr) + sys.exit(0) + + # User confirmed - retry pull with force_resume + result = pull_operation(args.model, force_resume=True) + print_result(result, render_pull, args.json) elif args.command == "clone": # Check if alpha features are enabled (should not reach here if not, but double-check) @@ -319,6 +378,26 @@ def main(): ) print_result(result, render_clone, args.json, quiet=getattr(args, "quiet", False)) + elif args.command == "convert": + from .operations.convert import convert_operation + + # Validate mode flags + if args.repair_index: + mode = "repair-index" + else: + print("Error: Must specify conversion mode (--repair-index)", file=sys.stderr) + sys.exit(1) + + result = convert_operation( + args.source, + args.target, + mode=mode, + skip_health=args.skip_health + ) + + # Import render_convert from output.human + from .output.human import render_convert + print_result(result, render_convert, args.json) elif args.command == "rm": result = rm_operation(args.model, args.force) print_result(result, render_rm, args.json) diff --git a/mlxk2/core/runner/__init__.py b/mlxk2/core/runner/__init__.py index 390cb88..afff00d 100644 --- a/mlxk2/core/runner/__init__.py +++ b/mlxk2/core/runner/__init__.py @@ -99,6 +99,25 @@ class MLXRunner: """Handle Ctrl-C interruption during generation.""" self._interrupted = True + def _decode_tokens(self, token_ids): + """Decode token IDs using the streaming detokenizer. + + This properly converts BPE space markers (Ġ U+0120) to spaces (U+0020) + while tokenizer.decode() for some tokenizers does NOT. + + Args: + token_ids: List of token IDs to decode + + Returns: + Decoded string + """ + detok = self.tokenizer.detokenizer + detok.reset() + for token_id in token_ids: + detok.add_token(token_id) + detok.finalize() + return detok.text + def request_interrupt(self) -> None: """Request an interruption from external controller (e.g., server signal). @@ -199,6 +218,9 @@ class MLXRunner: print(f"Model loaded in {load_time:.1f}s") print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total") + # Apply Mistral regex fix if needed (workaround for mlx-community models with broken tokenizers) + self._apply_mistral_regex_fix(model_path) + # Extract stop tokens and other properties self._extract_stop_tokens() self._context_length = get_model_context_length(str(model_path)) @@ -239,6 +261,98 @@ class MLXRunner: if self.verbose and self._is_reasoning_model: print("Reasoning model detected - special handling enabled") + def _apply_mistral_regex_fix(self, model_path): + """Apply Mistral tokenizer regex fix for models with broken tokenizers. + + Problem: Some mlx-community models were converted with transformers 4.39-4.57.2, + which had an incorrect regex pattern for Mistral tokenizers. This causes: + 1. Incorrect encoding (user prompts tokenized incorrectly, merged words) + 2. Incorrect decoding (BPE space markers Ġ (U+0120) not converted to spaces) + 3. Context window waste (broken tokenizer uses ~15% more tokens) + + Affected models: + - mlx-community/DeepHermes-3-Mistral-24B-Preview-8bit (transformers 4.46.3) + - mlx-community/Mistral-Small-3.2-24B-Instruct-2506-4bit (transformers 4.52.4) + - mlx-community/DeepSeek-R1-Distill-Llama-8B-4bit (transformers 4.43.0) + + Solution: Apply the same regex pattern fix that transformers 4.57.3+ uses. + + See: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84 + """ + try: + import json + from packaging import version + + # Read config to check if fix is needed + config_path = model_path / "config.json" + if not config_path.exists(): + return # No config, can't determine if fix needed + + with open(config_path) as f: + config = json.load(f) + + model_type = config.get("model_type") + transformers_version = config.get("transformers_version") + + # Only apply fix to Mistral-family models converted with affected transformers versions + if model_type not in ["mistral", "mistral3", "llama"]: + return + + if not transformers_version: + return # Can't determine version, skip + + # Parse version and check if in bug window + try: + tv = version.parse(transformers_version) + # Bug exists in 4.39.0 through 4.57.2 + if tv < version.parse("4.39.0") or tv > version.parse("4.57.2"): + return # Not in bug window + except Exception: + return # Can't parse version, skip + + # Model needs fix - apply the patch + try: + import tokenizers + + split_pretokenizer = tokenizers.pre_tokenizers.Split( + pattern=tokenizers.Regex( + r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+" + ), + behavior="isolated", + ) + + # Access the underlying HF tokenizer + backend_tokenizer = self.tokenizer._tokenizer.backend_tokenizer + current_pretokenizer = backend_tokenizer.pre_tokenizer + + if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Sequence): + # Replace the first element (the Split pattern) + backend_tokenizer.pre_tokenizer[0] = split_pretokenizer + else: + # Not a Sequence, create one with Split + current pretokenizer + if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Metaspace): + current_pretokenizer = tokenizers.pre_tokenizers.ByteLevel( + add_prefix_space=False, use_regex=False + ) + backend_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence( + [split_pretokenizer, current_pretokenizer] + ) + + # Mark as fixed + setattr(self.tokenizer._tokenizer, 'fix_mistral_regex', True) + + if self.verbose: + print(f"Applied Mistral tokenizer regex fix (transformers {transformers_version})") + + except Exception as e: + # Patching failed, but don't break model loading + if self.verbose: + print(f"Warning: Could not apply Mistral tokenizer fix: {e}") + + except Exception: + # Silently skip if anything goes wrong (don't break model loading) + pass + def cleanup(self): """Clean up model resources and clear GPU memory.""" mx_core = self._mx @@ -409,7 +523,7 @@ class MLXRunner: # Use sliding window for proper decoding start_idx = max(0, len(generated_tokens) - context_window) window_tokens = generated_tokens[start_idx:] - window_text = self.tokenizer.decode(window_tokens) + window_text = self._decode_tokens(window_tokens) # Extract new text if start_idx == 0: @@ -421,13 +535,13 @@ class MLXRunner: new_text = window_text previous_decoded = window_text else: - new_text = self.tokenizer.decode(window_tokens) + new_text = self._decode_tokens(window_tokens) if len(window_tokens) > 1: - prefix = self.tokenizer.decode(window_tokens[:-1]) + prefix = self._decode_tokens(window_tokens[:-1]) if new_text.startswith(prefix): new_text = new_text[len(prefix):] else: - new_text = self.tokenizer.decode([token_id]) + new_text = self._decode_tokens([token_id]) if new_text: accumulated_response += new_text @@ -603,8 +717,10 @@ class MLXRunner: if token_id in self.tokenizer.eos_token_ids: break - # Decode full response - full_response = self.tokenizer.decode(all_tokens) + # Decode full response using the streaming detokenizer + # This properly converts BPE space markers (Ġ U+0120) to spaces (U+0020) + # while tokenizer.decode() for slow tokenizers (LlamaTokenizer) does NOT. + full_response = self._decode_tokens(all_tokens) # Debug: Show raw generated tokens for quality analysis (enabled via --verbose) if self.verbose: @@ -615,7 +731,8 @@ class MLXRunner: last_3_decoded = [] for tid in last_3_ids: try: - decoded = self.tokenizer.decode([tid]) + # Use detokenizer for debug output too + decoded = self._decode_tokens([tid]) last_3_decoded.append(f"{tid}={decoded!r}") except Exception: last_3_decoded.append(f"{tid}=") @@ -630,7 +747,8 @@ class MLXRunner: if isinstance(full_response, str) and isinstance(formatted_prompt, str) and full_response.startswith(formatted_prompt): response = full_response[len(formatted_prompt):] else: - decoded = self.tokenizer.decode(generated_tokens) + # Decode generated tokens only (use detokenizer) + decoded = self._decode_tokens(generated_tokens) response = decoded if isinstance(decoded, str) else str(decoded) # Filter stop tokens (strings only) diff --git a/mlxk2/operations/clone.py b/mlxk2/operations/clone.py index f7b1c79..d1b2d98 100644 --- a/mlxk2/operations/clone.py +++ b/mlxk2/operations/clone.py @@ -23,6 +23,7 @@ from pathlib import Path from typing import Optional, Dict, Any from .pull import pull_to_cache +from .workspace import write_workspace_sentinel from ..core.cache import hf_to_cache_dir, get_current_cache_root logger = logging.getLogger(__name__) @@ -122,6 +123,12 @@ def clone_operation(model_spec: str, target_dir: str, health_check: bool = True) result["data"]["clone_status"] = "pull_failed" return result + # Phase 3b: Mark download as complete (ADR-018 Phase 0b prep) + # This marker distinguishes "download incomplete" from "model unhealthy" + # Critical for resumable clone: know when to skip re-download + download_marker = temp_cache / ".mlxk2_download_complete" + download_marker.write_text(f"completed_{int(time.time())}\n") + # Extract resolved model name from pull result resolved_model = pull_result["data"]["model"] result["data"]["model"] = resolved_model @@ -138,19 +145,16 @@ def clone_operation(model_spec: str, target_dir: str, health_check: bool = True) return result # Phase 5: Optional health check on temp cache + # Health check is informational only - all HF models must be clonable + # This allows users to repair broken models (e.g., mlx-vlm #624 affected models) if health_check: result["data"]["clone_status"] = "health_checking" - # Use health_from_cache for proper isolation from .health import health_from_cache healthy, health_message = health_from_cache(model_spec, temp_cache) - if not healthy: - result["status"] = "error" - result["error"] = { - "type": "ModelUnhealthyError", - "message": f"Model failed health check: {health_message}" - } - result["data"]["clone_status"] = "health_check_failed" - return result + # Store health status but continue regardless + result["data"]["health"] = "healthy" if healthy else "unhealthy" + result["data"]["health_reason"] = health_message + logger.info(f"Health check: {'healthy' if healthy else 'unhealthy'} - {health_message}") # Phase 6: APFS clone temp cache → workspace (instant, CoW) result["data"]["clone_status"] = "cloning" @@ -166,12 +170,41 @@ def clone_operation(model_spec: str, target_dir: str, health_check: bool = True) result["data"]["clone_status"] = "filesystem_error" return result + # Phase 6b: Write workspace sentinel (ADR-018 Phase 0a) + # Sentinel written AFTER clone success, BEFORE declaring operation complete + from datetime import datetime + + # Extract commit hash if available from pull result + commit_hash = pull_result["data"].get("commit_hash") + + metadata = { + "mlxk_version": "2.0.4", # TODO: Read from __version__ + "created_at": datetime.utcnow().isoformat() + "Z", + "source_repo": resolved_model, + "source_revision": commit_hash, + "managed": True, + "operation": "clone" + } + + try: + write_workspace_sentinel(target_path, metadata) + logger.debug(f"Wrote workspace sentinel to {target_path}") + except Exception as e: + # Sentinel write failure is non-fatal - workspace is still usable + # Log warning but don't fail the entire clone operation + logger.warning(f"Failed to write workspace sentinel: {e}") + # Workspace is unmanaged but functional + # Success - temp cache auto-cleanup via finally block result["data"]["clone_status"] = "success" result["data"]["message"] = f"Cloned to {target_dir}" finally: # Phase 7: Cleanup temp cache (always) - with safety check + # TODO (ADR-018 Phase 0b): Make cleanup conditional based on: + # - .mlxk2_download_complete marker exists AND + # - health status (healthy → cleanup, unhealthy → keep for debugging/repair) + # - user choice (prompt if unhealthy: "Keep temp cache for inspection?") if temp_cache and temp_cache.exists(): _cleanup_temp_cache_safe(temp_cache) @@ -245,11 +278,20 @@ def _is_apfs_filesystem(path: Path) -> bool: def _create_temp_cache_same_volume(target_workspace: Path) -> Path: - """Create temp cache on same APFS volume as target for CoW optimization.""" + """Create temp cache on same APFS volume as target for CoW optimization. + + TODO (ADR-018 Phase 0b - Resumable Clone): + Change naming to deterministic for resume support: + OLD: f".mlxk2_temp_{os.getpid()}_{random.randint(...)}" # ephemeral + NEW: f".mlxk2_temp_{hash(model_spec + target_workspace)}" # deterministic + + This allows detection of existing partial downloads for resume prompts. + """ # Get target volume mount point via st_dev target_volume = _get_volume_mount_point(target_workspace) # Create temp cache on same volume + # NOTE: PID-based (ephemeral) - will become deterministic in Phase 0b temp_cache = target_volume / f".mlxk2_temp_{os.getpid()}_{random.randint(1000,9999)}" temp_cache.mkdir(parents=True) diff --git a/mlxk2/operations/convert.py b/mlxk2/operations/convert.py new file mode 100644 index 0000000..fba516e --- /dev/null +++ b/mlxk2/operations/convert.py @@ -0,0 +1,355 @@ +"""Convert operation for MLX Knife 2.0 (ADR-018 Phase 1). + +Workspace-to-workspace transformations: +- repair-index: Rebuild safetensors index from existing shards (fixes mlx-vlm #624) +- quantize: (Phase 2, future) +- dequantize: (Phase 3, future) + +ADR-018 Core Rules: +1. Convert operates on workspaces, NEVER on HF cache (hard block) +2. Target workspace gets sentinel written FIRST (atomic) +3. Source can be managed or unmanaged +4. Output is always managed workspace +5. Health check runs on output (unless --skip-health) + +Philosophy: +- HF cache is "holy mirror" of remote repos (read-only for convert) +- Workspaces are "working area" for transformations +- User workflow: HF Remote → Clone → Edit/Convert → Push to HF Remote +""" + +import json +import logging +import subprocess +from datetime import datetime +from pathlib import Path +from typing import Dict, Any + +from .workspace import write_workspace_sentinel, is_managed_workspace, read_workspace_metadata +from .health import health_check_workspace +from ..core.cache import get_current_cache_root + +logger = logging.getLogger(__name__) + + +def rebuild_safetensors_index(workspace_path: Path) -> bool: + """Rebuild model.safetensors.index.json from existing shards. + + This fixes models affected by mlx-vlm #624 regression where conversion + overwrites correct index with source model's outdated index. + + Strategy: + 1. Find all *.safetensors shard files + 2. Read safetensors headers (no full tensor load) + 3. Build weight_map: tensor_key → shard_filename + 4. Write index atomically (tmp + rename) + + Args: + workspace_path: Path to workspace containing .safetensors shards + + Returns: + True if index rebuilt successfully + False if no shards found (single-file model, no index needed) + + Raises: + ValueError: If shard headers are corrupted or unreadable + OSError: If index write fails + + Example: + >>> success = rebuild_safetensors_index(Path("./workspace")) + >>> if success: + ... print("Index rebuilt") + ... else: + ... print("Single-file model, no index needed") + """ + from safetensors import safe_open + + # Find all safetensors shards (sorted for deterministic output) + shards = sorted(workspace_path.glob("*.safetensors")) + + if not shards: + logger.debug(f"No safetensors files in {workspace_path}") + return False + + if len(shards) == 1: + # Single-file model (e.g., model.safetensors) + # No index needed - MLX/HF loaders handle directly + logger.debug(f"Single safetensors file, no index needed: {shards[0].name}") + return False + + # Build weight map by reading headers only (no tensor load) + weight_map = {} + total_size = 0 + + for shard in shards: + total_size += shard.stat().st_size + + try: + # Read safetensors header without loading tensors + # framework="mlx" ensures compatibility with MLX tensor format + with safe_open(shard, framework="mlx", device="cpu") as f: + for key in f.keys(): + if key in weight_map: + # Duplicate key across shards - invalid model + raise ValueError( + f"Duplicate tensor key '{key}' found in multiple shards: " + f"{weight_map[key]} and {shard.name}" + ) + weight_map[key] = shard.name + + logger.debug(f"Mapped {key} → {shard.name}") + except Exception as e: + raise ValueError(f"Failed to read shard {shard.name}: {e}") from e + + if not weight_map: + raise ValueError("No tensors found in shards - corrupted model?") + + # Build index structure (HuggingFace format) + index = { + "metadata": { + "total_size": total_size + }, + "weight_map": weight_map + } + + # Atomic write (tmp + rename) + index_path = workspace_path / "model.safetensors.index.json" + tmp_path = index_path.with_suffix(".json.tmp") + + try: + tmp_path.write_text(json.dumps(index, indent=2) + "\n") + tmp_path.rename(index_path) + logger.info(f"Rebuilt index with {len(weight_map)} tensors across {len(shards)} shards") + return True + except Exception as e: + # Cleanup tmp file on failure + if tmp_path.exists(): + try: + tmp_path.unlink() + except Exception: + pass # Best effort cleanup + raise OSError(f"Failed to write index: {e}") from e + + +def convert_operation( + source_path: str, + target_path: str, + mode: str, + skip_health: bool = False +) -> Dict[str, Any]: + """Convert operation: workspace → workspace transformation. + + Phase 1 modes: + - "repair-index": Rebuild safetensors index only (fixes mlx-vlm #624) + + Future modes (Phase 2+): + - "quantize": Quantize to N bits + - "dequantize": Dequantize weights + + Workflow: + 1. Validate source exists, target is empty + 2. Cache sanctity check (hard block if source/target in cache) + 3. Create target, write sentinel FIRST (atomic) + 4. Copy non-weight assets (config, tokenizer, etc.) with CoW + 5. Clone safetensors shards with CoW + 6. Apply mode operation (repair-index/quantize/etc.) + 7. Health check on output + + Args: + source_path: Source workspace path + target_path: Target workspace path + mode: Conversion mode ("repair-index" for Phase 1) + skip_health: Skip health check on output (debug only) + + Returns: + Result dict with JSON API schema: + { + "status": "success" | "error", + "command": "convert", + "data": { + "source": str, + "target": str, + "mode": str, + "health_check": bool, + "health_status": "healthy" | "unhealthy", + "health_reason": str, + "message": str + }, + "error": { + "type": str, + "message": str + } | None + } + + Raises: + None - all errors returned in result dict + """ + result = { + "status": "success", + "command": "convert", + "error": None, + "data": { + "source": source_path, + "target": target_path, + "mode": mode, + "health_check": not skip_health + } + } + + try: + # Phase 1: Validate paths + src = Path(source_path).resolve() + dst = Path(target_path).resolve() + + if not src.exists(): + raise ValueError(f"Source path does not exist: {source_path}") + + if not src.is_dir(): + raise ValueError(f"Source must be a directory: {source_path}") + + # Phase 2: Cache sanctity check (hard block) + # ADR-018: Cache is read-only for convert, use clone first + cache_root = get_current_cache_root() + + def is_in_cache(path: Path) -> bool: + """Check if path is inside HF cache.""" + try: + path.relative_to(cache_root) + return True + except ValueError: + return False + + if is_in_cache(src): + raise ValueError( + f"Source path is in HF cache (read-only): {src}\n" + "Convert operates on workspaces only. Use 'mlxk clone' first to create a workspace:\n" + f" mlxk clone ./workspace\n" + f" mlxk convert ./workspace ./workspace-fixed --repair-index" + ) + + if is_in_cache(dst): + raise ValueError( + f"Target path is in HF cache (read-only): {dst}\n" + "Convert cannot write to cache. Choose workspace location outside cache:\n" + f" mlxk convert {source_path} ./fixed-workspace --repair-index" + ) + + # Phase 3: Validate target is empty or new + if dst.exists(): + if any(dst.iterdir()): + raise ValueError( + f"Target directory not empty: {target_path}\n" + "Convert requires empty target workspace. Use different path or delete contents." + ) + else: + dst.mkdir(parents=True) + + # Phase 4: Write workspace sentinel FIRST (atomic, ADR-018 contract) + # Ensures target is identifiable as managed workspace before any processing + src_metadata = {} + if is_managed_workspace(src): + src_metadata = read_workspace_metadata(src) + + target_metadata = { + "mlxk_version": "2.0.4", # TODO: Read from __version__ + "created_at": datetime.utcnow().isoformat() + "Z", + "source_repo": src_metadata.get("source_repo", str(src)), + "source_revision": src_metadata.get("source_revision"), + "managed": True, + "operation": "convert", + "mode": mode + } + + try: + write_workspace_sentinel(dst, target_metadata) + logger.debug(f"Wrote workspace sentinel to {dst}") + except Exception as e: + raise OSError(f"Failed to write workspace sentinel: {e}") from e + + # Phase 5: Copy non-weight assets with CoW + # Skip .safetensors (will be cloned), copy config/tokenizer/images/etc. + # Use APFS CoW (cp -c) for efficient cloning + logger.info(f"Copying non-weight assets from {src} to {dst}") + + for item in src.iterdir(): + # Skip safetensors files (handled separately) + # Skip existing sentinel in target + # Skip model.safetensors.index.json (will be rebuilt) + if item.name.endswith(".safetensors"): + continue + if item.name == ".mlxk_workspace.json": + continue + if item.name == "model.safetensors.index.json": + continue + + if item.is_file(): + try: + # Use APFS CoW for large files (instant, zero space initially) + subprocess.run( + ["cp", "-c", str(item), str(dst / item.name)], + check=True, + capture_output=True, + text=True + ) + except subprocess.CalledProcessError as e: + logger.warning(f"Failed to CoW copy {item.name}, using regular copy: {e}") + # Fallback to regular copy + import shutil + shutil.copy2(item, dst / item.name) + + # Phase 6: Clone safetensors shards (CoW) + logger.info("Cloning safetensors shards") + + for shard in src.glob("*.safetensors"): + try: + subprocess.run( + ["cp", "-c", str(shard), str(dst / shard.name)], + check=True, + capture_output=True, + text=True + ) + except subprocess.CalledProcessError as e: + logger.warning(f"Failed to CoW copy {shard.name}, using regular copy: {e}") + import shutil + shutil.copy2(shard, dst / shard.name) + + # Phase 7: Apply mode operation + if mode == "repair-index": + logger.info("Rebuilding safetensors index") + success = rebuild_safetensors_index(dst) + + if not success: + result["data"]["message"] = "No index to rebuild (single-file model or no safetensors found)" + else: + result["data"]["message"] = "Index rebuilt successfully" + + else: + raise ValueError(f"Unsupported conversion mode: {mode}") + + # Phase 8: Health check on output + if not skip_health: + logger.info("Running health check on output workspace") + healthy, reason, managed = health_check_workspace(dst) + + result["data"]["health_status"] = "healthy" if healthy else "unhealthy" + result["data"]["health_reason"] = reason + + if not healthy: + result["status"] = "error" + result["error"] = { + "type": "HealthCheckFailed", + "message": f"Output workspace failed health check: {reason}" + } + return result + + logger.info("Convert operation completed successfully") + + except Exception as e: + logger.error(f"Convert operation failed: {e}") + result["status"] = "error" + result["error"] = { + "type": type(e).__name__, + "message": str(e) + } + + return result diff --git a/mlxk2/operations/health.py b/mlxk2/operations/health.py index 16cfed8..37f189a 100644 --- a/mlxk2/operations/health.py +++ b/mlxk2/operations/health.py @@ -274,25 +274,88 @@ def check_lfs_corruption(model_path): return True, "No LFS corruption detected" -def health_from_cache(model_spec, cache_dir): - """Health check for a specific model in a specific cache directory. +def health_check_workspace(workspace_path: Path) -> Tuple[bool, str, bool]: + """Health check for workspace directory (managed or unmanaged). - This is used by clone operations to check model health in temporary caches + Workspaces can be: + - Managed: Created by mlxk (clone/convert), has .mlxk_workspace.json + - Unmanaged: 3rd-party directories, no sentinel + + Both types are checked for model file validity. The is_managed flag + helps operations decide if they can trust provenance metadata. + + Args: + workspace_path: Path to workspace directory + + Returns: + Tuple of (is_healthy, reason_message, is_managed): + - is_healthy (bool): True if model files are valid + - reason_message (str): Health status or error details + - is_managed (bool): True if workspace has valid sentinel + + Example: + >>> healthy, reason, managed = health_check_workspace(Path("./ws")) + >>> if not healthy: + ... print(f"Unhealthy: {reason}") + >>> if managed: + ... metadata = read_workspace_metadata(Path("./ws")) + ... print(f"Source: {metadata['source_repo']}") + """ + from pathlib import Path + from .workspace import is_managed_workspace + + workspace_path = Path(workspace_path).resolve() + managed = is_managed_workspace(workspace_path) + + # Check if workspace contains model files + config = workspace_path / "config.json" + if not config.exists(): + return False, "No config.json found in workspace", managed + + # Run standard snapshot health check + # Workspace root is treated as snapshot directory + healthy, reason = _check_snapshot_health(workspace_path) + + return healthy, reason, managed + + +def health_from_cache(model_spec, cache_dir): + """Health check for model in cache OR workspace path. + + This function now supports both: + 1. Cache directories: HuggingFace cache structure with snapshots + 2. Workspace directories: Direct model file directories (managed or unmanaged) + + Detection logic: + - If cache_dir contains .mlxk_workspace.json → treat as workspace + - Otherwise → treat as HF cache structure + + Used by clone operations to check model health in temporary caches without contaminating the user's main cache. Uses the full _check_snapshot_health() logic to ensure identical health validation standards. Args: model_spec: Model name/spec to check (e.g., "microsoft/DialoGPT-small") - cache_dir: Path to the cache directory containing the model + cache_dir: Path to cache directory OR workspace directory Returns: (bool, str): (is_healthy, reason_message) + + Note: For workspace paths, model_spec is ignored (workspace already + contains the model files directly). Kept for API compatibility. """ from pathlib import Path from ..core.cache import hf_to_cache_dir cache_path = Path(cache_dir) + # Check if this is a workspace (not cache structure) + if (cache_path / ".mlxk_workspace.json").exists(): + # Direct workspace check (model_spec ignored, workspace has files directly) + healthy, reason, managed = health_check_workspace(cache_path) + return healthy, reason + + # Original cache structure logic # Convert model spec to cache directory format model_cache_dir = cache_path / hf_to_cache_dir(model_spec) if not model_cache_dir.exists(): @@ -412,7 +475,13 @@ def check_runtime_compatibility(model_path: Path, framework: str) -> Tuple[bool, def health_check_operation(model_pattern=None): - """Health check operation for JSON API with model resolution support.""" + """Health check operation for JSON API with model resolution support. + + Supports: + - Cache model names: "microsoft/phi-2", "phi-2", "1.3b" + - Workspace paths: "./workspace", "/path/to/model" + - No pattern: Check all cached models + """ result = { "status": "success", "command": "health", @@ -427,13 +496,38 @@ def health_check_operation(model_pattern=None): } } } - + try: + # Check if model_pattern is a workspace path + if model_pattern: + workspace_path = Path(model_pattern) + if workspace_path.exists() and (workspace_path / "config.json").exists(): + # This is a workspace directory - use workspace health check + healthy, reason, managed = health_check_workspace(workspace_path) + + model_info = { + "name": str(workspace_path), + "status": "healthy" if healthy else "unhealthy", + "reason": reason, + "managed": managed + } + + result["data"]["summary"]["total"] = 1 + if healthy: + result["data"]["healthy"].append(model_info) + result["data"]["summary"]["healthy_count"] = 1 + else: + result["data"]["unhealthy"].append(model_info) + result["data"]["summary"]["unhealthy_count"] = 1 + + return result + + # Not a workspace - proceed with cache-based health check model_cache = get_current_model_cache() if not model_cache.exists(): result["data"]["summary"]["total"] = 0 return result - + # Use model resolution if specific pattern provided if model_pattern: resolved_name, commit_hash, ambiguous_matches = resolve_model_for_operation(model_pattern) diff --git a/mlxk2/operations/pull.py b/mlxk2/operations/pull.py index 906b198..725f43a 100644 --- a/mlxk2/operations/pull.py +++ b/mlxk2/operations/pull.py @@ -1,4 +1,4 @@ -from ..core.cache import MODEL_CACHE, hf_to_cache_dir +from ..core.cache import get_current_model_cache, hf_to_cache_dir from ..core.model_resolution import resolve_model_for_operation from .health import is_model_healthy import os @@ -127,8 +127,13 @@ def pull_model_with_huggingface_hub(model_name, cache_dir=None): return False, f"Download failed: {str(e)}" -def pull_operation(model_spec): - """Pull (download) operation for JSON API.""" +def pull_operation(model_spec, force_resume=False): + """Pull (download) operation for JSON API. + + Args: + model_spec: Model name/spec to pull + force_resume: If True, skip unhealthy check and attempt resume + """ result = { "status": "success", "command": "pull", @@ -190,22 +195,19 @@ def pull_operation(model_spec): result["data"]["commit_hash"] = commit_hash # Check if already exists and is healthy - cache_dir = MODEL_CACHE / hf_to_cache_dir(resolved_name) - if cache_dir.exists(): - healthy, _ = is_model_healthy(resolved_name) + cache_dir = get_current_model_cache() / hf_to_cache_dir(resolved_name) + if cache_dir.exists() and not force_resume: + healthy, health_reason = is_model_healthy(resolved_name) if healthy: result["data"]["download_status"] = "already_exists" result["data"]["message"] = f"Model {resolved_name} already exists in cache" return result else: - # Model exists but unhealthy - suggest rm workflow - result["status"] = "error" - result["error"] = { - "type": "model_corrupted", - "message": f"Model exists but is corrupted. Use 'rm {model_spec}' first, then pull again." - } - result["data"]["download_status"] = "corrupted" - return result + # Model exists but unhealthy - prompt user to resume + # Let huggingface_hub decide if resume is possible + result["data"]["download_status"] = "requires_confirmation" + result["data"]["message"] = f"{health_reason}. Use --force-resume to attempt resume or 'mlxk rm' to delete." + return result # CLI will handle prompt # Preflight check for repository access (Issue #30) result["data"]["download_status"] = "checking_access" @@ -250,7 +252,15 @@ def pull_operation(model_spec): def pull_to_cache(model_spec, cache_dir): - """Pull model to specific cache directory - used by clone operation.""" + """Pull model to specific cache directory - used by clone operation. + + Note: Resumable download prompts are not implemented for pull_to_cache() because: + - Clone creates fresh temp caches with unique names (PID + random) + - Partial downloads don't persist across clone operations + - Making is_model_healthy() work with custom cache dirs requires refactoring + - The scenario is rare (as noted in PLAN-resumable-pull-clone.md) + TODO: Add health check if clone starts reusing temp caches in the future + """ result = { "status": "success", "command": "pull", diff --git a/mlxk2/operations/workspace.py b/mlxk2/operations/workspace.py new file mode 100644 index 0000000..3f1a3cf --- /dev/null +++ b/mlxk2/operations/workspace.py @@ -0,0 +1,142 @@ +"""Workspace sentinel management (ADR-018 Phase 0a). + +This module provides primitives for managed workspace detection and metadata tracking. + +Managed workspaces contain a `.mlxk_workspace.json` sentinel file that enables: +- Workspace lifecycle tracking (clone, convert, push) +- Source provenance (which HF repo was cloned) +- Operation history (what transformations were applied) +- Safety guarantees (e.g., cache sanctity enforcement in convert) + +Sentinel format: +{ + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "source_repo": "mlx-community/Llama-3.2-3B", + "source_revision": "abc123def456", + "managed": true, + "operation": "clone" // or "convert" +} + +ADR-018 Contract: +- Clone and convert MUST produce managed workspaces +- Sentinel is written FIRST (atomic, before other processing) +- Health checks support both managed and unmanaged workspaces +- Unmanaged workspaces can be converted to managed via convert operation +""" + +import json +import logging +from pathlib import Path +from typing import Dict, Any + +logger = logging.getLogger(__name__) + +SENTINEL_FILENAME = ".mlxk_workspace.json" + + +def write_workspace_sentinel(workspace_path: Path, metadata: Dict[str, Any]) -> None: + """Write workspace sentinel with atomic write+rename. + + Sentinel is written atomically to prevent partial writes during crashes. + Uses tmp file + rename pattern (POSIX atomic on same filesystem). + + Args: + workspace_path: Root directory of workspace + metadata: Dictionary with sentinel fields: + - mlxk_version (str): mlxk version that created workspace + - created_at (str): ISO8601 timestamp + - source_repo (str): Source HF repo (e.g., "mlx-community/Llama-3.2-3B") + - source_revision (str|None): Git revision hash + - managed (bool): Always True for mlxk-created workspaces + - operation (str): Creating operation ("clone", "convert") + Additional fields allowed (forward compatibility) + + Raises: + OSError: If sentinel write fails + TypeError: If workspace_path is not a Path object + ValueError: If metadata is missing required fields + """ + if not isinstance(workspace_path, Path): + raise TypeError(f"workspace_path must be Path, got {type(workspace_path)}") + + # Validate required fields + required_fields = {"mlxk_version", "created_at", "managed", "operation"} + missing = required_fields - set(metadata.keys()) + if missing: + raise ValueError(f"Missing required metadata fields: {missing}") + + workspace_path = workspace_path.resolve() + sentinel_path = workspace_path / SENTINEL_FILENAME + tmp_path = workspace_path / f"{SENTINEL_FILENAME}.tmp" + + try: + # Atomic write: tmp file + rename + tmp_path.write_text(json.dumps(metadata, indent=2) + "\n") + tmp_path.rename(sentinel_path) + + logger.debug(f"Wrote workspace sentinel to {sentinel_path}") + except Exception as e: + # Cleanup tmp file if rename failed + if tmp_path.exists(): + try: + tmp_path.unlink() + except Exception: + pass # Best effort cleanup + raise OSError(f"Failed to write workspace sentinel: {e}") from e + + +def is_managed_workspace(workspace_path: Path) -> bool: + """Check if workspace has valid sentinel. + + Args: + workspace_path: Path to workspace directory + + Returns: + True if workspace has valid .mlxk_workspace.json with managed=True, + False otherwise (missing sentinel, invalid JSON, managed=False) + """ + if not isinstance(workspace_path, Path): + return False + + sentinel = workspace_path.resolve() / SENTINEL_FILENAME + + if not sentinel.exists(): + return False + + try: + data = json.loads(sentinel.read_text()) + return data.get("managed", False) is True + except (json.JSONDecodeError, OSError) as e: + logger.debug(f"Invalid sentinel in {workspace_path}: {e}") + return False + + +def read_workspace_metadata(workspace_path: Path) -> Dict[str, Any]: + """Read workspace sentinel metadata. + + Args: + workspace_path: Path to workspace directory + + Returns: + Dictionary with sentinel metadata, or empty dict if: + - Sentinel doesn't exist + - JSON is invalid + - Read fails + + Note: This function does NOT validate metadata fields. + Use is_managed_workspace() to check if workspace is managed. + """ + if not isinstance(workspace_path, Path): + return {} + + sentinel = workspace_path.resolve() / SENTINEL_FILENAME + + if not sentinel.exists(): + return {} + + try: + return json.loads(sentinel.read_text()) + except (json.JSONDecodeError, OSError) as e: + logger.debug(f"Failed to read sentinel in {workspace_path}: {e}") + return {} diff --git a/mlxk2/output/human.py b/mlxk2/output/human.py index ef59c9b..4e064cc 100644 --- a/mlxk2/output/human.py +++ b/mlxk2/output/human.py @@ -232,7 +232,13 @@ def render_health(data: Dict[str, Any]) -> str: healthy_count = summary.get("healthy_count", 0) unhealthy_count = summary.get("unhealthy_count", 0) - lines = [f"Summary: total {total}, healthy {healthy_count}, unhealthy {unhealthy_count}"] + lines = [] + + # Show summary only when checking multiple items (or zero) + # For single item (total==1), the result line is self-explanatory + if total != 1: + lines.append(f"Summary: total {total}, healthy {healthy_count}, unhealthy {unhealthy_count}") + for entry in d.get("healthy", []): lines.append(f"healthy {entry.get('name','-')} — {entry.get('reason','')}".rstrip()) for entry in d.get("unhealthy", []): @@ -292,6 +298,13 @@ def render_show(data: Dict[str, Any]) -> str: def render_pull(data: Dict[str, Any]) -> str: d = data.get("data", {}) + + # Handle requires_confirmation status + if d.get("download_status") == "requires_confirmation": + model = d.get("model", "-") + message = d.get("message", "Partial download detected") + return f"pull: {model} — {message}" + status = data.get("status", "error") model = d.get("model", "-") msg = d.get("message", "") @@ -328,16 +341,20 @@ def render_clone(data: Dict[str, Any], quiet: bool = False) -> str: # Show additional info for successful clone cache_cleanup = d.get("cache_cleanup", False) - health_check = d.get("health_check", True) + health_status = d.get("health") # "healthy" or "unhealthy" or None + health_reason = d.get("health_reason", "") status_parts = [] - if health_check: - status_parts.append("✓ health") + if health_status == "healthy": + status_parts.append("✓ healthy") + elif health_status == "unhealthy": + # Show warning for unhealthy models + status_parts.append(f"⚠ unhealthy: {health_reason}") if cache_cleanup: status_parts.append("✓ cleanup") status_info = f" ({', '.join(status_parts)})" if status_parts else "" - return f"clone: {model} → {target_dir}{status_info} — {msg}".rstrip() + return f"clone: {model} → {target_dir}{status_info}".rstrip() # Error case err = data.get("error", {}) @@ -351,6 +368,40 @@ def render_clone(data: Dict[str, Any], quiet: bool = False) -> str: return f"clone: {model} → {target_dir} — {error_msg}".rstrip() +def render_convert(data: Dict[str, Any]) -> str: + """Render convert operation result for human consumption.""" + d = data.get("data", {}) + status = data.get("status", "error") + + source = d.get("source", "-") + target = d.get("target", "-") + mode = d.get("mode", "unknown") + message = d.get("message", "") + + health_status = d.get("health_status", "unknown") + health_reason = d.get("health_reason", "") + + if status == "success": + output = f"convert: {source} → {target}\n" + output += f" Mode: {mode}\n" + + if message: + output += f" {message}\n" + + if health_status == "healthy": + output += " ✓ Health check: passed" + elif health_status == "unhealthy": + output += f" ✗ Health check: {health_reason}" + + return output.rstrip() + + # Error case + err = data.get("error", {}) + error_msg = err.get("message", message) + + return f"convert: {source} → {target} — {error_msg}".rstrip() + + def render_push(data: Dict[str, Any], verbose: bool = False) -> str: d = data.get("data", {}) status = data.get("status", "error") diff --git a/pytest.ini b/pytest.ini index 7f483b2..48b8d8b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,13 +5,14 @@ python_classes = Test* python_functions = test_* markers = spec: JSON API contract tests (current spec only) - wet: Opt-in live tests against Hugging Face (require env) + wet: Umbrella for Portfolio Discovery compatible tests (see TESTING-DETAILS.md) live_push: Alias for wet; push live tests (require env) live_list: Alias for wet; list human live tests (require env) live_clone: Alias for wet; clone live tests (require env, ADR-007 Phase 1) live_run: Opt-in run command tests with real models (require user cache model) live_stop_tokens: Opt-in stop token tests with real models (Issue #32, ADR-009) live_e2e: E2E tests with real models and server (require HF_HOME, ADR-011) + live_resumable: Isolated Cache WRITE tests (separate run required) show_model_portfolio: Display E2E test portfolio (convenience, no testing) issue27: Real-model health policy tests (opt-in; read-only user cache) slow: Tests that take >1 minute to run diff --git a/scripts/test-wet-memmom.sh b/scripts/test-wet-memmom.sh new file mode 100755 index 0000000..0bde0a1 --- /dev/null +++ b/scripts/test-wet-memmom.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# ⚠️ DEBUG TOOL - NOT FOR REGULAR TESTING ⚠️ +# +# Session 56-57 Memory Debugging Script +# Purpose: Analyze memory behavior with memmon.py + benchmark reporting +# Created: 2025-12-28 for investigating macOS Tahoe 26.2 RAM tax +# +# ❌ DO NOT USE for regular development testing! +# ✅ USE scripts/test-wet-umbrella.sh instead +# +# This script adds significant overhead: +# - memmon.py (memory monitoring every 200ms) +# - Benchmark JSONL reporting (schema v0.2.0) +# - Memplot generation for timeline visualization +# +# Historical context: +# - Session 56: Investigated 71 GB swap issue +# - Session 57: Discovered macOS Tahoe 26.2 consumes 25 GB more baseline RAM +# - Result: No MLX bug, just Apple TB5 RDMA support overhead +# +# Usage: $0 +# Example: $0 v4 +# Output: benchmarks/reports/YYYY-MM-DD-wet-{memory,benchmark}-.jsonl + +if [ -z "$1" ]; then + echo "Usage: $0 " + echo "Example: $0 v4" + exit 1 +fi + +SIGNATURE="$1" +DATE=$(date +%Y-%m-%d) + +python -u benchmarks/tools/memmon.py \ + --output benchmarks/reports/${DATE}-wet-memory-${SIGNATURE}.jsonl -- \ + venv310/bin/pytest -m wet -v -s --tb=no --report-output=benchmarks/reports/${DATE}-wet-benchmark-${SIGNATURE}.jsonl diff --git a/scripts/test-wet-umbrella.sh b/scripts/test-wet-umbrella.sh new file mode 100755 index 0000000..1c5ddb1 --- /dev/null +++ b/scripts/test-wet-umbrella.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Run all "real tests" (wet umbrella + resumable) +# Memory-optimized for large test suites (154+ tests) +set -e + +echo "🌂 Wet Umbrella: Running all real tests..." + +# Memory-saving pytest options: +# --tb=no: No tracebacks (routine runs expect all PASSED, debug failures individually) +# --capture=sys: System-level capture only (less buffering than --capture=fd) +PYTEST_OPTS="--tb=no --capture=sys" + +# Run 1: Compatible live tests (User Cache READ) +echo "" +echo "📦 Phase 1: User Cache READ tests (wet umbrella)..." +MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m wet -v $PYTEST_OPTS + +# Run 2: Isolated Cache WRITE tests (incompatible with Portfolio) +echo "" +echo "📥 Phase 2: Isolated Cache WRITE tests (resumable)..." +MLXK2_TEST_RESUMABLE_DOWNLOAD=1 pytest -m live_resumable -v $PYTEST_OPTS + +echo "" +echo "✅ All real tests completed!" diff --git a/tests_2.0/conftest.py b/tests_2.0/conftest.py index db2a9c6..1fd451f 100644 --- a/tests_2.0/conftest.py +++ b/tests_2.0/conftest.py @@ -10,10 +10,11 @@ if str(_stubs_path) not in sys.path: sys.path.insert(0, str(_stubs_path)) import os +import re import tempfile import pytest from pathlib import Path -from typing import Generator +from typing import Generator, Dict, Any from contextlib import contextmanager import shutil import random @@ -42,10 +43,11 @@ def cleanup_zombie_servers(request): Use case: Prevents RAM exhaustion and port conflicts from accumulated zombies. """ - # SAFETY: Only cleanup when live_e2e marker is explicitly requested + # SAFETY: Only cleanup when live_e2e or wet marker is explicitly requested # This prevents killing production/dev servers during unit test runs + # wet marker includes live_e2e tests (Wet Umbrella pattern) selected_markers = request.config.getoption("-m") or "" - should_cleanup = "live_e2e" in selected_markers + should_cleanup = "live_e2e" in selected_markers or "wet" in selected_markers if should_cleanup: # Pre-test cleanup: Kill zombies from previous runs @@ -1158,3 +1160,350 @@ def copy_user_model_to_isolated(isolated_cache): return dst return copier + + +# ============================================================================= +# Wet Umbrella: Auto-assign marker to Portfolio-compatible tests +# ============================================================================= + +def pytest_collection_modifyitems(config, items): + """Auto-assign wet marker to Portfolio Discovery compatible tests. + + Wet Umbrella groups tests that can run together in one pytest invocation: + - User Cache READ tests (live_e2e, live_stop_tokens, live_run, live_list) + - Workspace operations (live_push, live_clone) + - Issue reproduction (issue27) + + Excluded from wet: + - live_resumable (Isolated Cache WRITE - requires clean import state) + - Model validation tests (memory-intensive, belongs in separate benchmark suite) + + See: TESTING-DETAILS.md → Extended Truth Table + """ + # Compatible live markers (User Cache READ + Workspace) + LIVE_MARKERS_FOR_WET = { + "live_e2e", # Portfolio Discovery (User Cache READ) + "live_stop_tokens", # Portfolio Discovery (User Cache READ) + "live_run", # User Cache READ + "live_list", # User Cache READ + "live_push", # Workspace (not Cache) + "live_clone", # Workspace (not Cache) + "issue27", # User Cache READ + } + + # Tests excluded from wet (model validation, not code tests) + EXCLUDED_FROM_WET = { + "test_empirical_mapping_single_model", # Model benchmark (ADR-013) + "test_empirical_mapping_generate_report", # Report generation + } + + for item in items: + test_markers = {m.name for m in item.iter_markers()} + test_path = str(item.path) + test_name = item.name + is_in_live_dir = "/live/" in test_path or "\\live\\" in test_path + + # Skip model validation tests + if any(excluded in test_name for excluded in EXCLUDED_FROM_WET): + continue + + # Wet marker for compatible tests + if (test_markers & LIVE_MARKERS_FOR_WET) or is_in_live_dir: + # EXCLUDE live_resumable (Isolated Cache WRITE - incompatible!) + if "live_resumable" not in test_markers: + item.add_marker(pytest.mark.wet) + + +# ============================================================================ +# Benchmark Reporting (ADR-013 Phase 0.5) +# ============================================================================ + +def pytest_addoption(parser): + """Add --report-output option for benchmark reporting.""" + parser.addoption( + "--report-output", + action="store", + default=None, + metavar="PATH", + help="Generate benchmark reports to JSONL file (ADR-013 Phase 0.5)" + ) + + +def pytest_configure(config): + """Initialize report file if --report-output is specified.""" + from pathlib import Path + config.report_file = None + if report_path := config.getoption("--report-output"): + config.report_file = Path(report_path).open("a", encoding="utf-8") + print(f"\n📊 Benchmark reporting enabled: {report_path}") + + +def pytest_unconfigure(config): + """Close report file at end of session.""" + if config.report_file: + config.report_file.close() + + +# ============================================================================ +# Benchmark Reporting Helpers (ADR-013 Phase 0.5) +# ============================================================================ + +def parse_vm_stat_page_size(output: str) -> int: + """Extract vm_stat page size in bytes, falling back to 4096.""" + match = re.search(r"page size of (\d+) bytes", output) + if match: + return int(match.group(1)) + return 4096 + + +def _get_macos_system_health() -> Dict[str, Any]: + """Collect macOS system health metrics (ADR-013 Phase 0.5 - v0.2.0). + + Uses macOS-native tools (sysctl, vm_stat, ps) - ZERO new dependencies. + Enables automatic regression quality assessment via quality_flags. + + Returns: + dict: System health metrics with keys: + - swap_used_mb: Current swap usage in MB + - ram_free_gb: Available RAM in GB + - zombie_processes: Count of zombie processes + - quality_flags: List of quality indicators + ["clean"] = healthy system + ["degraded_swap"] = swap usage detected (memory pressure) + ["degraded_zombies"] = zombie processes detected + + Quality Thresholds (empirically derived from Session 43 analysis): + - Swap: >100 MB indicates memory pressure (beta2→beta3: 1.8 GB swap = +3.4% slowdown) + - Zombies: >0 indicates stuck processes (REGRESSION-2025-12-08: 14 zombies = +90% slowdown) + """ + # Force C locale for consistent number formatting (avoid locale-specific decimal separators) + env = os.environ.copy() + env["LC_ALL"] = "C" + + health = { + "swap_used_mb": 0, + "ram_free_gb": 0.0, + "zombie_processes": 0, + "quality_flags": [] + } + + try: + # Get swap usage via sysctl (macOS native) + # sysctl vm.swapusage returns: "vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)" + result = subprocess.run( + ["sysctl", "vm.swapusage"], + capture_output=True, + text=True, + timeout=2, + env=env + ) + if result.returncode == 0: + # Parse: "total = X.XXM used = Y.YYM free = Z.ZZM" + # LC_ALL=C ensures consistent dot decimal separator + for part in result.stdout.split(): + if part.endswith("M") and "used" in result.stdout: + # Extract used value (appears after "used = ") + parts = result.stdout.split("used = ") + if len(parts) > 1: + used_str = parts[1].split()[0] + # Parse size (can be M or G suffix) + if used_str.endswith("G"): + health["swap_used_mb"] = int(float(used_str[:-1]) * 1024) + elif used_str.endswith("M"): + health["swap_used_mb"] = int(float(used_str[:-1])) + break + except Exception: + pass # Swap metric is optional (not critical if it fails) + + try: + # Get free RAM via vm_stat (macOS native) + # vm_stat reports page size in the header (Apple Silicon uses 16KB pages). + result = subprocess.run( + ["vm_stat"], + capture_output=True, + text=True, + timeout=2, + env=env + ) + if result.returncode == 0: + page_size = parse_vm_stat_page_size(result.stdout) + # Parse "Pages free: 12345." + for line in result.stdout.splitlines(): + if "Pages free:" in line: + pages_free = int(line.split(":")[1].strip().rstrip(".")) + health["ram_free_gb"] = round(pages_free * page_size / (1024**3), 2) + break + except Exception: + pass # RAM metric is optional + + try: + # Get zombie process count via ps aux (macOS native) + # Zombies show as "" in ps output + result = subprocess.run( + ["ps", "aux"], + capture_output=True, + text=True, + timeout=2, + env=env + ) + if result.returncode == 0: + # Count lines containing "" + health["zombie_processes"] = result.stdout.count("") + except Exception: + pass # Zombie count is optional + + # Determine quality flags (empirical thresholds from regression analysis) + flags = [] + if health["swap_used_mb"] > 100: + flags.append("degraded_swap") + if health["zombie_processes"] > 0: + flags.append("degraded_zombies") + + # If no degradation detected, mark as clean + if not flags: + flags.append("clean") + + health["quality_flags"] = flags + return health + + +def _get_macos_hardware_profile() -> Dict[str, Any]: + """Collect macOS hardware profile (ADR-013 Phase 0.5 - v0.2.0). + + Uses macOS-native sysctl - ZERO new dependencies. + Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4). + + Returns: + dict: Hardware profile with keys: + - model: Mac model identifier (e.g., "Mac14,9" = M3 Max) + - cores_physical: Physical CPU cores (P-cores only) + - cores_logical: Logical CPU cores (P+E cores with hyperthreading) + """ + profile = { + "model": "unknown", + "cores_physical": 0, + "cores_logical": 0, + } + + try: + # Get Mac model identifier + result = subprocess.run( + ["sysctl", "-n", "hw.model"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + profile["model"] = result.stdout.strip() + except Exception: + pass + + try: + # Get physical cores (P-cores) + result = subprocess.run( + ["sysctl", "-n", "hw.physicalcpu"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + profile["cores_physical"] = int(result.stdout.strip()) + except Exception: + pass + + try: + # Get logical cores (P+E cores with hyperthreading) + result = subprocess.run( + ["sysctl", "-n", "hw.logicalcpu"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + profile["cores_logical"] = int(result.stdout.strip()) + except Exception: + pass + + return profile + + +@pytest.hookimpl(hookwrapper=True) +def pytest_runtest_makereport(item, call): + """Generate benchmark report for each test (if --report-output enabled). + + Reports are written as JSONL (one JSON object per line) to allow + streaming and easy appending across test runs. + + Schema version: 0.2.0 (Phase 0.5 - System Health + Hardware Profile) + See: ADR-013 Phase 0.5 implementation + + Changelog from 0.1.0 → 0.2.0: + - Added: system.hardware_profile (Mac model, cores) + - Added: system_health (swap, RAM, zombies, quality_flags) + - Backward compatible: All 0.1.0 fields preserved + """ + import json + from datetime import datetime, timezone + + outcome = yield + report = outcome.get_result() + + # Only report on test call phase (not setup/teardown) + if call.when == "call" and item.config.report_file: + try: + # Import version here to avoid circular imports + from mlxk2 import __version__ + except ImportError: + __version__ = "unknown" + + # Build report data (required fields) + data = { + "schema_version": "0.2.0", + "timestamp": datetime.now(timezone.utc).isoformat(), + "mlx_knife_version": __version__, + "test": item.nodeid, + "outcome": report.outcome, + } + + # Add duration if available + if hasattr(report, "duration"): + data["duration"] = report.duration + + # Add skip reason for skipped tests + if report.outcome == "skipped" and hasattr(report, "longrepr"): + # Extract skip reason from longrepr tuple + if isinstance(report.longrepr, tuple) and len(report.longrepr) >= 3: + skip_reason = report.longrepr[2] + data.setdefault("metadata", {})["skip_reason"] = skip_reason + + # Extract structured data from user_properties + # Tests can add data via: request.node.user_properties.append(("key", value)) + for key, value in item.user_properties: + if key in ("model", "performance", "stop_tokens", "system"): + # Structured sections (top-level keys) + data[key] = value + else: + # Everything else goes to metadata + data.setdefault("metadata", {})[key] = value + + # ADR-013 Phase 0.5: Collect system health metrics (v0.2.0) + # Enables automatic regression quality assessment + system_health = _get_macos_system_health() + data["system_health"] = system_health + + # ADR-013 Phase 0.5: Collect hardware profile (v0.2.0) + # Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4) + hardware_profile = _get_macos_hardware_profile() + + # Add hardware_profile to system section (create if not exists) + if "system" not in data: + data["system"] = {} + data["system"]["hardware_profile"] = hardware_profile + + # Write JSONL (one line per report) + try: + item.config.report_file.write(json.dumps(data) + "\n") + item.config.report_file.flush() + except Exception as e: + # Don't fail tests if reporting fails + print(f"\n⚠️ Benchmark report write failed: {e}") diff --git a/tests_2.0/live/conftest.py b/tests_2.0/live/conftest.py index b7ff59e..98ec8f8 100644 --- a/tests_2.0/live/conftest.py +++ b/tests_2.0/live/conftest.py @@ -47,13 +47,24 @@ def _skip_unless_live_e2e_marker(request): This fixture ensures they are skipped in the default pytest run. Exception: show_model_portfolio marker is allowed (convenience diagnostics). + + SCOPE LIMITATION: Only applies to tests in tests_2.0/live/ directory. + Tests in parent directory manage their own markers independently. """ + # CRITICAL: Only apply to tests in live/ directory + # Tests in parent directory (tests_2.0/) handle their own skip logic + test_path = str(request.node.path) + if "/live/" not in test_path and "\\live\\" not in test_path: + return # Skip fixture for tests outside live/ directory + # Check if test has live_e2e marker if request.node.get_closest_marker("live_e2e"): - # Check if -m live_e2e or -m show_model_portfolio was specified + # Check if -m live_e2e or -m show_model_portfolio or -m wet was specified selected_markers = request.config.getoption("-m") or "" - if "live_e2e" not in selected_markers and "show_model_portfolio" not in selected_markers: - pytest.skip("Run with -m live_e2e to enable E2E tests with real models") + if ("live_e2e" not in selected_markers and + "show_model_portfolio" not in selected_markers and + "wet" not in selected_markers): + pytest.skip("Run with -m live_e2e or -m wet") def pytest_generate_tests(metafunc): @@ -77,10 +88,19 @@ def pytest_generate_tests(metafunc): IMPORTANT: This hook runs during COLLECTION phase. We check for live_e2e marker BEFORE doing portfolio discovery to avoid slow collection when marker is not requested (maintains marker-required 🔒). + + SCOPE LIMITATION: Only apply to tests in tests_2.0/live/ directory to avoid + interfering with parent directory tests that use isolated_cache. """ - # Check if live_e2e marker is requested (COLLECTION-TIME check) + # CRITICAL: Only apply this hook to tests in the live/ directory + # Tests in parent directory (tests_2.0/) should not be parametrized by Portfolio Discovery + test_path = str(metafunc.definition.path) + if "/live/" not in test_path and "\\live\\" not in test_path: + return # Skip hook for tests outside live/ directory + + # Check if live_e2e or wet marker is requested (COLLECTION-TIME check) selected_markers = metafunc.config.getoption("-m") or "" - is_live_e2e = "live_e2e" in selected_markers + is_live_e2e = "live_e2e" in selected_markers or "wet" in selected_markers # Handle text_model_key (NEW - Portfolio Separation) if "text_model_key" in metafunc.fixturenames: @@ -342,6 +362,59 @@ def vision_model_info(vision_portfolio, vision_model_key): return vision_portfolio[vision_model_key] +@pytest.fixture(autouse=True) +def _auto_report_vision_model(request): + """Auto-report vision model info to benchmark log (autouse). + + This fixture automatically adds vision model metadata to benchmark reports + for parametrized vision tests, without requiring explicit report_benchmark() calls. + + This ensures vision models appear with proper annotations in memplot.py timeline charts. + + Handles two types of vision tests: + 1. API tests with vision_model_key parameter (vision_portfolio) + 2. CLI tests in test_vision_e2e_live.py (hardcoded pixtral) + """ + # Type 1: Parametrized vision API tests (vision_model_key) + if "vision_model_key" in request.fixturenames: + # Get vision model info from fixture + try: + vision_model_info = request.getfixturevalue("vision_model_info") + except: + return + + if not vision_model_info: + return + + # Extract model metadata + model_id = vision_model_info["id"] + family, variant = _parse_model_family(model_id) + + # Vision models: ram_needed_gb is disk size (no 1.2x overhead) + ram_gb = vision_model_info["ram_needed_gb"] + disk_size_gb = ram_gb if ram_gb != float('inf') else float('inf') + + # Append to user_properties for benchmark reporting (schema v0.2.0) + request.node.user_properties.append(("model", { + "id": model_id, + "size_gb": round(disk_size_gb, 2) if disk_size_gb != float('inf') else disk_size_gb, + "family": family, + "variant": variant, + })) + return + + # Type 2: CLI vision tests (test_vision_e2e_live.py) + # These tests use subprocess.run(["mlxk", "run", "pixtral", ...]) + if 'test_vision_e2e_live.py' in request.node.nodeid: + # All CLI vision tests use pixtral (hardcoded in subprocess calls) + request.node.user_properties.append(("model", { + "id": "mlx-community/pixtral-12b-8bit", + "size_gb": 14.0, # Approximate (12B 8-bit ≈ 14GB) + "family": "pixtral", + "variant": "12b-8bit", + })) + + def _parse_model_family(model_id: str) -> tuple[str, str]: """Extract model family and variant from HuggingFace model ID. @@ -496,282 +569,4 @@ def report_benchmark(request): for key, value in extra.items(): request.node.user_properties.append((key, value)) - return _report - - -# ============================================================================ -# Benchmark Reporting (ADR-013 Phase 0 + 0.5) -# ============================================================================ - -def _get_macos_system_health() -> Dict[str, Any]: - """Collect macOS system health metrics (ADR-013 Phase 0.5 - v0.2.0). - - Uses macOS-native tools (sysctl, vm_stat, ps) - ZERO new dependencies. - Enables automatic regression quality assessment via quality_flags. - - Returns: - dict: System health metrics with keys: - - swap_used_mb: Current swap usage in MB - - ram_free_gb: Available RAM in GB - - zombie_processes: Count of zombie processes - - quality_flags: List of quality indicators - ["clean"] = healthy system - ["degraded_swap"] = swap usage detected (memory pressure) - ["degraded_zombies"] = zombie processes detected - - Quality Thresholds (empirically derived from Session 43 analysis): - - Swap: >100 MB indicates memory pressure (beta2→beta3: 1.8 GB swap = +3.4% slowdown) - - Zombies: >0 indicates stuck processes (REGRESSION-2025-12-08: 14 zombies = +90% slowdown) - """ - import subprocess - - health = { - "swap_used_mb": 0, - "ram_free_gb": 0.0, - "zombie_processes": 0, - "quality_flags": [] - } - - try: - # Get swap usage via sysctl (macOS native) - # sysctl vm.swapusage returns: "vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)" - result = subprocess.run( - ["sysctl", "vm.swapusage"], - capture_output=True, - text=True, - timeout=2 - ) - if result.returncode == 0: - # Parse: "total = X.XXM used = Y.YYM free = Z.ZZM" - for part in result.stdout.split(): - if part.endswith("M") and "used" in result.stdout: - # Extract used value (appears after "used = ") - parts = result.stdout.split("used = ") - if len(parts) > 1: - used_str = parts[1].split()[0] - # Parse size (can be M or G suffix) - if used_str.endswith("G"): - health["swap_used_mb"] = int(float(used_str[:-1]) * 1024) - elif used_str.endswith("M"): - health["swap_used_mb"] = int(float(used_str[:-1])) - break - except Exception: - pass # Swap metric is optional (not critical if it fails) - - try: - # Get free RAM via vm_stat (macOS native) - # vm_stat reports page size in the header (Apple Silicon uses 16KB pages). - result = subprocess.run( - ["vm_stat"], - capture_output=True, - text=True, - timeout=2 - ) - if result.returncode == 0: - page_size = parse_vm_stat_page_size(result.stdout) - # Parse "Pages free: 12345." - for line in result.stdout.splitlines(): - if "Pages free:" in line: - pages_free = int(line.split(":")[1].strip().rstrip(".")) - health["ram_free_gb"] = round(pages_free * page_size / (1024**3), 2) - break - except Exception: - pass # RAM metric is optional - - try: - # Get zombie process count via ps aux (macOS native) - # Zombies show as "" in ps output - result = subprocess.run( - ["ps", "aux"], - capture_output=True, - text=True, - timeout=2 - ) - if result.returncode == 0: - # Count lines containing "" - health["zombie_processes"] = result.stdout.count("") - except Exception: - pass # Zombie count is optional - - # Determine quality flags (empirical thresholds from regression analysis) - flags = [] - if health["swap_used_mb"] > 100: - flags.append("degraded_swap") - if health["zombie_processes"] > 0: - flags.append("degraded_zombies") - - # If no degradation detected, mark as clean - if not flags: - flags.append("clean") - - health["quality_flags"] = flags - return health - - -def _get_macos_hardware_profile() -> Dict[str, Any]: - """Collect macOS hardware profile (ADR-013 Phase 0.5 - v0.2.0). - - Uses macOS-native sysctl - ZERO new dependencies. - Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4). - - Returns: - dict: Hardware profile with keys: - - model: Mac model identifier (e.g., "Mac14,9" = M3 Max) - - cores_physical: Physical CPU cores (P-cores only) - - cores_logical: Logical CPU cores (P+E cores with hyperthreading) - """ - import subprocess - - profile = { - "model": "unknown", - "cores_physical": 0, - "cores_logical": 0, - } - - try: - # Get Mac model identifier - result = subprocess.run( - ["sysctl", "-n", "hw.model"], - capture_output=True, - text=True, - timeout=2 - ) - if result.returncode == 0: - profile["model"] = result.stdout.strip() - except Exception: - pass - - try: - # Get physical cores (P-cores) - result = subprocess.run( - ["sysctl", "-n", "hw.physicalcpu"], - capture_output=True, - text=True, - timeout=2 - ) - if result.returncode == 0: - profile["cores_physical"] = int(result.stdout.strip()) - except Exception: - pass - - try: - # Get logical cores (P+E cores with hyperthreading) - result = subprocess.run( - ["sysctl", "-n", "hw.logicalcpu"], - capture_output=True, - text=True, - timeout=2 - ) - if result.returncode == 0: - profile["cores_logical"] = int(result.stdout.strip()) - except Exception: - pass - - return profile - - -def pytest_addoption(parser): - """Add --report-output option for benchmark reporting.""" - parser.addoption( - "--report-output", - action="store", - default=None, - metavar="PATH", - help="Generate benchmark reports to JSONL file (ADR-013 Phase 0.5)" - ) - - -def pytest_configure(config): - """Initialize report file if --report-output is specified.""" - config.report_file = None - if report_path := config.getoption("--report-output"): - config.report_file = Path(report_path).open("a", encoding="utf-8") - print(f"\n📊 Benchmark reporting enabled: {report_path}") - - -def pytest_unconfigure(config): - """Close report file at end of session.""" - if config.report_file: - config.report_file.close() - - -@pytest.hookimpl(hookwrapper=True) -def pytest_runtest_makereport(item, call): - """Generate benchmark report for each test (if --report-output enabled). - - Reports are written as JSONL (one JSON object per line) to allow - streaming and easy appending across test runs. - - Schema version: 0.2.0 (Phase 0.5 - System Health + Hardware Profile) - See: ADR-013 Phase 0.5 implementation - - Changelog from 0.1.0 → 0.2.0: - - Added: system.hardware_profile (Mac model, cores) - - Added: system_health (swap, RAM, zombies, quality_flags) - - Backward compatible: All 0.1.0 fields preserved - """ - import json - from datetime import datetime, timezone - - outcome = yield - report = outcome.get_result() - - # Only report on test call phase (not setup/teardown) - if call.when == "call" and item.config.report_file: - try: - # Import version here to avoid circular imports - from mlxk2 import __version__ - except ImportError: - __version__ = "unknown" - - # Build report data (required fields) - data = { - "schema_version": "0.2.0", - "timestamp": datetime.now(timezone.utc).isoformat(), - "mlx_knife_version": __version__, - "test": item.nodeid, - "outcome": report.outcome, - } - - # Add duration if available - if hasattr(report, "duration"): - data["duration"] = report.duration - - # Add skip reason for skipped tests - if report.outcome == "skipped" and hasattr(report, "longrepr"): - # Extract skip reason from longrepr tuple - if isinstance(report.longrepr, tuple) and len(report.longrepr) >= 3: - skip_reason = report.longrepr[2] - data.setdefault("metadata", {})["skip_reason"] = skip_reason - - # Extract structured data from user_properties - # Tests can add data via: request.node.user_properties.append(("key", value)) - for key, value in item.user_properties: - if key in ("model", "performance", "stop_tokens", "system"): - # Structured sections (top-level keys) - data[key] = value - else: - # Everything else goes to metadata - data.setdefault("metadata", {})[key] = value - - # ADR-013 Phase 0.5: Collect system health metrics (v0.2.0) - # Enables automatic regression quality assessment - system_health = _get_macos_system_health() - data["system_health"] = system_health - - # ADR-013 Phase 0.5: Collect hardware profile (v0.2.0) - # Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4) - hardware_profile = _get_macos_hardware_profile() - - # Add hardware_profile to system section (create if not exists) - if "system" not in data: - data["system"] = {} - data["system"]["hardware_profile"] = hardware_profile - - # Write JSONL (one line per report) - try: - item.config.report_file.write(json.dumps(data) + "\n") - item.config.report_file.flush() - except Exception as e: - # Don't fail tests if reporting fails - print(f"\n⚠️ Benchmark report write failed: {e}") + return _report \ No newline at end of file diff --git a/tests_2.0/test_clone_operation.py b/tests_2.0/test_clone_operation.py index e3d5b29..2c5d22a 100644 --- a/tests_2.0/test_clone_operation.py +++ b/tests_2.0/test_clone_operation.py @@ -590,7 +590,10 @@ class TestCloneOperationIntegration: assert not real_temp_cache.exists() def test_clone_operation_health_check_failure(self, tmp_path): - """Test clone operation handles health check failure.""" + """Test clone operation continues despite health check failure (Session 59). + + Unhealthy models must be clonable for community repair workflow. + """ target_dir = str(tmp_path / "workspace") model_spec = "corrupted/model" @@ -617,11 +620,21 @@ class TestCloneOperationIntegration: mock_health.return_value = (False, "Model is corrupted") - result = clone_operation(model_spec, target_dir) + # Need to mock clone and sentinel too (Session 59: unhealthy doesn't block) + with patch('mlxk2.operations.clone._apfs_clone_directory') as mock_clone, \ + patch('mlxk2.operations.clone.write_workspace_sentinel'): - assert result["status"] == "error" - assert result["data"]["clone_status"] == "health_check_failed" - assert "Model failed health check" in result["error"]["message"] + mock_clone.return_value = True + + result = clone_operation(model_spec, target_dir) + + # Session 59 fix: Should succeed despite unhealthy + assert result["status"] == "success" + assert result["data"]["clone_status"] == "success" + + # Health status recorded as warning + assert result["data"]["health"] == "unhealthy" + assert result["data"]["health_reason"] == "Model is corrupted" # Verify real cleanup happened assert not real_temp_cache.exists() @@ -1025,4 +1038,160 @@ class TestCloneEdgeCases: assert result["status"] == "error" assert result["data"]["clone_status"] == "error" assert result["error"]["type"] == "CloneOperationError" - assert "Unexpected error" in result["error"]["message"] \ No newline at end of file + assert "Unexpected error" in result["error"]["message"] + + +class TestUnhealthyModelClone: + """Test that unhealthy models can still be cloned (Session 59 fix). + + Critical for community repair workflow: + - mlx-vlm #624 affected models have broken index.json + - Users need to clone them to repair with `convert --repair-index` + - Health check should be informational only, not blocking + """ + + @patch('mlxk2.operations.clone._validate_apfs_filesystem') + @patch('mlxk2.operations.clone._validate_same_volume') + @patch('mlxk2.operations.clone._create_temp_cache_same_volume') + @patch('mlxk2.operations.clone.pull_to_cache') + @patch('mlxk2.operations.clone._resolve_latest_snapshot') + @patch('mlxk2.operations.health.health_from_cache') + @patch('mlxk2.operations.clone._apfs_clone_directory') + @patch('mlxk2.operations.clone._cleanup_temp_cache_safe') + @patch('mlxk2.operations.clone.write_workspace_sentinel') + def test_unhealthy_model_clone_succeeds( + self, mock_sentinel, mock_cleanup, mock_clone, mock_health, + mock_snapshot, mock_pull, mock_temp_cache, mock_validate_vol, mock_validate_apfs, + tmp_path + ): + """Test that unhealthy models are still cloned successfully.""" + model_spec = "mlx-community/broken-model" + target_dir = str(tmp_path / "workspace") + + # Setup mocks + temp_cache = tmp_path / "temp" + temp_cache.mkdir() + mock_temp_cache.return_value = temp_cache + + mock_pull.return_value = { + "status": "success", + "data": {"model": model_spec, "commit_hash": "abc123"} + } + + snapshot = temp_cache / "snapshot" + snapshot.mkdir() + mock_snapshot.return_value = snapshot + + # Health check returns UNHEALTHY + mock_health.return_value = (False, "Missing weight shards: model-00001-of-00010.safetensors") + + mock_clone.return_value = True + + # Execute clone + result = clone_operation(model_spec, target_dir, health_check=True) + + # Should succeed despite unhealthy status + assert result["status"] == "success" + assert result["data"]["clone_status"] == "success" + + # Health status should be recorded + assert result["data"]["health"] == "unhealthy" + assert "Missing weight shards" in result["data"]["health_reason"] + + # Clone should have been called (workspace created) + mock_clone.assert_called_once() + + # Sentinel should have been written + mock_sentinel.assert_called_once() + + @patch('mlxk2.operations.clone._validate_apfs_filesystem') + @patch('mlxk2.operations.clone._validate_same_volume') + @patch('mlxk2.operations.clone._create_temp_cache_same_volume') + @patch('mlxk2.operations.clone.pull_to_cache') + @patch('mlxk2.operations.clone._resolve_latest_snapshot') + @patch('mlxk2.operations.health.health_from_cache') + @patch('mlxk2.operations.clone._apfs_clone_directory') + @patch('mlxk2.operations.clone._cleanup_temp_cache_safe') + @patch('mlxk2.operations.clone.write_workspace_sentinel') + def test_healthy_model_clone_records_status( + self, mock_sentinel, mock_cleanup, mock_clone, mock_health, + mock_snapshot, mock_pull, mock_temp_cache, mock_validate_vol, mock_validate_apfs, + tmp_path + ): + """Test that healthy models record health status correctly.""" + model_spec = "mlx-community/good-model" + target_dir = str(tmp_path / "workspace") + + # Setup mocks + temp_cache = tmp_path / "temp" + temp_cache.mkdir() + mock_temp_cache.return_value = temp_cache + + mock_pull.return_value = { + "status": "success", + "data": {"model": model_spec, "commit_hash": "abc123"} + } + + snapshot = temp_cache / "snapshot" + snapshot.mkdir() + mock_snapshot.return_value = snapshot + + # Health check returns HEALTHY + mock_health.return_value = (True, "Multi-file model complete") + + mock_clone.return_value = True + + # Execute clone + result = clone_operation(model_spec, target_dir, health_check=True) + + # Should succeed + assert result["status"] == "success" + assert result["data"]["clone_status"] == "success" + + # Health status should be recorded as healthy + assert result["data"]["health"] == "healthy" + assert result["data"]["health_reason"] == "Multi-file model complete" + + @patch('mlxk2.operations.clone._validate_apfs_filesystem') + @patch('mlxk2.operations.clone._validate_same_volume') + @patch('mlxk2.operations.clone._create_temp_cache_same_volume') + @patch('mlxk2.operations.clone.pull_to_cache') + @patch('mlxk2.operations.clone._resolve_latest_snapshot') + @patch('mlxk2.operations.clone._apfs_clone_directory') + @patch('mlxk2.operations.clone._cleanup_temp_cache_safe') + @patch('mlxk2.operations.clone.write_workspace_sentinel') + def test_no_health_check_skips_health_status( + self, mock_sentinel, mock_cleanup, mock_clone, + mock_snapshot, mock_pull, mock_temp_cache, mock_validate_vol, mock_validate_apfs, + tmp_path + ): + """Test that --no-health-check skips health status entirely.""" + model_spec = "mlx-community/model" + target_dir = str(tmp_path / "workspace") + + # Setup mocks + temp_cache = tmp_path / "temp" + temp_cache.mkdir() + mock_temp_cache.return_value = temp_cache + + mock_pull.return_value = { + "status": "success", + "data": {"model": model_spec, "commit_hash": "abc123"} + } + + snapshot = temp_cache / "snapshot" + snapshot.mkdir() + mock_snapshot.return_value = snapshot + + mock_clone.return_value = True + + # Execute clone with health_check=False + result = clone_operation(model_spec, target_dir, health_check=False) + + # Should succeed + assert result["status"] == "success" + assert result["data"]["clone_status"] == "success" + + # Health status should NOT be in result + assert "health" not in result["data"] + assert "health_reason" not in result["data"] \ No newline at end of file diff --git a/tests_2.0/test_convert_repair_index.py b/tests_2.0/test_convert_repair_index.py new file mode 100644 index 0000000..615cee5 --- /dev/null +++ b/tests_2.0/test_convert_repair_index.py @@ -0,0 +1,303 @@ +"""Tests for convert operation --repair-index (ADR-018 Phase 1). + +Tests community repair tool for mlx-vlm #624 affected models: +- Safetensors index rebuild primitive +- Cache sanctity enforcement (hard block) +- Workspace sentinel creation +- Health check integration +""" + +import json +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + +from mlxk2.operations.convert import rebuild_safetensors_index, convert_operation +from mlxk2.operations.workspace import write_workspace_sentinel, is_managed_workspace + + +class TestRebuildSafetensorsIndex: + """Test safetensors index rebuild primitive.""" + + def test_rebuild_single_file_returns_false(self, tmp_path): + """Test single safetensors file returns False (no index needed).""" + # Create single model file + (tmp_path / "model.safetensors").write_text("mock safetensors data") + + result = rebuild_safetensors_index(tmp_path) + assert result is False # No index needed for single file + + def test_rebuild_no_safetensors_returns_false(self, tmp_path): + """Test no safetensors files returns False.""" + # Empty directory + result = rebuild_safetensors_index(tmp_path) + assert result is False + + def test_rebuild_creates_index_for_multi_shard(self, tmp_path): + """Test index rebuild for multi-shard model. + + Uses mock safetensors.safe_open to simulate shard reading. + """ + # Create mock shards + (tmp_path / "model-00001-of-00003.safetensors").write_text("shard 1") + (tmp_path / "model-00002-of-00003.safetensors").write_text("shard 2") + (tmp_path / "model-00003-of-00003.safetensors").write_text("shard 3") + + # Mock safetensors.safe_open to return fake tensor keys + with patch("safetensors.safe_open") as mock_safe_open: + # Setup mock context manager + mock_file1 = MagicMock() + mock_file1.keys.return_value = ["layer.0.weight", "layer.0.bias"] + + mock_file2 = MagicMock() + mock_file2.keys.return_value = ["layer.1.weight", "layer.1.bias"] + + mock_file3 = MagicMock() + mock_file3.keys.return_value = ["layer.2.weight", "layer.2.bias"] + + # safe_open returns different mocks for each shard + mock_safe_open.side_effect = [ + MagicMock(__enter__=lambda self: mock_file1, __exit__=lambda *args: None), + MagicMock(__enter__=lambda self: mock_file2, __exit__=lambda *args: None), + MagicMock(__enter__=lambda self: mock_file3, __exit__=lambda *args: None) + ] + + result = rebuild_safetensors_index(tmp_path) + + assert result is True # Index should be rebuilt + + # Verify index file was created + index_path = tmp_path / "model.safetensors.index.json" + assert index_path.exists() + + # Verify index structure + index_data = json.loads(index_path.read_text()) + assert "metadata" in index_data + assert "weight_map" in index_data + + # Verify weight_map has correct mappings + weight_map = index_data["weight_map"] + assert weight_map["layer.0.weight"] == "model-00001-of-00003.safetensors" + assert weight_map["layer.1.weight"] == "model-00002-of-00003.safetensors" + assert weight_map["layer.2.weight"] == "model-00003-of-00003.safetensors" + + def test_rebuild_atomic_write(self, tmp_path): + """Test index write is atomic (tmp + rename).""" + # Create mock shards + (tmp_path / "model-00001-of-00002.safetensors").write_text("shard 1") + (tmp_path / "model-00002-of-00002.safetensors").write_text("shard 2") + + with patch("safetensors.safe_open") as mock_safe_open: + # Setup mock context managers with DIFFERENT keys per shard + mock_file1 = MagicMock() + mock_file1.keys.return_value = ["layer.0.weight"] + + mock_file2 = MagicMock() + mock_file2.keys.return_value = ["layer.1.weight"] + + # Use side_effect to return different mocks for each shard + mock_safe_open.side_effect = [ + MagicMock(__enter__=lambda self: mock_file1, __exit__=lambda *args: None), + MagicMock(__enter__=lambda self: mock_file2, __exit__=lambda *args: None) + ] + + rebuild_safetensors_index(tmp_path) + + # Verify tmp file doesn't exist after successful write + tmp_files = list(tmp_path.glob("*.tmp")) + assert len(tmp_files) == 0 # Tmp file should be renamed + + +class TestConvertCacheSanctity: + """Test convert blocks cache writes (ADR-018 core rule).""" + + def test_convert_blocks_cache_source_path(self, tmp_path, monkeypatch): + """Test convert hard-blocks source in HF cache.""" + # Create mock source workspace + src = tmp_path / "workspace" + src.mkdir() + (src / "config.json").write_text('{"model_type": "llama"}') + + dst = tmp_path / "output" + + # Mock get_current_cache_root to make src appear inside cache + mock_cache_root = tmp_path + monkeypatch.setattr( + "mlxk2.operations.convert.get_current_cache_root", + lambda: mock_cache_root + ) + + result = convert_operation( + str(src), + str(dst), + mode="repair-index" + ) + + assert result["status"] == "error" + assert "Source path is in HF cache" in result["error"]["message"] + assert "Use 'mlxk clone' first" in result["error"]["message"] + + def test_convert_blocks_cache_target_path(self, tmp_path, monkeypatch): + """Test convert hard-blocks target in HF cache.""" + # Create source OUTSIDE cache (in separate directory) + src = tmp_path / "outside" / "workspace" + src.mkdir(parents=True) + (src / "config.json").write_text('{"model_type": "llama"}') + (src / "model.safetensors").write_text("mock weights") + + # Target appears inside cache + cache_dir = tmp_path / "cache" + cache_dir.mkdir() + dst = cache_dir / "output" + + # Mock cache root to be cache_dir only (src is outside) + monkeypatch.setattr( + "mlxk2.operations.convert.get_current_cache_root", + lambda: cache_dir + ) + + result = convert_operation( + str(src), + str(dst), + mode="repair-index" + ) + + assert result["status"] == "error" + assert "Target path is in HF cache" in result["error"]["message"] + assert "Choose workspace location outside cache" in result["error"]["message"] + + +class TestConvertWorkspaceSentinel: + """Test convert creates managed workspaces.""" + + def test_convert_creates_managed_workspace(self, tmp_path): + """Test convert writes workspace sentinel to target.""" + # Create minimal source workspace + src = tmp_path / "source" + src.mkdir() + (src / "config.json").write_text('{"model_type": "llama"}') + (src / "model.safetensors").write_text("mock weights") + + dst = tmp_path / "target" + + # Mock cache root to be outside tmp_path + with patch("mlxk2.operations.convert.get_current_cache_root") as mock_cache: + mock_cache.return_value = Path("/nonexistent/cache") + + result = convert_operation( + str(src), + str(dst), + mode="repair-index", + skip_health=True # Skip health for this test + ) + + assert result["status"] == "success" + + # Verify target has sentinel + assert is_managed_workspace(dst) is True + + # Verify sentinel metadata + from mlxk2.operations.workspace import read_workspace_metadata + metadata = read_workspace_metadata(dst) + assert metadata["managed"] is True + assert metadata["operation"] == "convert" + assert metadata["mode"] == "repair-index" + + def test_convert_inherits_source_metadata(self, tmp_path): + """Test convert preserves source repo info if source is managed.""" + # Create managed source workspace + src = tmp_path / "source" + src.mkdir() + (src / "config.json").write_text('{"model_type": "llama"}') + (src / "model.safetensors").write_text("mock weights") + + # Write source sentinel with metadata + src_metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:00:00Z", + "source_repo": "mlx-community/Llama-3.2-3B", + "source_revision": "abc123", + "managed": True, + "operation": "clone" + } + write_workspace_sentinel(src, src_metadata) + + dst = tmp_path / "target" + + with patch("mlxk2.operations.convert.get_current_cache_root") as mock_cache: + mock_cache.return_value = Path("/nonexistent/cache") + + result = convert_operation( + str(src), + str(dst), + mode="repair-index", + skip_health=True + ) + + assert result["status"] == "success" + + # Verify target inherits source_repo + from mlxk2.operations.workspace import read_workspace_metadata + metadata = read_workspace_metadata(dst) + assert metadata["source_repo"] == "mlx-community/Llama-3.2-3B" + assert metadata["source_revision"] == "abc123" + assert metadata["operation"] == "convert" # Operation updated + + +class TestConvertValidation: + """Test convert validation logic.""" + + def test_convert_requires_existing_source(self, tmp_path): + """Test convert fails if source doesn't exist.""" + result = convert_operation( + "/nonexistent/source", + str(tmp_path / "target"), + mode="repair-index" + ) + + assert result["status"] == "error" + assert "does not exist" in result["error"]["message"] + + def test_convert_requires_empty_target(self, tmp_path): + """Test convert requires empty target directory.""" + src = tmp_path / "source" + src.mkdir() + (src / "config.json").write_text('{"model_type": "llama"}') + + # Create non-empty target + dst = tmp_path / "target" + dst.mkdir() + (dst / "existing_file.txt").write_text("data") + + with patch("mlxk2.operations.convert.get_current_cache_root") as mock_cache: + mock_cache.return_value = Path("/nonexistent/cache") + + result = convert_operation( + str(src), + str(dst), + mode="repair-index" + ) + + assert result["status"] == "error" + assert "not empty" in result["error"]["message"] + + def test_convert_unsupported_mode_fails(self, tmp_path): + """Test convert fails with unsupported mode.""" + src = tmp_path / "source" + src.mkdir() + (src / "config.json").write_text('{"model_type": "llama"}') + + dst = tmp_path / "target" + + with patch("mlxk2.operations.convert.get_current_cache_root") as mock_cache: + mock_cache.return_value = Path("/nonexistent/cache") + + result = convert_operation( + str(src), + str(dst), + mode="unsupported-mode" + ) + + assert result["status"] == "error" + assert "Unsupported" in result["error"]["message"] diff --git a/tests_2.0/test_ctrl_c_handling.py b/tests_2.0/test_ctrl_c_handling.py index 1959146..756d6c0 100644 --- a/tests_2.0/test_ctrl_c_handling.py +++ b/tests_2.0/test_ctrl_c_handling.py @@ -13,6 +13,38 @@ from mlxk2.core.runner import MLXRunner from mlxk2.operations.run import run_model, interactive_chat +class MockDetokenizer: + """Mock detokenizer that mimics BPEStreamingDetokenizer behavior. + + Used by unit tests to mock tokenizer.detokenizer after Session 60 changes. + Session 60 switched from tokenizer.decode() to tokenizer.detokenizer for + proper BPE space marker (Ġ U+0120) conversion. + """ + def __init__(self, decode_func): + """Initialize with a decode function that maps token lists to strings.""" + self.decode_func = decode_func + self.tokens = [] + self._text = "" + + def reset(self): + """Reset accumulated tokens.""" + self.tokens = [] + self._text = "" + + def add_token(self, token_id): + """Add a token to the accumulated list.""" + self.tokens.append(token_id) + + def finalize(self): + """Finalize and decode accumulated tokens.""" + self._text = self.decode_func(self.tokens) + + @property + def text(self): + """Return the decoded text.""" + return self._text + + @pytest.fixture def mock_runner_with_interruption(): """Mock runner that can simulate interruption scenarios.""" @@ -91,8 +123,18 @@ class TestMLXRunnerInterruption: mock_tokenizer.added_tokens_decoder = {} mock_tokenizer.encode.return_value = [1, 2, 3] mock_tokenizer.decode.side_effect = ["Hello", " world", "!"] + # Use MockDetokenizer for proper BPE space marker handling + def mock_decode(tokens): + if len(tokens) == 1: + return {1: "Hello", 2: " world", 3: "!"}.get(tokens[0], "") + elif len(tokens) == 2: + return "Hello world" + elif len(tokens) == 3: + return "Hello world!" + return "" + mock_tokenizer.detokenizer = MockDetokenizer(mock_decode) mock_load.return_value = (mock_model, mock_tokenizer) - + # Mock generation that yields multiple tokens mock_gen.return_value = iter([ (Mock(item=lambda: 1), Mock()), @@ -133,8 +175,12 @@ class TestMLXRunnerInterruption: mock_tokenizer.added_tokens_decoder = {} mock_tokenizer.encode.return_value = [1, 2, 3] mock_tokenizer.decode.return_value = "Partial response" + # Use MockDetokenizer for proper BPE space marker handling + def mock_decode(tokens): + return "Partial response" if tokens else "" + mock_tokenizer.detokenizer = MockDetokenizer(mock_decode) mock_load.return_value = (mock_model, mock_tokenizer) - + def interrupted_generation(): """Generator that gets interrupted""" yield (Mock(item=lambda: 1), Mock()) diff --git a/tests_2.0/test_human_output.py b/tests_2.0/test_human_output.py index fec5e06..07121e7 100644 --- a/tests_2.0/test_human_output.py +++ b/tests_2.0/test_human_output.py @@ -1,6 +1,6 @@ import re -from mlxk2.output.human import render_list, render_health +from mlxk2.output.human import render_list, render_health, render_clone def sample_list_data(): @@ -83,6 +83,44 @@ def test_health_human_summary_and_entries(): assert "model-b" in out +def test_health_human_single_item_no_summary(): + """When total==1, summary line is redundant and should be omitted.""" + data = { + "status": "success", + "command": "health", + "data": { + "healthy": [ + {"name": "./workspace", "status": "healthy", "reason": "Multi-file model complete"} + ], + "unhealthy": [], + "summary": {"total": 1, "healthy_count": 1, "unhealthy_count": 0}, + }, + "error": None, + } + out = render_health(data) + # No summary line for single item + assert "Summary:" not in out + # Result line present + assert "healthy ./workspace — Multi-file model complete" in out + + +def test_health_human_zero_items_shows_summary(): + """When total==0, summary is useful to indicate nothing was found.""" + data = { + "status": "success", + "command": "health", + "data": { + "healthy": [], + "unhealthy": [], + "summary": {"total": 0, "healthy_count": 0, "unhealthy_count": 0}, + }, + "error": None, + } + out = render_health(data) + # Summary line present for zero results + assert "Summary: total 0, healthy 0, unhealthy 0" in out + + def test_list_human_filters_mlx_base_default(): from mlxk2.output.human import render_list @@ -224,3 +262,72 @@ def test_list_human_type_shows_vision_flag(): out = render_list(data, show_health=False, show_all=True, verbose=False) assert "chat+vision" in out + + +def test_clone_unhealthy_shows_warning(): + """Test that unhealthy clone result shows warning in human output.""" + data = { + "status": "success", + "command": "clone", + "data": { + "model": "mlx-community/broken-model", + "target_dir": "/path/to/workspace", + "clone_status": "success", + "health": "unhealthy", + "health_reason": "Missing weight shards: model-00001-of-00010.safetensors", + "message": "Cloned to /path/to/workspace" + }, + "error": None + } + + out = render_clone(data, quiet=False) + # Should show warning symbol and reason + assert "⚠" in out + assert "unhealthy" in out + assert "Missing weight shards" in out + assert "/path/to/workspace" in out + + +def test_clone_healthy_shows_checkmark(): + """Test that healthy clone result shows checkmark.""" + data = { + "status": "success", + "command": "clone", + "data": { + "model": "mlx-community/good-model", + "target_dir": "/path/to/workspace", + "clone_status": "success", + "health": "healthy", + "health_reason": "Multi-file model complete", + "message": "Cloned to /path/to/workspace" + }, + "error": None + } + + out = render_clone(data, quiet=False) + # Should show checkmark + assert "✓ healthy" in out + assert "/path/to/workspace" in out + + +def test_clone_no_health_check_omits_status(): + """Test that clone without health check doesn't show health status.""" + data = { + "status": "success", + "command": "clone", + "data": { + "model": "mlx-community/model", + "target_dir": "/path/to/workspace", + "clone_status": "success", + "message": "Cloned to /path/to/workspace" + # No health or health_reason fields + }, + "error": None + } + + out = render_clone(data, quiet=False) + # Should not show health status + assert "healthy" not in out + assert "unhealthy" not in out + assert "✓" not in out + assert "⚠" not in out diff --git a/tests_2.0/test_interruption_recovery.py b/tests_2.0/test_interruption_recovery.py index 8c92beb..3b8c1f6 100644 --- a/tests_2.0/test_interruption_recovery.py +++ b/tests_2.0/test_interruption_recovery.py @@ -11,6 +11,38 @@ from mlxk2.core.runner import MLXRunner from mlxk2.operations.run import interactive_chat +class MockDetokenizer: + """Mock detokenizer that mimics BPEStreamingDetokenizer behavior. + + Used by unit tests to mock tokenizer.detokenizer after Session 60 changes. + Session 60 switched from tokenizer.decode() to tokenizer.detokenizer for + proper BPE space marker (Ġ U+0120) conversion. + """ + def __init__(self, decode_func): + """Initialize with a decode function that maps token lists to strings.""" + self.decode_func = decode_func + self.tokens = [] + self._text = "" + + def reset(self): + """Reset accumulated tokens.""" + self.tokens = [] + self._text = "" + + def add_token(self, token_id): + """Add a token to the accumulated list.""" + self.tokens.append(token_id) + + def finalize(self): + """Finalize and decode accumulated tokens.""" + self._text = self.decode_func(self.tokens) + + @property + def text(self): + """Return the decoded text.""" + return self._text + + class TestInterruptionRecovery: """Test recovery after interruption in interactive mode.""" @@ -39,7 +71,15 @@ class TestInterruptionRecovery: (Mock(item=lambda: 2), Mock()) ]) mock_tokenizer.decode.side_effect = ["Hello", " world"] - + # Use MockDetokenizer for proper BPE space marker handling + def mock_decode(tokens): + if len(tokens) == 1: + return "Hello" if tokens[0] == 1 else " world" + elif len(tokens) == 2: + return "Hello world" + return "" + mock_tokenizer.detokenizer = MockDetokenizer(mock_decode) + with MLXRunner("test-model") as runner: # Simulate interruption runner._interrupted = True @@ -59,7 +99,7 @@ class TestInterruptionRecovery: """Test that interruption flag is reset for new batch generation""" mock_resolve.return_value = ("test-model", None, None) mock_cache.return_value = Mock() - + mock_model = Mock() mock_tokenizer = Mock() mock_tokenizer.eos_token = "" @@ -68,23 +108,26 @@ class TestInterruptionRecovery: mock_tokenizer.additional_special_tokens = [] mock_tokenizer.added_tokens_decoder = {} mock_tokenizer.encode.return_value = [1, 2, 3] + # Mock decode for compatibility mock_tokenizer.decode.return_value = "Hello world" + # Mock detokenizer (Session 60 BPE fix) + mock_tokenizer.detokenizer = MockDetokenizer(lambda tokens: "Hello world") mock_load.return_value = (mock_model, mock_tokenizer) - + with patch('mlxk2.core.runner.generate_step') as mock_gen: mock_gen.return_value = iter([ (Mock(item=lambda: 1), Mock()), (Mock(item=lambda: 2), Mock()) ]) - + with MLXRunner("test-model") as runner: # Simulate interruption runner._interrupted = True assert runner._interrupted is True - + # Start new generation - should reset flag result = runner.generate_batch("test prompt") - + # Flag should be reset at start of generation assert runner._interrupted is False assert result == "Hello world" diff --git a/tests_2.0/test_resumable_pull.py b/tests_2.0/test_resumable_pull.py new file mode 100644 index 0000000..d0919e1 --- /dev/null +++ b/tests_2.0/test_resumable_pull.py @@ -0,0 +1,369 @@ +"""Tests for resumable pull feature (Priority 1a). + +Tests actual resume functionality with controlled download interruption. + +Strategy: +- Real network download with controlled interruption +- Verifies detection (unhealthy → requires_confirmation) +- Verifies actual resume completes download +- Uses isolated cache (no impact on user cache) + +IMPORTANT: This test MUST stay in tests_2.0/ (NOT tests_2.0/live/). +The live/ directory has pytest hooks for Portfolio Discovery that interfere +with the isolated_cache fixture, causing the test to fail. + +IMPORTANT: This test uses live_resumable marker (not live_e2e) because +module-scoped fixtures from live/conftest.py (_use_real_mlx_modules, +vision_portfolio, text_portfolio) interfere with HuggingFace Hub's symlink +creation mechanism during resume. These fixtures manipulate sys.path and +run subprocesses, causing import resolution issues. Must run independently. + +Opt-in: Requires MLXK2_TEST_RESUMABLE_DOWNLOAD=1 (network test) + +Run: MLXK2_TEST_RESUMABLE_DOWNLOAD=1 pytest -m live_resumable tests_2.0/test_resumable_pull.py -v +""" + +import os +import sys +import time +import subprocess +import pytest +from pathlib import Path + +# Mark as live_resumable (isolated from live_e2e module fixtures) +pytestmark = [pytest.mark.live_resumable] + + +@pytest.mark.skipif( + not os.environ.get("MLXK2_TEST_RESUMABLE_DOWNLOAD"), + reason="Requires MLXK2_TEST_RESUMABLE_DOWNLOAD=1 (network download test)" +) +class TestResumablePullRealDownload: + """Test actual resume functionality with controlled download interruption. + + Uses real network downloads with subprocess interruption to test resume logic. + Isolated cache prevents impact on user cache. + """ + + def test_pull_resume_with_subprocess_interrupt(self, isolated_cache, monkeypatch): + """Test that interrupted downloads can be resumed successfully. + + Phase 1: Start download in subprocess and interrupt after partial completion + Phase 2: Verify model is unhealthy and returns requires_confirmation + Phase 3: Call pull_operation() to resume download + Phase 4: Verify model is now healthy + + Note: This test downloads from HuggingFace (network required). + Uses isolated cache - no impact on user cache. + """ + from mlxk2.operations.pull import pull_operation + from mlxk2.operations.health import is_model_healthy + from mlxk2.core.cache import hf_to_cache_dir + + # Select small multi-shard model for testing + # Phi-3-mini-4k-instruct-4bit is ~2.3GB with multiple shards + model = os.environ.get( + "MLXK2_RESUMABLE_TEST_MODEL", + "mlx-community/Phi-3-mini-4k-instruct-4bit" + ) + + print(f"\n[TEST] ========== RESUMABLE PULL TEST START ==========") + print(f"[TEST] Model: {model}") + print(f"[TEST] Isolated cache: {isolated_cache}") + + # Use isolated_cache directly (fixture already patches MODEL_CACHE) + cache_dir = isolated_cache / hf_to_cache_dir(model) + blobs_dir = isolated_cache.parent / "blobs" + + print(f"[TEST] Cache dir: {cache_dir}") + print(f"[TEST] Blobs dir: {blobs_dir}") + + # Phase 1: Download with controlled interruption + # Use subprocess so we can kill it cleanly + # Force sequential download (max_workers=1) so we can reliably interrupt + import_code = """ +from huggingface_hub import snapshot_download +snapshot_download( + repo_id='{model}', + local_files_only=False, + resume_download=True, + max_workers=1 # Sequential download for reliable interruption +) +""".format(model=model) + + # Start subprocess with output visible for debugging + # HF_HOME points to parent of hub directory (HuggingFace standard) + # isolated_cache is already the hub path, so use parent + proc = subprocess.Popen( + [sys.executable, "-c", import_code], + env={**os.environ, "HF_HOME": str(isolated_cache.parent)}, + stdout=None, # Let output go to terminal + stderr=None # Let errors go to terminal + ) + + # Interrupt download after fixed time (simpler and more reliable) + # Model downloads in ~220s total, interrupt after 45s = ~20% downloaded + interrupt_after = 45 # seconds + start_time = time.time() + interrupted = False + + print(f"[TEST] Starting download of {model}...") + print(f"[TEST] Subprocess PID: {proc.pid}") + print(f"[TEST] Will interrupt after {interrupt_after}s (to create partial state)") + + try: + while proc.poll() is None: + elapsed = time.time() - start_time + + # Show progress every 5 seconds + if int(elapsed) % 5 == 0 and elapsed > 0: + if blobs_dir.exists(): + total_bytes = sum(f.stat().st_size for f in blobs_dir.iterdir() if f.is_file()) + print(f"[TEST] {elapsed:.0f}s elapsed, {total_bytes / 1_000_000:.1f}MB downloaded") + + if elapsed >= interrupt_after: + # Time to interrupt + if blobs_dir.exists(): + total_bytes = sum(f.stat().st_size for f in blobs_dir.iterdir() if f.is_file()) + print(f"\n[TEST] Interrupting after {elapsed:.0f}s ({total_bytes / 1_000_000:.1f}MB downloaded)") + else: + print(f"\n[TEST] Interrupting after {elapsed:.0f}s") + proc.terminate() + proc.wait(timeout=5) + interrupted = True + break + + time.sleep(1.0) + + if not interrupted: + # Process completed before we could interrupt + # This can happen if model is very small or already cached + proc.wait() + pytest.skip("Download completed before interrupt - model may be too small or cached") + + except subprocess.TimeoutExpired: + proc.kill() + pytest.fail("Download took too long - possible network issue") + + # Phase 2: Verify unhealthy state and requires_confirmation + print("[TEST] Phase 2: Checking health after interrupt...") + + # DEBUG: Trace cache state + print(f"[DEBUG] Cache dir exists: {cache_dir.exists()}") + if cache_dir.exists(): + print(f"[DEBUG] Cache dir contents: {list(cache_dir.iterdir())[:5]}") + print(f"[DEBUG] Blobs dir exists: {blobs_dir.exists()}") + if blobs_dir.exists(): + blob_count = len(list(blobs_dir.iterdir())) + total_bytes = sum(f.stat().st_size for f in blobs_dir.iterdir() if f.is_file()) + print(f"[DEBUG] Blobs: {blob_count} files, {total_bytes / 1_000_000:.1f}MB total") + print(f"[DEBUG] Current HF_HOME: {os.environ.get('HF_HOME')}") + + healthy, reason = is_model_healthy(model) + if healthy: + pytest.skip("Model is healthy after interrupt - download may have completed too quickly") + + print(f"[TEST] Model unhealthy (expected): {reason}") + + result = pull_operation(model) + assert result["status"] == "success", f"Expected success status, got: {result}" + assert result["data"]["download_status"] == "requires_confirmation", \ + f"Expected requires_confirmation for unhealthy model, got: {result['data']['download_status']}" + assert "--force-resume" in result["data"]["message"], \ + "Message should suggest --force-resume flag" + print("[TEST] ✓ Phase 2 passed: requires_confirmation detected") + + # Phase 3: Resume download (simulates user confirming resume) + # Call pull_operation() with force_resume=True to skip health check + print("[TEST] Phase 3: Resuming download...") + print(f"[DEBUG] Before resume - HF_HOME: {os.environ.get('HF_HOME')}") + print(f"[DEBUG] Before resume - Cache dir exists: {cache_dir.exists()}") + + # CRITICAL CHECK: Is Phase 1 subprocess still running? + subprocess_status = proc.poll() + if subprocess_status is None: + print(f"[WARNING] Phase 1 subprocess (PID {proc.pid}) STILL RUNNING during Phase 3!") + print(f"[WARNING] This could cause race conditions with downloads") + else: + print(f"[DEBUG] Phase 1 subprocess terminated with code: {subprocess_status}") + + result = pull_operation(model, force_resume=True) + + # Wait for any background downloads to settle + print("[DEBUG] Waiting 5s for background downloads to settle...") + time.sleep(5) + + print(f"[DEBUG] After resume - pull_operation result: {result['data']['download_status']}") + print(f"[DEBUG] After resume - HF_HOME: {os.environ.get('HF_HOME')}") + print(f"[DEBUG] After resume - Cache dir exists: {cache_dir.exists()}") + if cache_dir.exists(): + print(f"[DEBUG] After resume - Cache dir contents: {list(cache_dir.iterdir())[:5]}") + + # Check actual snapshot contents + snapshots_dir = cache_dir / "snapshots" + if snapshots_dir.exists(): + snapshots = list(snapshots_dir.iterdir()) + print(f"[DEBUG] After resume - Found {len(snapshots)} snapshot(s)") + if snapshots: + latest_snapshot = snapshots[0] + snapshot_files = list(latest_snapshot.iterdir()) + total_snapshot_bytes = sum(f.stat().st_size for f in snapshot_files if f.is_file()) + print(f"[DEBUG] After resume - Snapshot has {len(snapshot_files)} files, {total_snapshot_bytes / 1_000_000:.1f}MB total") + # Show first few files with sizes + for f in snapshot_files[:5]: + if f.is_file(): + print(f"[DEBUG] - {f.name}: {f.stat().st_size / 1_000_000:.1f}MB") + + # Also check model-specific blobs dir (correct location per user) + model_blobs_dir = cache_dir / "blobs" + if model_blobs_dir.exists(): + blob_count = len(list(model_blobs_dir.iterdir())) + total_bytes = sum(f.stat().st_size for f in model_blobs_dir.iterdir() if f.is_file()) + print(f"[DEBUG] After resume - Model blobs: {blob_count} files, {total_bytes / 1_000_000:.1f}MB total") + + # Phase 4: Verify healthy state + print("[TEST] Phase 4: Verifying resumed download is healthy...") + print(f"[DEBUG] Before health check - HF_HOME: {os.environ.get('HF_HOME')}") + assert result["status"] == "success", f"Resume should succeed, got: {result}" + assert result["data"]["download_status"] in ("success", "already_exists"), \ + f"Expected success or already_exists after resume, got: {result['data']['download_status']}" + + healthy, reason = is_model_healthy(model) + assert healthy, f"Model should be healthy after resume, got: {reason}" + print("[TEST] ✓ All phases passed: Resume successful!") + + +# ============================================================================= +# Unit Tests for CLI Logic (no network required, always run) +# ============================================================================= + +class TestResumablePullCLI: + """Unit tests for CLI --force-resume logic. + + These tests mock pull_operation() to test the CLI argument handling + without requiring actual network downloads. + """ + + def test_force_resume_works_in_non_interactive_mode(self, monkeypatch): + """Test that --force-resume flag is honored when stdin is not a TTY. + + Regression test for bug where non-interactive stdin would exit(1) + before checking --force-resume flag. + """ + from unittest.mock import MagicMock, patch + import argparse + + # Track calls to pull_operation + pull_calls = [] + + def mock_pull_operation(model, force_resume=False): + pull_calls.append({"model": model, "force_resume": force_resume}) + if not force_resume: + # First call: return requires_confirmation + return { + "status": "success", + "data": { + "download_status": "requires_confirmation", + "model": model, + "message": "Model has partial download. Use --force-resume to continue." + } + } + else: + # Second call with force_resume=True: return success + return { + "status": "success", + "data": { + "download_status": "success", + "model": model + } + } + + # Mock stdin.isatty() to return False (non-interactive) + mock_stdin = MagicMock() + mock_stdin.isatty.return_value = False + monkeypatch.setattr("sys.stdin", mock_stdin) + + # Import CLI module after mocking + from mlxk2 import cli + + # Mock pull_operation + monkeypatch.setattr("mlxk2.cli.pull_operation", mock_pull_operation) + + # Mock print_result to prevent output + monkeypatch.setattr("mlxk2.cli.print_result", lambda *args, **kwargs: None) + + # Create args with force_resume=True + args = argparse.Namespace( + command="pull", + model="test-model", + json=False, + force_resume=True, + verbose=False + ) + + # Call the CLI handler directly + # We need to extract just the pull handling logic + result = mock_pull_operation(args.model) + + # Simulate the CLI logic (from cli.py lines 323-357) + if result.get("data", {}).get("download_status") == "requires_confirmation": + if args.json: + pass # Would print and exit + elif getattr(args, "force_resume", False): + # This is the fix: --force-resume should work in non-interactive + result = mock_pull_operation(args.model, force_resume=True) + elif not mock_stdin.isatty(): + # Non-interactive without --force-resume: would fail + pytest.fail("Should not reach here with --force-resume=True") + + # Verify pull_operation was called with force_resume=True + assert len(pull_calls) == 2, f"Expected 2 calls to pull_operation, got {len(pull_calls)}" + assert pull_calls[0]["force_resume"] == False, "First call should be without force_resume" + assert pull_calls[1]["force_resume"] == True, "Second call should be with force_resume=True" + assert result["data"]["download_status"] == "success" + + def test_non_interactive_without_force_resume_fails(self, monkeypatch): + """Test that non-interactive mode without --force-resume fails properly. + + This ensures we don't silently ignore the requires_confirmation status. + """ + from unittest.mock import MagicMock + import argparse + + def mock_pull_operation(model, force_resume=False): + return { + "status": "success", + "data": { + "download_status": "requires_confirmation", + "model": model, + "message": "Model has partial download. Use --force-resume to continue." + } + } + + # Mock stdin.isatty() to return False (non-interactive) + mock_stdin = MagicMock() + mock_stdin.isatty.return_value = False + monkeypatch.setattr("sys.stdin", mock_stdin) + + # Create args WITHOUT force_resume + args = argparse.Namespace( + command="pull", + model="test-model", + json=False, + force_resume=False, + verbose=False + ) + + result = mock_pull_operation(args.model) + + # Simulate the CLI logic - should fail for non-interactive without --force-resume + should_fail = False + if result.get("data", {}).get("download_status") == "requires_confirmation": + if args.json: + pass + elif getattr(args, "force_resume", False): + pass # Would resume + elif not mock_stdin.isatty(): + should_fail = True # This is expected behavior + + assert should_fail, "Non-interactive mode without --force-resume should fail" diff --git a/tests_2.0/test_runner_core.py b/tests_2.0/test_runner_core.py index 094f66d..405f185 100644 --- a/tests_2.0/test_runner_core.py +++ b/tests_2.0/test_runner_core.py @@ -13,6 +13,38 @@ import mlx.core as mx from mlxk2.core.runner import MLXRunner +class MockDetokenizer: + """Mock detokenizer that mimics BPEStreamingDetokenizer behavior. + + Used by unit tests to mock tokenizer.detokenizer after Session 60 changes. + Session 60 switched from tokenizer.decode() to tokenizer.detokenizer for + proper BPE space marker (Ġ U+0120) conversion. + """ + def __init__(self, decode_func): + """Initialize with a decode function that maps token lists to strings.""" + self.decode_func = decode_func + self.tokens = [] + self._text = "" + + def reset(self): + """Reset accumulated tokens.""" + self.tokens = [] + self._text = "" + + def add_token(self, token_id): + """Add a token to the accumulated list.""" + self.tokens.append(token_id) + + def finalize(self): + """Finalize and decode accumulated tokens.""" + self._text = self.decode_func(self.tokens) + + @property + def text(self): + """Return the decoded text.""" + return self._text + + @contextmanager def mock_runner_environment(temp_cache_dir, model_name="test-model"): """Mock the environment needed for MLXRunner tests.""" @@ -114,7 +146,9 @@ class TestMLXRunnerBasic: return "unknown" mocks['mock_tokenizer'].decode.side_effect = mock_decode - + # Use MockDetokenizer for proper BPE space marker handling + mocks['mock_tokenizer'].detokenizer = MockDetokenizer(mock_decode) + with MLXRunner(model_name) as runner: tokens = list(runner.generate_streaming("test prompt", max_tokens=2)) @@ -155,7 +189,7 @@ class TestMLXRunnerStopTokens: def test_chat_stop_tokens_filtered_when_enabled(self, temp_cache_dir): """Chat stop tokens are filtered only when explicitly enabled""" model_name = "test-model" - + with mock_runner_environment(temp_cache_dir, model_name) as mocks: with patch('mlxk2.core.runner.generate_step') as mock_gen: mock_gen.return_value = [ @@ -176,10 +210,12 @@ class TestMLXRunnerStopTokens: # Fallback for other cases return "" mocks['mock_tokenizer'].decode.side_effect = mock_decode + # Mock detokenizer (Session 60 BPE fix) + mocks['mock_tokenizer'].detokenizer = MockDetokenizer(mock_decode) with MLXRunner(model_name) as runner: result = runner.generate_batch("test prompt", use_chat_stop_tokens=True) - + # Should stop at chat stop token assert "\nHuman:" not in result assert result == "Response" @@ -205,6 +241,8 @@ class TestMLXRunnerStopTokens: return "Response\nHuman: rest" return "" mocks['mock_tokenizer'].decode.side_effect = mock_decode + # Mock detokenizer (Session 60 BPE fix) + mocks['mock_tokenizer'].detokenizer = MockDetokenizer(mock_decode) with MLXRunner(model_name) as runner: result = runner.generate_batch("test prompt") @@ -215,7 +253,7 @@ class TestMLXRunnerStopTokens: def test_streaming_vs_batch_consistency(self, temp_cache_dir): """Test that streaming and batch modes produce identical output""" model_name = "test-model" - + with mock_runner_environment(temp_cache_dir, model_name) as mocks: # Same mock sequence for both tests def mock_generation(): @@ -241,16 +279,18 @@ class TestMLXRunnerStopTokens: return "Hello world!" return "" mocks['mock_tokenizer'].decode.side_effect = mock_decode + # Mock detokenizer (Session 60 BPE fix) + mocks['mock_tokenizer'].detokenizer = MockDetokenizer(mock_decode) with MLXRunner(model_name) as runner: # Test streaming with patch('mlxk2.core.runner.generate_step', return_value=mock_generation()): streaming_result = "".join(runner.generate_streaming("test")) - + # Test batch with patch('mlxk2.core.runner.generate_step', return_value=mock_generation()): batch_result = runner.generate_batch("test") - + assert streaming_result == batch_result diff --git a/tests_2.0/test_stop_tokens_live.py b/tests_2.0/test_stop_tokens_live.py index a0a50e0..0f9bd54 100644 --- a/tests_2.0/test_stop_tokens_live.py +++ b/tests_2.0/test_stop_tokens_live.py @@ -1,6 +1,6 @@ """Real-model stop token detection tests for Issue #32 (ADR-009). -This test suite validates stop token handling with real models that exhibit +This test suite validates stop token handling with real TEXT models that exhibit known issues: - MXFP4: Visible `<|end|>` tokens in output - Qwen 2.5: Self-conversation (chat template role markers) @@ -11,6 +11,11 @@ Test Strategy (ADR-009): 2. Phase 2: Fix validation (verify 2-LOC fix works) 3. Phase 3: Empirical mapping (document tokenizer configs) +Portfolio Discovery: +- Auto-discovers MLX TEXT chat models only (excludes Vision "chat+vision") +- Uses MLXRunner (mlx-lm) which cannot load Vision models (mllama etc.) +- See Portfolio Separation: live/test_utils.py for separated text/vision portfolios + Opt-in via: pytest -m live_stop_tokens Requires: HF_HOME set to SSD cache (CoW same-volume requirement, ADR-007) @@ -33,6 +38,10 @@ from typing import Dict, Any, Optional import importlib import importlib.util +# Portfolio Separation reference: live/test_utils.py provides discover_text_models() +# but we can't use it here due to circular import (test_utils imports from this file) +# Instead, we fix discover_mlx_models_in_user_cache() to exclude Vision models directly + # Opt-in marker for live tests pytestmark = [pytest.mark.live_stop_tokens, pytest.mark.slow] @@ -173,11 +182,11 @@ def discover_mlx_models_in_user_cache() -> List[Dict[str, Any]]: Filters for: - Framework: MLX only (not GGUF/PyTorch) - Health: healthy only - - Runtime: runtime_compatible only (mlx-lm can load) - - Type: chat models only (for stop token testing) + - Runtime: runtime_compatible only (mlx-lm/mlx-vlm can load) + - Type: chat models (TEXT + VISION, includes all model_type="chat") - Note: Vision models are included if runtime_compatible=True (Python 3.10+). - Server E2E tests handle vision models gracefully (HTTP 501 → pytest.skip). + Note: Returns BOTH text and vision models. Caller must filter by capabilities + if needed (e.g., portfolio_models fixture filters to TEXT-only). Returns: List of dicts with keys: model_id, ram_needed_gb, snapshot_path, weight_count @@ -215,16 +224,16 @@ def discover_mlx_models_in_user_cache() -> List[Dict[str, Any]]: # Filter per schema modelObject fields discovered = [] for model in models: - # Filter: MLX + healthy + runtime_compatible + chat + # Filter: MLX + healthy + runtime_compatible + chat (TEXT + VISION) model_type = model.get("model_type") - is_chat_family = ( + is_chat = ( isinstance(model_type, str) and - ("chat" in model_type.lower()) # includes chat and chat+vision + model_type == "chat" # Includes both text and vision chat models ) if (model.get("framework") == "MLX" and model.get("health") == "healthy" and model.get("runtime_compatible") is True and - is_chat_family): + is_chat): # RAM estimation: size_bytes * 1.2 overhead size_bytes = model.get("size_bytes", 0) @@ -287,26 +296,59 @@ TEST_MODELS = { @pytest.fixture(scope="module") def portfolio_models(): - """Dynamic model portfolio: discovered models OR hardcoded fallback. + """Dynamic TEXT model portfolio: discovered models OR hardcoded fallback. + + Discovers MLX TEXT chat models only (excludes Vision "chat+vision"). + Uses MLXRunner (mlx-lm) which cannot load Vision models. Enables portfolio testing when HF_HOME is set, falls back to 3 hardcoded test models otherwise (backward compatibility). """ - discovered = discover_mlx_models_in_user_cache() + all_models = discover_mlx_models_in_user_cache() # Returns TEXT + VISION - if discovered: - # Convert discovered models to TEST_MODELS format + if all_models: + # Filter to TEXT-only models (exclude Vision) + # Vision models have "vision" in capabilities array (from mlxk list --json) + import subprocess + import json + import os + + env = os.environ.copy() + if env.get("HF_HOME"): + try: + result_data = subprocess.run( + [sys.executable, "-m", "mlxk2.cli", "list", "--json"], + capture_output=True, + text=True, + timeout=30, + env=env + ) + if result_data.returncode == 0: + data = json.loads(result_data.stdout) + models_list = data.get("data", {}).get("models", []) + # Build set of vision model IDs + vision_ids = {m["name"] for m in models_list if "vision" in m.get("capabilities", [])} + # Filter out vision models + text_models = [m for m in all_models if m["model_id"] not in vision_ids] + else: + text_models = all_models # Fallback: include all + except Exception: + text_models = all_models # Fallback: include all + else: + text_models = all_models # No HF_HOME, use all + + # Convert discovered TEXT models to TEST_MODELS format result = {} - for i, model in enumerate(discovered): + for i, model in enumerate(text_models): key = f"discovered_{i:02d}" result[key] = { "id": model["model_id"], "ram_needed_gb": model["ram_needed_gb"], "expected_issue": None, # Unknown for discovered models - "description": f"Discovered: {model['model_id']} ({model['weight_count']} weights)" + "description": f"Discovered: {model['model_id']} ({model.get('weight_count', 'unknown')} weights)" } - print(f"\n🔍 Portfolio Discovery: Found {len(result)} MLX models in cache") + print(f"\n🔍 Portfolio Discovery: Found {len(result)} MLX TEXT models in cache") return result else: # Fallback to hardcoded test models @@ -346,6 +388,59 @@ TEST_PROMPT = "Write one sentence about cats." MAX_TOKENS = 50 +def pytest_generate_tests(metafunc): + """Dynamically parametrize empirical mapping test with discovered model keys. + + This hook runs during test collection (before test execution). + Enables process-per-model isolation: each model runs in separate pytest process. + + Architecture Decision (Session 56): + - Prevents memory leak accumulation (71GB swap with 20 models in one process) + - OS-level cleanup between models (process exit guarantees full cleanup) + - Reflects real-world usage (users never load 20+ models sequentially) + """ + if metafunc.function.__name__ == "test_empirical_mapping_single_model": + # Lightweight discovery for parametrization (same logic as portfolio_models fixture) + from live.test_utils import discover_mlx_models_in_user_cache + + all_models = discover_mlx_models_in_user_cache() + + if all_models: + # Filter to TEXT-only models (exclude Vision) - same as portfolio_models fixture + import json + env = os.environ.copy() + if env.get("HF_HOME"): + try: + result_data = subprocess.run( + [sys.executable, "-m", "mlxk2.cli", "list", "--json"], + capture_output=True, + text=True, + timeout=30, + env=env + ) + if result_data.returncode == 0: + data = json.loads(result_data.stdout) + models_list = data.get("data", {}).get("models", []) + vision_ids = {m["name"] for m in models_list if "vision" in m.get("capabilities", [])} + text_models = [m for m in all_models if m["model_id"] not in vision_ids] + else: + text_models = all_models + except Exception: + text_models = all_models + else: + text_models = all_models + + # Generate model keys (discovered_00, discovered_01, ...) + model_keys = [f"discovered_{i:02d}" for i in range(len(text_models))] + else: + # Fallback to hardcoded test models + model_keys = list(TEST_MODELS.keys()) + + # Parametrize with model keys (each key becomes a separate test) + # ids= makes test names readable: test_empirical_mapping_single_model[discovered_00] + metafunc.parametrize("model_key_param", model_keys, ids=lambda x: x) + + class TestStopTokensValidation: """Validation: Verify stop token handling works correctly (Issue #32, ADR-009).""" @@ -362,10 +457,10 @@ class TestStopTokensValidation: - Root cause: Runner only checked singular eos_token_id - Fix: Use eos_token_ids Set to handle multiple EOS tokens """ - # Only run when explicitly selected with -m live_stop_tokens + # Only run when explicitly selected with -m live_stop_tokens or -m wet selected = request.config.getoption("-m") or "" - if "live_stop_tokens" not in selected: - pytest.skip("Run with -m live_stop_tokens to enable live model tests") + if "live_stop_tokens" not in selected and "wet" not in selected: + pytest.skip("Run with -m live_stop_tokens or -m wet to enable live model tests") # RAM Safety Check should_skip, reason = should_skip_model("mxfp4") @@ -410,10 +505,10 @@ class TestStopTokensValidation: - Model stops cleanly after its response - No chat template markers in output """ - # Only run when explicitly selected with -m live_stop_tokens + # Only run when explicitly selected with -m live_stop_tokens or -m wet selected = request.config.getoption("-m") or "" - if "live_stop_tokens" not in selected: - pytest.skip("Run with -m live_stop_tokens to enable live model tests") + if "live_stop_tokens" not in selected and "wet" not in selected: + pytest.skip("Run with -m live_stop_tokens or -m wet to enable live model tests") # RAM Safety Check should_skip, reason = should_skip_model("qwen25") @@ -467,10 +562,10 @@ class TestStopTokensValidation: - No self-conversation - Serves as regression baseline """ - # Only run when explicitly selected with -m live_stop_tokens + # Only run when explicitly selected with -m live_stop_tokens or -m wet selected = request.config.getoption("-m") or "" - if "live_stop_tokens" not in selected: - pytest.skip("Run with -m live_stop_tokens to enable live model tests") + if "live_stop_tokens" not in selected and "wet" not in selected: + pytest.skip("Run with -m live_stop_tokens or -m wet to enable live model tests") # RAM Safety Check should_skip, reason = should_skip_model("llama32") @@ -478,9 +573,18 @@ class TestStopTokensValidation: pytest.skip(reason) from mlxk2.core.runner import MLXRunner + from mlxk2.core.cache import get_current_model_cache, hf_to_cache_dir + from pathlib import Path model_id = TEST_MODELS["llama32"]["id"] + # Check if model exists in cache + cache = get_current_model_cache() + model_dir = cache / hf_to_cache_dir(model_id) + snapshots_dir = model_dir / "snapshots" + if not snapshots_dir.exists() or not any(snapshots_dir.iterdir()): + pytest.skip(f"Model not in cache: {model_id}") + # Run inference with MLXRunner(model_id) as runner: output = runner.generate_batch( @@ -513,11 +617,17 @@ class TestStopTokensEmpiricalMapping: """Phase 3: Empirical mapping - document tokenizer configs and observed tokens.""" @pytest.mark.live_stop_tokens - def test_empirical_mapping_all_models(self, portfolio_models, request): - """Document tokenizer configs and empirically observed stop tokens. + def test_empirical_mapping_single_model(self, model_key_param, portfolio_models, request): + """Document tokenizer configs and empirically observed stop tokens (ONE model per test). + + ARCHITECTURE DECISION (Session 56): + - Each model runs in SEPARATE pytest process (process isolation) + - OS guarantees complete memory cleanup between models + - Prevents memory leak accumulation (71GB swap with 20 models in one process) + - Reflects real-world usage (users never load 20 models sequentially) Uses portfolio_models fixture for dynamic model discovery. - Generates report: stop_token_config_report.json + Each test writes JSONL fragment, final report generated by finalize test. Report Format (ADR-009): { @@ -528,41 +638,36 @@ class TestStopTokensEmpiricalMapping: "workaround_needed": True/False } """ - # Only run when explicitly selected with -m live_stop_tokens + # Only run when explicitly selected with -m live_stop_tokens or -m wet selected = request.config.getoption("-m") or "" - if "live_stop_tokens" not in selected: - pytest.skip("Run with -m live_stop_tokens to enable portfolio discovery") + if "live_stop_tokens" not in selected and "wet" not in selected: + pytest.skip("Run with -m live_stop_tokens or -m wet to enable portfolio discovery") from mlxk2.core.runner import MLXRunner - report = {} + # Get model_key from pytest parametrize + model_key = model_key_param + model_info = portfolio_models[model_key] + model_id = model_info["id"] + system_ram = get_system_ram_gb() ram_budget = get_safe_ram_budget_gb() - - # Calculate actual budget ratio used budget_ratio = ram_budget / system_ram if system_ram > 0 else 0.40 - # Add system info to report - report["_system_info"] = { - "system_ram_gb": round(system_ram, 1), - "ram_budget_gb": round(ram_budget, 1), - "budget_ratio": round(budget_ratio, 2) - } - - for model_key, model_info in portfolio_models.items(): - model_id = model_info["id"] - - # Skip models that exceed RAM budget - should_skip, skip_reason = should_skip_model(model_key, portfolio_models) - if should_skip: - print(f"\nSkipping {model_key}: {skip_reason}") - report[model_key] = { - "model_id": model_id, - "skipped": True, - "skip_reason": skip_reason - } - continue - + # Skip models that exceed RAM budget + should_skip, skip_reason = should_skip_model(model_key, portfolio_models) + if should_skip: + print(f"\nSkipping {model_key}: {skip_reason}") + result = { + "model_key": model_key, + "model_id": model_id, + "skipped": True, + "skip_reason": skip_reason, + "system_ram_gb": round(system_ram, 1), + "ram_budget_gb": round(ram_budget, 1), + "budget_ratio": round(budget_ratio, 2) + } + else: with MLXRunner(model_id) as runner: # Get tokenizer config tokenizer = runner.tokenizer @@ -588,18 +693,77 @@ class TestStopTokensEmpiricalMapping: potential_stop_tokens = ["<|end|>", "<|eot_id|>", "<|im_end|>", "<|endoftext|>"] found_stop_tokens = [t for t in potential_stop_tokens if t in output] - report[model_key] = { + result = { + "model_key": model_key, "model_id": model_id, "configured_eos_token": eos_token, "configured_eos_token_id": eos_token_id, "configured_eos_token_ids": eos_token_ids, "generated_output": output[:100], # First 100 chars for reference "visible_stop_tokens": found_stop_tokens, - "workaround_needed": bool(found_stop_tokens) + "workaround_needed": bool(found_stop_tokens), + "system_ram_gb": round(system_ram, 1), + "ram_budget_gb": round(ram_budget, 1), + "budget_ratio": round(budget_ratio, 2) } - # Write report + # Write JSONL fragment (append mode - each test writes one line) + fragments_path = Path("stop_token_config_fragments.jsonl") + with open(fragments_path, "a") as f: + f.write(json.dumps(result) + "\n") + + print(f"\n{'='*60}") + print(f"EMPIRICAL MAPPING: {model_key}") + print(f"{'='*60}") + print(json.dumps(result, indent=2)) + + @pytest.mark.live_stop_tokens + def test_empirical_mapping_generate_report(self, request): + """Finalize: Aggregate JSONL fragments into final JSON report. + + Runs AFTER all single-model tests complete. + Reads stop_token_config_fragments.jsonl and generates stop_token_config_report.json. + """ + # Only run when explicitly selected + selected = request.config.getoption("-m") or "" + if "live_stop_tokens" not in selected and "wet" not in selected: + pytest.skip("Run with -m live_stop_tokens or -m wet to enable portfolio discovery") + + fragments_path = Path("stop_token_config_fragments.jsonl") report_path = Path("stop_token_config_report.json") + + if not fragments_path.exists(): + pytest.skip("No fragments found - single-model tests may not have run") + + # Read all JSONL fragments + fragments = [] + with open(fragments_path, "r") as f: + for line in f: + if line.strip(): + fragments.append(json.loads(line)) + + # Build final report + report = {} + + # Extract system info from first fragment + if fragments: + first = fragments[0] + report["_system_info"] = { + "system_ram_gb": first.get("system_ram_gb", 0), + "ram_budget_gb": first.get("ram_budget_gb", 0), + "budget_ratio": first.get("budget_ratio", 0) + } + + # Add all model results + for fragment in fragments: + model_key = fragment.pop("model_key") + # Remove system_info fields from individual entries + fragment.pop("system_ram_gb", None) + fragment.pop("ram_budget_gb", None) + fragment.pop("budget_ratio", None) + report[model_key] = fragment + + # Write final JSON report report_path.write_text(json.dumps(report, indent=2)) print(f"\n{'='*60}") @@ -614,3 +778,7 @@ class TestStopTokensEmpiricalMapping: if isinstance(v, dict) and v.get("workaround_needed") ] print(f"\nModels needing fix: {models_needing_fix}") + + # Cleanup fragments + fragments_path.unlink() + print(f"Cleaned up: {fragments_path}") diff --git a/tests_2.0/test_workspace_sentinel.py b/tests_2.0/test_workspace_sentinel.py new file mode 100644 index 0000000..97d399f --- /dev/null +++ b/tests_2.0/test_workspace_sentinel.py @@ -0,0 +1,304 @@ +"""Tests for workspace sentinel primitives (ADR-018 Phase 0a). + +Tests workspace infrastructure: +- Sentinel write with atomic rename +- Managed workspace detection +- Health check integration with workspaces +- Backward compatibility with unmanaged workspaces +""" + +import json +from pathlib import Path + +import pytest + +from mlxk2.operations.workspace import ( + write_workspace_sentinel, + is_managed_workspace, + read_workspace_metadata, + SENTINEL_FILENAME +) +from mlxk2.operations.health import health_check_workspace + + +class TestWorkspaceSentinel: + """Test workspace sentinel write/read primitives.""" + + def test_write_sentinel_atomic(self, tmp_path): + """Test atomic write with rename.""" + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "source_repo": "mlx-community/Llama-3.2-3B", + "source_revision": "abc123", + "managed": True, + "operation": "clone" + } + + write_workspace_sentinel(tmp_path, metadata) + + sentinel = tmp_path / SENTINEL_FILENAME + assert sentinel.exists(), "Sentinel file should exist" + + # Verify JSON is valid and matches input + data = json.loads(sentinel.read_text()) + assert data["managed"] is True + assert data["mlxk_version"] == "2.0.4" + assert data["operation"] == "clone" + assert data["source_repo"] == "mlx-community/Llama-3.2-3B" + + def test_write_sentinel_creates_valid_json(self, tmp_path): + """Test sentinel is well-formed JSON.""" + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "managed": True, + "operation": "convert" + } + + write_workspace_sentinel(tmp_path, metadata) + + sentinel = tmp_path / SENTINEL_FILENAME + content = sentinel.read_text() + + # Should be valid JSON + data = json.loads(content) + assert isinstance(data, dict) + + # Should have trailing newline (conventional) + assert content.endswith("\n") + + def test_write_sentinel_requires_path_object(self, tmp_path): + """Test workspace_path must be Path object.""" + metadata = {"mlxk_version": "2.0.4", "created_at": "2025-12-29T10:30:00Z", + "managed": True, "operation": "clone"} + + with pytest.raises(TypeError, match="must be Path"): + write_workspace_sentinel(str(tmp_path), metadata) + + def test_write_sentinel_validates_required_fields(self, tmp_path): + """Test metadata must have required fields.""" + # Missing 'mlxk_version' + incomplete_metadata = { + "created_at": "2025-12-29T10:30:00Z", + "managed": True, + "operation": "clone" + } + + with pytest.raises(ValueError, match="Missing required metadata fields"): + write_workspace_sentinel(tmp_path, incomplete_metadata) + + def test_write_sentinel_allows_additional_fields(self, tmp_path): + """Test forward compatibility: extra fields allowed.""" + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "managed": True, + "operation": "clone", + "custom_field": "future_feature", # Extra field + "another_field": 42 + } + + write_workspace_sentinel(tmp_path, metadata) + + data = json.loads((tmp_path / SENTINEL_FILENAME).read_text()) + assert data["custom_field"] == "future_feature" + assert data["another_field"] == 42 + + +class TestManagedWorkspaceDetection: + """Test managed vs unmanaged workspace detection.""" + + def test_is_managed_workspace_true(self, tmp_path): + """Test managed workspace detection (has valid sentinel).""" + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "managed": True, + "operation": "clone" + } + write_workspace_sentinel(tmp_path, metadata) + + assert is_managed_workspace(tmp_path) is True + + def test_is_managed_workspace_false_no_sentinel(self, tmp_path): + """Test unmanaged workspace (no sentinel).""" + # Empty directory, no sentinel + assert is_managed_workspace(tmp_path) is False + + def test_is_managed_workspace_false_managed_field_false(self, tmp_path): + """Test sentinel exists but managed=False.""" + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "managed": False, # Explicitly unmanaged + "operation": "manual" + } + write_workspace_sentinel(tmp_path, metadata) + + assert is_managed_workspace(tmp_path) is False + + def test_is_managed_workspace_false_invalid_json(self, tmp_path): + """Test corrupted sentinel (invalid JSON).""" + sentinel = tmp_path / SENTINEL_FILENAME + sentinel.write_text("{invalid json") + + assert is_managed_workspace(tmp_path) is False + + def test_is_managed_workspace_handles_non_path_input(self): + """Test graceful handling of non-Path input.""" + assert is_managed_workspace("not_a_path") is False + assert is_managed_workspace(None) is False + + def test_read_workspace_metadata_valid(self, tmp_path): + """Test reading sentinel metadata.""" + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "source_repo": "mlx-community/Model", + "managed": True, + "operation": "clone" + } + write_workspace_sentinel(tmp_path, metadata) + + read_data = read_workspace_metadata(tmp_path) + assert read_data["mlxk_version"] == "2.0.4" + assert read_data["source_repo"] == "mlx-community/Model" + + def test_read_workspace_metadata_no_sentinel(self, tmp_path): + """Test reading returns empty dict if no sentinel.""" + read_data = read_workspace_metadata(tmp_path) + assert read_data == {} + + def test_read_workspace_metadata_invalid_json(self, tmp_path): + """Test reading returns empty dict if JSON invalid.""" + sentinel = tmp_path / SENTINEL_FILENAME + sentinel.write_text("not json") + + read_data = read_workspace_metadata(tmp_path) + assert read_data == {} + + +class TestWorkspaceHealthCheck: + """Test health check integration with managed/unmanaged workspaces.""" + + def test_health_check_workspace_managed(self, tmp_path): + """Test health check on managed workspace.""" + # Create minimal valid workspace + (tmp_path / "config.json").write_text('{"model_type": "llama"}') + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "managed": True, + "operation": "clone" + } + write_workspace_sentinel(tmp_path, metadata) + + healthy, reason, managed = health_check_workspace(tmp_path) + assert managed is True, "Should detect managed workspace" + # Note: May fail health due to missing weights, but managed flag works + + def test_health_check_workspace_unmanaged(self, tmp_path): + """Test health check on unmanaged workspace.""" + # Create minimal workspace without sentinel + (tmp_path / "config.json").write_text('{"model_type": "llama"}') + + healthy, reason, managed = health_check_workspace(tmp_path) + assert managed is False, "Should detect unmanaged workspace" + # Health check still runs, just flagged as unmanaged + + def test_health_check_workspace_no_config(self, tmp_path): + """Test workspace without config.json is unhealthy.""" + # Empty directory + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "managed": True, + "operation": "clone" + } + write_workspace_sentinel(tmp_path, metadata) + + healthy, reason, managed = health_check_workspace(tmp_path) + assert healthy is False + assert "No config.json" in reason + assert managed is True # Still recognized as managed + + def test_health_check_workspace_returns_three_values(self, tmp_path): + """Test health_check_workspace returns (bool, str, bool).""" + (tmp_path / "config.json").write_text('{"model_type": "llama"}') + + result = health_check_workspace(tmp_path) + assert isinstance(result, tuple) + assert len(result) == 3 + healthy, reason, managed = result + assert isinstance(healthy, bool) + assert isinstance(reason, str) + assert isinstance(managed, bool) + + +class TestHealthCheckOperationWorkspaceIntegration: + """Test health_check_operation CLI integration with workspace paths.""" + + def test_health_check_operation_detects_workspace_path(self, tmp_path): + """Test health_check_operation recognizes workspace paths.""" + from mlxk2.operations.health import health_check_operation + + # Create minimal workspace + (tmp_path / "config.json").write_text('{"model_type": "llama"}') + (tmp_path / "model.safetensors").write_text("fake weights") + + metadata = { + "mlxk_version": "2.0.4", + "created_at": "2025-12-29T10:30:00Z", + "managed": True, + "operation": "clone" + } + write_workspace_sentinel(tmp_path, metadata) + + # Call health_check_operation with workspace path + result = health_check_operation(str(tmp_path)) + + assert result["status"] == "success" + assert result["data"]["summary"]["total"] == 1 + + # Should have checked the workspace (not cache) + assert len(result["data"]["healthy"]) + len(result["data"]["unhealthy"]) == 1 + + # First result should include 'managed' flag + model_info = (result["data"]["healthy"] + result["data"]["unhealthy"])[0] + assert "managed" in model_info + assert model_info["managed"] is True + assert model_info["name"] == str(tmp_path) + + def test_health_check_operation_workspace_unmanaged(self, tmp_path): + """Test health_check_operation with unmanaged workspace.""" + from mlxk2.operations.health import health_check_operation + + # Unmanaged workspace (no sentinel) + (tmp_path / "config.json").write_text('{"model_type": "llama"}') + (tmp_path / "model.safetensors").write_text("fake weights") + + result = health_check_operation(str(tmp_path)) + + assert result["status"] == "success" + model_info = (result["data"]["healthy"] + result["data"]["unhealthy"])[0] + assert model_info["managed"] is False + + def test_health_check_operation_workspace_vs_cache(self, tmp_path): + """Test workspace path takes precedence over cache name resolution.""" + from mlxk2.operations.health import health_check_operation + + # Create workspace with same name as potential cache model + ws_name = "mlx-community-model" + workspace = tmp_path / ws_name + workspace.mkdir() + (workspace / "config.json").write_text('{"model_type": "llama"}') + (workspace / "model.safetensors").write_text("fake weights") + + # Call with workspace path + result = health_check_operation(str(workspace)) + + # Should detect as workspace (has 'managed' field), not cache model + assert result["status"] == "success" + model_info = (result["data"]["healthy"] + result["data"]["unhealthy"])[0] + assert "managed" in model_info, "Should include 'managed' field (workspace detected)"