diff --git a/.gitignore b/.gitignore index 69a69b4..924e523 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ venv/ venv39/ -venv310/ +venv31?/ venv_*/ test_env*/ test_results*.log diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c7f50e..2f4e294 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,64 @@ # Changelog +## [2.0.4-beta.7] - 2026-01-18 + +### Highlights + +**Production Ready Clone & Push:** `mlxk clone` and `mlxk push` operations are no longer alpha-gated. The `convert` operation remains experimental (requires `MLXK2_ENABLE_ALPHA_FEATURES=1`). + +**Test Isolation Fix:** Live tests (requiring models/network) are now properly excluded from default `pytest` runs. This exclusion was previously documented but never enforced; the fix prevents accidental execution of resource-intensive tests during development. + +**Vision Batch Processing Terminology:** Clarified that mlx-knife uses "batch" in the traditional computing sense (sequential job processing) rather than ML inference batching (parallel batch_size > 1). Output now shows "Chunk 1/3" instead of "Batch 1/3" to reduce confusion with ML terminology. Environment variable renamed to `MLXK2_VISION_CHUNK_SIZE` for consistency with `--chunk` parameter. + +**Vision/Text Inference Differentiation:** Benchmark reports now differentiate between Vision and Text inference for multimodal models. Vision-capable models (e.g., Pixtral) can perform BOTH Vision inference (with `--image` flag) and Text inference (without `--image`), but previous reports only showed total time. Beta.7 introduces separate rows per modality with Mode column and per-modality RAM tracking. Automatic detection via pytest fixtures (`vision_model_key`/`text_model_key`), explicit tagging for pipe tests. Schema v0.2.1 adds `metadata.inference_modality` field (backward compatible with v0.2.0). + +**Workspace Discovery for `list`:** The `list` command now supports local workspace paths with flexible pattern matching. 
Use `mlxk list ./` to scan current directory, `mlxk list ./gemma-` for prefix matching, or `mlxk list ./path/to/workspace` for exact paths. Push command now validates ambiguous patterns (multiple matches) with descriptive errors. + +### Added + +- **Workspace Support for `list` Operation:** Local workspace paths with flexible pattern matching. Three modes: exact match, directory scan (`./`), prefix match (`./gemma-`). JSON API adds `display_name` field. See README.md "Local Paths" section. + +- **Ambiguous Pattern Handling for `push`:** Prefix patterns with multiple matches return `ambiguous_workspace` error with `matches` array. + +- **Vision Chunk Streaming:** Multi-image vision requests now stream SSE events per-chunk (real-time feedback). WebUI shows progress during processing instead of silent wait. + +- **Benchmark Schema v0.2.1:** New `metadata.inference_modality` field differentiates Vision/Text inference. Automatic detection via pytest fixtures, per-modality RAM tracking in reports. Backward compatible with v0.2.0. + +- **`MLXK2_MAX_TOKENS` Environment Variable:** Server respects `MLXK2_MAX_TOKENS` env var as alternative to `--max-tokens` CLI flag (useful for launchd/systemd service configs). + +### Changed + +- **Vision Model Hallucination Fix:** Fixed models describing images they weren't seeing. Local numbering strategy prevents cross-chunk hallucination. See README.md "Reliability" section. + +- **Code Quality & Compatibility:** Ruff linter fixes, Python 3.12+ datetime compatibility, resumable clone filesystem settle time. + +- **EXIF Metadata:** GPS precision increased to 4 decimals (~11m). GPS-Timestamp primary, DateTimeOriginal fallback. + +- **mlx-vlm Upstream Update:** c536165 → fc8c92e (MXFP4 quantization support). + +- **Clean Server Logging:** Suppressed transformers noise in `--log-json` output. + +- **Alpha-Gate Removal:** `clone` and `push` now production-ready. `convert` remains experimental. 
+ +- **Test Isolation Fix:** Live tests properly excluded from default `pytest` run via `pytest.ini`. + +- **Vision Terminology:** **Breaking:** `MLXK2_VISION_BATCH_SIZE` → `MLXK2_VISION_CHUNK_SIZE`. Output shows "Chunk 1/3". + +- **SERVER-HANDBOOK.md:** Documented `chunk` parameter and corrected 7 documentation inaccuracies (see Fixed). + +### Fixed + +- **Server Robustness (3 Fixes):** + - `/v1/models`: Crash prevention when cache directory doesn't exist + - Exit codes: Proper propagation for CI/CD (including negative signal codes) + - Vision routing: Multi-turn conversations now correctly check only last user message for images + +- **Benchmark Report:** Hardware scan now handles JSONL files without system block in first entry. + +- **SERVER-HANDBOOK.md (7 Corrections):** Image limits (no 50MB total), /v1/models schema, vision streaming, text placeholder, supervised mode, GIF/WebP. See handbook for details. + +--- + ## [2.0.4-beta.6] - 2026-01-07 ### Highlights @@ -658,7 +717,7 @@ This release completes the 2.0.2 recovery plan (Issue #32) with extensive empiri - **De-versioned**: Changed from "Validation (2.0.2)" to "Current validation" (timeless guide, not version-specific) - **ADR references maintained**: Architecture context preserved -- **CLAUDE.md accuracy audit**: +- **Internal documentation audit**: - **ADR-009 status**: Updated to reflect 2.0.1 + 2.0.2 completion timeline - **ADR-011 status**: Updated to 73/81 tests passing, 17 models discovered - **Roadmap**: Updated with recovery plan progress @@ -1120,8 +1179,7 @@ Experimental `push` (upload only) and documentation/testing refinements. - Uploaded file count: Remains `null` when hub does not return per-file operations; no heuristic guessing. ### Docs -- TESTING.md: Added “Reference: Push CLI and JSON”, `--dry-run` examples, and a mini matrix (default vs markers/opt-in). -- CLAUDE.md: Updated Current Focus/Decisions + session summary for push quiet mode, no-op, `--dry-run`. 
+- TESTING.md: Added "Reference: Push CLI and JSON", `--dry-run` examples, and a mini matrix (default vs markers/opt-in). ### Tests - Offline push tests added/extended, including dry-run planning; live push remains opt-in via `wet`/`live_push` markers and required env vars. diff --git a/README.md b/README.md index 5dfed30..84fe160 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ MLX Knife Demo

-**Current Version: 2.0.4-beta.6** (Stable: 2.0.3) +**Current Version: 2.0.4-beta.7** (Stable: 2.0.3) -[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.6-blue.svg)](https://github.com/mzau/mlx-knife/releases) +[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.7-blue.svg)](https://github.com/mzau/mlx-knife/releases) [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Python 3.9+](https://img.shields.io/badge/python-3.10+(3.9)-blue.svg)](https://www.python.org/downloads/) [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-green.svg)](https://support.apple.com/en-us/HT211814) @@ -24,7 +24,7 @@ - **Community Model Repair Tool** - Fix broken mlx-vlm models with `--repair-index` - **Resumable Downloads** - Interrupted clone/pull operations continue automatically - **Safe Vision Batch Processing** - Automatic chunking prevents Metal OOM crashes -- **Workspace Path Support** - Run/show/server commands work with local directories +- **Workspace Path Support** - Run/show/server/list commands work with local directories ### Core Functionality - **List & Manage Models**: Browse your HuggingFace cache with MLX-specific filtering @@ -76,7 +76,7 @@ This license applies **only** to the `mlx-knife` code and **does not extend** to MLX Knife has been comprehensively tested and verified on: ✅ **Python 3.9.6 - 3.14** - Text LLMs fully supported (mlx-lm 0.28.4+) -✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+; beta.4 uses commit c536165df2b3b4aece3a795b2e414349f935e750 with Pixtral fix) +✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+; beta.7 uses commit fc8c92e31983a52761f37d503f903ec40bebbd62 with MXFP4 support) **Note:** Vision features require Python 3.10+. Native macOS Python 3.9.6 users need to upgrade (e.g., via Homebrew). 
@@ -106,17 +106,17 @@ mlxk --version # → mlxk 2.0.3 (latest stable on PyPI) ### Via GitHub (Latest Beta) ```bash -# Install 2.0.4-beta.6 (Workspace operations + Resumable clone) -pip install "git+https://github.com/mzau/mlx-knife.git@v2.0.4-beta.6" +# Install 2.0.4-beta.7 (Workspace discovery + Server robustness) +pip install "git+https://github.com/mzau/mlx-knife.git@v2.0.4-beta.7" # With Vision support (Python 3.10+ required) -pip install "git+https://github.com/mzau/mlx-knife.git@v2.0.4-beta.6#egg=mlx-knife[vision]" +pip install "git+https://github.com/mzau/mlx-knife.git@v2.0.4-beta.7#egg=mlx-knife[vision]" # Verify installation -mlxk --version # → mlxk 2.0.4b6 +mlxk --version # → mlxk 2.0.4b7 ``` -**Beta.6 note:** Uses mlx-vlm commit c536165df2b3b4aece3a795b2e414349f935e750 (includes Pixtral text-only fix). The `[vision]` extra automatically installs the correct version. +**Beta.7 note:** Uses mlx-vlm commit fc8c92e (includes MXFP4 quantization support). The `[vision]` extra automatically installs the correct version. **For production use:** Wait for 2.0.4 stable on PyPI (requires mlx-vlm 0.3.10 release). 
@@ -134,7 +134,7 @@ pip install -e ".[dev,test]" pip install -e ".[dev,test,vision]" # Verify installation -mlxk --version # → mlxk 2.0.4b5 +mlxk --version # → mlxk 2.0.4b7 # Run tests and quality checks (before committing) pytest -v @@ -176,6 +176,22 @@ mlxk run "Phi-3-mini" -c mlxk serve --port 8080 ``` +## Commands + +| Command | Description | +|---------|-------------| +| `list` | Model discovery with JSON output; supports cache and workspace paths | +| `show` | Detailed model information with --files, --config | +| `health` | Corruption detection and cache analysis | +| `pull` | HuggingFace model downloads with corruption detection | +| `rm` | Model deletion with lock cleanup and fuzzy matching | +| `run` | Interactive and single-shot model execution with streaming/batch modes | +| `server`/`serve` | OpenAI-compatible API server; SIGINT-robust (Supervisor); SSE streaming | +| `clone` | Model workspace cloning - create local editable copy from cache | +| `push` | Upload to HuggingFace Hub (requires `--private` flag for safety) | +| 🔒 `convert` | **Experimental** - Workspace transformations; requires `MLXK2_ENABLE_ALPHA_FEATURES=1` | +| 🔒 `pipe mode` | **Beta feature** - Unix pipes with `mlxk run - ...`; requires `MLXK2_ENABLE_PIPES=1` | + ## Model References MLX-Knife supports multiple ways to reference models: @@ -194,15 +210,21 @@ mlxk run "Phi-4" "Hello" # Fuzzy match mlxk show "Qwen3@e96" --json # Specific version ``` -### Local Paths (2.0.4-beta.6+) +### Local Paths (2.0.4-beta.7+) | Format | Example | |--------|---------| | Relative | `./my-workspace` | | Absolute | `/Volumes/External/model` | -| Home | `~/models/fine-tuned` | +| Prefix match | `./gemma-` (all workspaces starting with "gemma-") | +| Directory | `.` (all workspaces in current directory) | ```bash +# List workspaces +mlxk list . # All workspaces in current directory +mlxk list ./gemma- # Prefix match: gemma-3n-4bit, gemma-3n-FIXED-4bit, ... 
+mlxk list $PWD/models # Absolute path → absolute output + # Clone → Run mlxk clone org/model ./workspace mlxk run ./workspace "Hello" @@ -212,20 +234,21 @@ mlxk convert ./broken ./fixed --repair-index mlxk run ./fixed "Test" ``` +**Output format:** List output mirrors input format - relative patterns produce relative names (like `ls`), absolute paths produce absolute names. + **Disambiguating paths vs cache names:** When a local directory exists with the same name as a cached model, use `./` prefix to force workspace resolution. Otherwise, cache lookup is attempted first. --- -## Workspace Development Workflow (2.0.4-beta.6+) +## Workspace Development Workflow (2.0.4-beta.7+) **Complete local development cycle** for model experimentation, repair, and testing without HuggingFace round-trips: ```bash -export MLXK2_ENABLE_ALPHA_FEATURES=1 - # Clone → Repair → Test → Publish (optional) mlxk clone "model" ./workspace -mlxk convert ./workspace ./fixed --repair-index # Fix mlx-vlm #624 models +MLXK2_ENABLE_ALPHA_FEATURES=1 mlxk convert ./workspace ./fixed --repair-index +mlxk list . # See all local workspaces mlxk run ./fixed "test prompt" # Local inference mlxk server --model ./fixed # Dev server mlxk push ./fixed "your-org/model" # Optional publish @@ -248,8 +271,9 @@ mlxk push ./fixed "your-org/model" # Optional publish | `clone` | ✅ Creates | `mlxk clone model ./workspace` | | `convert` | ✅ Yes | `mlxk convert ./in ./out --repair-index` | | `push` | ✅ Yes | `mlxk push ./workspace "org/name"` | +| `list` | ✅ Yes | `mlxk list .` or `mlxk list ./gemma-` | | `pull` | ❌ Cache only | Downloads to HuggingFace cache | -| `list` | ❌ Cache only | Lists cached models only | +| `rm` | ❌ Cache only | Use `rm -rf ./workspace` for local directories | --- @@ -274,23 +298,6 @@ open webui/index.html **Note:** nChat is a separate project designed for the entire BROKE ecosystem (MLX Knife + BROKE Cluster). 
See [nChat README](https://github.com/mzau/broke-nchat/blob/main/README.md) for CORS configuration. -## Commands - -| Command | Description | -|---------|-------------| -| `server`/`serve` | OpenAI-compatible API server; SIGINT-robust (Supervisor); SSE streaming | -| `run` | Interactive and single-shot model execution with streaming/batch modes | -| `list` | Model discovery with JSON output | -| `health` | Corruption detection and cache analysis | -| `show` | Detailed model information with --files, --config | -| `pull` | HuggingFace model downloads with corruption detection | -| `rm` | Model deletion with lock cleanup and fuzzy matching | -| 🔒 `push` | **Alpha feature** - Upload to HuggingFace Hub; requires `MLXK2_ENABLE_ALPHA_FEATURES=1` | -| 🔒 `clone` | **Alpha feature** - Model workspace cloning; requires `MLXK2_ENABLE_ALPHA_FEATURES=1` | -| 🔒 `convert` | **Beta feature** - Workspace transformations (repair-index, quantize); `--repair-index` fixes mlx-vlm #624 models | -| 🔒 `pipe mode` | **Beta feature** - Unix pipes with `mlxk run - ...`; requires `MLXK2_ENABLE_PIPES=1` | - - ## Multi-Modal Support MLX Knife supports multiple input modalities beyond text. All multi-modal features share a **common output pattern**: model responses are followed by collapsible metadata tables for transparency and traceability. @@ -326,6 +333,8 @@ mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" "What is 2+2?" #### Batch Processing +**Terminology Note:** mlx-knife uses "batch" in the traditional computing sense (sequential job processing in groups), not ML inference batching (parallel batch_size > 1 in a single forward pass). Images are processed sequentially in groups for memory safety, not performance parallelization. + **Breaking Change (2.0.4-beta.6):** Vision processing now defaults to processing **one image at a time** for maximum stability on all systems. Use `--chunk N` to process multiple images per batch when your system can handle it. 
```bash @@ -338,21 +347,21 @@ mlxk run pixtral "Describe images" --chunk 5 --image photos/*.jpg # Alternative: Use --prompt flag (useful when experimenting with different prompts) mlxk run pixtral --chunk 5 --image photos/*.jpg --prompt "Describe images" -# Set default batch size via environment variable -export MLXK2_VISION_BATCH_SIZE=3 +# Set default chunk size via environment variable +export MLXK2_VISION_CHUNK_SIZE=3 mlxk run pixtral "Describe images" --image photos/*.jpg ``` **Why chunking?** -- **Safety:** Prevents Metal OOM crashes with large image batches -- **Isolation:** Fresh model load per chunk prevents state leakage between batches +- **Safety:** Prevents Metal OOM crashes by limiting images per processing group (`--chunk N`) +- **Isolation:** Fresh inference session per chunk (KV cache cleared, conversation context reset) - **Trade-off:** ~2-3s model load overhead per chunk vs guaranteed isolation -**⚠️ Important:** Some vision models may hallucinate details about non-existent images when: -- Processing larger chunks (model sees global context like "8 total images" but only has 4 in current batch) -- Prompts use plural forms that don't match actual image count (e.g., "describe these images" when chunk=1) +**Reliability (2.0.4-beta.7):** Vision models can sometimes describe details they didn't actually see. MLX Knife prevents this automatically: +- **Default (chunk=1):** Most reliable - each image processed independently +- **Larger chunks:** Still safe, but models may occasionally confuse details between images in the same batch -Default chunk=1 with singular prompts provides maximum robustness. +For maximum accuracy, use the default chunk=1 (no configuration needed). **Server API:** ```bash @@ -361,39 +370,42 @@ curl -X POST http://localhost:8000/v1/chat/completions \ -d '{"model": "pixtral", "chunk": 3, "messages": [...]}' ``` +**Note:** `chunk` is an mlx-knife extension parameter. See [SERVER-HANDBOOK.md](docs/SERVER-HANDBOOK.md) for details. 
+ #### Metadata Output Format When processing images, MLX Knife automatically prepends metadata in a **collapsible table** (collapsed by default) **before** the model output: ```
-📸 Batch 1/3: Images 1-4 +📸 Chunk 1/3: Images 1-4 | Image | Filename | Original | Location | Date | Camera | |-------|----------|----------|----------|------|--------| -| 1 | image_abc123.jpeg | beach.jpg | 📍 32.79°N, 16.92°W | 📅 2023-12-06 12:19 | 📷 Apple iPhone SE | -| 2 | image_def456.jpeg | mountain.jpg | 📍 32.87°N, 17.17°W | 📅 2023-12-10 15:42 | 📷 Apple iPhone SE | -| 3 | image_xyz789.jpeg | sunset.jpg | 📍 32.82°N, 17.05°W | 📅 2023-12-08 18:30 | 📷 Apple iPhone SE | -| 4 | image_uvw456.jpeg | forest.jpg | 📍 32.88°N, 17.12°W | 📅 2023-12-09 10:15 | 📷 Apple iPhone SE | +| 1 | image_abc123.jpeg | beach.jpg | 📍 32.7900°N, 16.9200°W | 📅 2023-12-06 12:19 | 📷 Apple iPhone SE | +| 2 | image_def456.jpeg | mountain.jpg | 📍 32.8700°N, 17.1700°W | 📅 2023-12-10 15:42 | 📷 Apple iPhone SE | +| 3 | image_xyz789.jpeg | sunset.jpg | 📍 32.8200°N, 17.0500°W | 📅 2023-12-08 18:30 | 📷 Apple iPhone SE | +| 4 | image_uvw456.jpeg | forest.jpg | 📍 32.8800°N, 17.1200°W | 📅 2023-12-09 10:15 | 📷 Apple iPhone SE |
A beach with palm trees and clear blue water. A mountain landscape with snow-capped peaks... ``` -**Batch information in summary:** -- Shows current batch and total batches (e.g., "Batch 1/3") -- Shows image range in current batch (e.g., "Images 1-4") +**Chunk information in summary:** +- Shows current chunk and total chunks (e.g., "Chunk 1/3") +- Shows image range in current chunk (e.g., "Images 1-4") - Helps track progress in WebUI and prevents confusion about which images are being described -- **Important:** Batch context prevents hallucination by making scope clear to both model and user **Why metadata comes first:** -- Vision models can **reference metadata in their analysis** (filename, GPS coordinates, date visible in prompt context) -- Clearer association with output when processing multiple chunks +- The model sees GPS, date, and camera info when analyzing images (enables location/time-aware descriptions) +- The markdown table shows you exactly what the model knows about each image +- Helps verify which description belongs to which file **Metadata includes:** - **Image ID** → **Filename mapping** (identify which description belongs to which file) - **GPS coordinates** (latitude/longitude, if available in EXIF) + - Precision: 4 decimal places (~11m accuracy) for street-level context - **Capture date/time** (ISO 8601 format) - **Camera model** (device info) @@ -410,12 +422,11 @@ mlxk run vision-model --image photo.jpg "describe" #### Limitations -- **Non-streaming:** Vision runs always use batch mode (no streaming output) -- **Image limits:** Model-dependent due to Metal buffer constraints (~41.7GB on Apple Silicon) +- **Image limits:** Model-dependent due to Metal / unified-memory constraints and peak activation usage - **pixtral-12b-8bit:** Up to 5 images tested on M2 Max 64GB (multi-image capable) - **Llama-3.2-11B / Other models:** Single-image only - **Larger models (24B+):** Limited to 1-2 images on 64GB RAM - - **Per-image:** 20 MB max, 50 MB total 
per request + - **Default server guardrails:** 20 MB per image, 50 MB total (configurable). Base64 encoding adds ~33% overhead. #### Server API @@ -799,7 +810,7 @@ Enable experimental and alpha features: | Variable | Description | Default | Since | |----------|-------------|---------|-------| -| `MLXK2_ENABLE_ALPHA_FEATURES` | Enable alpha commands (`clone`, `push`) | `0` (disabled) | 2.0.0 | +| `MLXK2_ENABLE_ALPHA_FEATURES` | Enable alpha commands (`convert`) | `0` (disabled) | 2.0.0 | | `MLXK2_ENABLE_PIPES` | Enable Unix pipe integration (`mlxk run -`) | `0` (disabled) | 2.0.4 | | `MLXK2_EXIF_METADATA` | Extract EXIF metadata from images (Vision models) | `1` (enabled) | 2.0.4 | @@ -813,10 +824,9 @@ echo "Hello" | mlxk run model - "translate to Spanish" export MLXK2_EXIF_METADATA=0 mlxk run vision-model --image photo.jpg "describe this" -# Enable alpha features for development +# Enable alpha features for convert command export MLXK2_ENABLE_ALPHA_FEATURES=1 -mlxk clone model-name ./workspace -mlxk push ./workspace org/model --private --create +mlxk convert ./broken ./fixed --repair-index ``` ### Server Configuration @@ -837,12 +847,12 @@ Control vision model behavior (Python 3.10+, beta): | Variable | Description | Default | Since | |----------|-------------|---------|-------| -| `MLXK2_VISION_BATCH_SIZE` | Default chunk size for vision image processing | `1` | 2.0.4-beta.6 | +| `MLXK2_VISION_CHUNK_SIZE` | Default chunk size for vision image processing | `1` | 2.0.4-beta.7 | **Examples:** ```bash -# Process 3 images per batch instead of 1 (faster but requires more RAM) -export MLXK2_VISION_BATCH_SIZE=3 +# Process 3 images per chunk instead of 1 (faster but requires more RAM) +export MLXK2_VISION_CHUNK_SIZE=3 mlxk run pixtral --image photos/*.jpg "Describe images" # CLI flag overrides environment variable @@ -949,7 +959,7 @@ mlxk health --json | jq '.data.summary' ``` -## Feature Gates: `clone`, `push` (Alpha), `pipe mode` (Beta) +## Workspace Features: `clone`, 
`push`, `convert` ### Workspace Structure @@ -992,25 +1002,23 @@ workspace/ ### `clone` - Model Workspace Creation -`mlxk clone` is a hidden alpha feature. Enable with `MLXK2_ENABLE_ALPHA_FEATURES=1`. It creates a local workspace from a cached model for modification and development. +`mlxk clone` creates a local workspace from a cached model for modification and development. - Creates isolated workspace from cached models - Supports APFS copy-on-write optimization on same-volume scenarios - Includes health check integration for workspace validation +- Resumable: Interrupted pulls resume automatically - Use case: Fork-modify-push workflows Example: ```bash -# Enable alpha features -export MLXK2_ENABLE_ALPHA_FEATURES=1 - # Clone model to workspace mlxk clone org/model ./workspace ``` ### `push` - Upload to Hub -`mlxk push` is a hidden alpha feature. Enable with `MLXK2_ENABLE_ALPHA_FEATURES=1`. It uploads a local folder to a Hugging Face model repository using `huggingface_hub/upload_folder`. +`mlxk push` uploads a local folder to a Hugging Face model repository using `huggingface_hub/upload_folder`. - Requires `HF_TOKEN` (write-enabled). - Default branch: `main` (explicitly override with `--branch`). @@ -1024,23 +1032,18 @@ mlxk clone org/model ./workspace - Local workspace check: use `--check-only` to validate a workspace without uploading. Produces `workspace_health` in JSON (no token/network required). - Dry-run planning: use `--dry-run` to compute a plan vs remote without uploading. Returns `dry_run: true`, `dry_run_summary {added, modified:null, deleted}`, and sample `added_files`/`deleted_files`. - Testing: see TESTING.md ("Push Testing (2.0)") for offline tests and opt-in live checks with markers/env. -- Intended for early testers only. Carefully review the result on the Hub after pushing. +- Carefully review the result on the Hub after pushing. 
- Responsibility: **You are responsible for complying with Hugging Face Hub policies and applicable laws (e.g., copyright/licensing) for any uploaded content.** Example: ```bash -# Enable alpha features -export MLXK2_ENABLE_ALPHA_FEATURES=1 - -# Use push command +# Upload to private repo mlxk push --private ./workspace org/model --create --commit "init" ``` -These features are not final and may change or be removed in future releases. +### `convert` - Workspace Transformations (Experimental) -### `convert` - Workspace Transformations (Beta) - -`mlxk convert` transforms workspaces (repair, quantize, etc.). The `--repair-index` mode is beta (feature complete) and fixes safetensors index/shard mismatches. +`mlxk convert` is **experimental** and requires `MLXK2_ENABLE_ALPHA_FEATURES=1`. Currently implements `--repair-index` for fixing safetensors index mismatches (mlx-vlm #624). Future modes like `--quantize` are planned but not yet implemented. **Use case:** Repair models affected by mlx-vlm #624 conversion bug (7+ mlx-community Vision models). @@ -1179,7 +1182,7 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.

Made with ❤️ by The BROKE team BROKE Logo
- Version 2.0.4-beta.6 | January 2026
+ Version 2.0.4-beta.7 | January 2026
💬 Web UI: nChat - lightweight chat interface🔮 Multi-node: BROKE Cluster

diff --git a/TESTING-DETAILS.md b/TESTING-DETAILS.md index f657612..6089647 100644 --- a/TESTING-DETAILS.md +++ b/TESTING-DETAILS.md @@ -4,7 +4,7 @@ This document contains version-specific details, complete file listings, and imp ## Current Status -✅ **2.0.4-beta.6** — Probe/Policy architecture complete; Vision support Phase 1-3 (CLI + Server); Pipes/Memory-Aware; EXIF metadata; **Test Portfolio Separation complete**; Workspace Infrastructure (ADR-018 Phase 0a+0b+0c); Convert Operation (ADR-018 Phase 1); Resumable Clone. +✅ **2.0.4-beta.7** — Probe/Policy architecture complete; Vision support Phase 1-3 (CLI + Server); Pipes/Memory-Aware; EXIF metadata; **Test Portfolio Separation complete**; Workspace Infrastructure (ADR-018 Phase 0a+0b+0c); Convert Operation (ADR-018 Phase 1); Resumable Clone; **Benchmark Schema v0.2.1** (Vision/Text inference modality differentiation). ### Test Results (Official Reference) @@ -12,7 +12,7 @@ This document contains version-specific details, complete file listings, and imp ``` Platform: macOS 26.2 (Tahoe), M2 Max, 64GB RAM Python: 3.9-3.14 (Multi-Python verified) -Results: 550 passed, 56 skipped +Results: 553 passed, 56 skipped (includes 4 vision chunk streaming tests) Note: Default suite works on 16GB. 
Wet-umbrella: 64GB recommended (M1 Max 32GB untested) ``` @@ -23,11 +23,11 @@ Results: 144+ passed, 21 skipped **Wet Umbrella (4-Phase Integration):** ``` -Phase 1 (wet marker): 152 passed, 34 skipped, 579 deselected +Phase 1 (wet marker): 161 passed, 72 skipped, 579 deselected (Schema v0.2.1) Phase 2 (live_pull): 3 passed, 630 deselected Phase 3 (live_clone): 3 passed, 630 deselected Phase 4 (live_vision_pipe): 3 passed (requires vision+text models, skips if unavailable) -Total: 161 passed across all phases +Total: 170 passed across all phases ``` ✅ **Production verified & reported:** M1, M1 Max, M2 Max in real-world use @@ -52,46 +52,7 @@ Total: 161 passed across all phases - **2 Show Portfolio tests** - Display text/vision portfolios separately (requires HF_HOME) - **7 Issue #27 tests** - Real-model health validation (requires HF_HOME or MLXK2_USER_HF_HOME setup) -**Portfolio Discovery** (ADR-009) is implemented in `tests_2.0/test_stop_tokens_live.py`. When `HF_HOME` is set, tests auto-discover all MLX chat models in user cache using `mlxk list --json` (production command). This ensures Issue #32 fix is validated across the full model portfolio. **Current validation:** 17 models discovered, 15 testable (60% RAM budget), 73/81 tests passing, 0 failures. Portfolio includes: Phi-3, DeepSeek-R1, GPT-oss, Llama, Qwen, Mistral, Mixtral families. - -**New coverage in 2.0.4-beta.1:** -- JSON-mode interactive rejection emits JSON on stdout with exit code 1. -- Pipe stdin semantics for `mlxk run` (`-` reads stdin, non-TTY forces batch) behind `MLXK2_ENABLE_PIPES=1`. -- **SIGPIPE handling:** Handler set to SIG_DFL for graceful pipe termination (e.g., `mlxk run | head -1`). -- **BrokenPipeError handling:** Streaming and batch output catch BrokenPipeError for robust pipe chains. 
-- **Vision CLI support (ADR-012 Phase 1-2):** Implementation with `--image` flag - - VisionRunner wraps mlx-vlm backend (non-streaming, batch-only) - - Auto-routing: Vision models use mlx-vlm; text models use mlx-lm - - **5 deterministic CLI E2E tests:** Chess position reading, OCR text extraction, color recognition, chart label reading, large image support (2.7MB validates 10MB limit) -- **Vision Server support (ADR-012 Phase 3):** HTTP API for vision requests - - Backend-aware `get_or_load_model()`: Loads MLXRunner OR VisionRunner based on policy - - `ChatMessage.content` extended for OpenAI Vision format (`Union[str, List[Dict]]`) - - Streaming graceful degradation (SSE emulation) for vision models (mlx-vlm doesn't support true streaming) - - **17 unit tests** in `test_server_vision.py` (ChatMessage, image detection, helpers) - - **3 E2E tests** in `test_vision_server_e2e.py` (Base64 image, streaming graceful degradation, text on vision server) -- **Test Portfolio Separation (CLAUDE.md):** Text and Vision models tested independently - - **Separate discovery functions:** `discover_text_models()` and `discover_vision_models()` with vision capability filtering - - **RAM calculation modularization:** Text models use 1.2x multiplier; Vision models use 0.70 threshold (ADR-016) - - **New fixtures:** `text_portfolio`, `vision_portfolio`, `text_model_info`, `vision_model_info` - - **Parametrized E2E tests:** text_XX (23 text models), vision_XX (3 vision models) - deterministic indices - - **21 new unit tests:** 10 portfolio discovery tests, 11 RAM calculation tests - - **Benchmark reporting updated:** Dynamically selects correct model_info fixture - - **Diagnostic tool:** `show_portfolios.py` displays separated portfolios with RAM estimates -- `mlx-run` wrapper entrypoint argv injection. 
-- Tests added: `tests_2.0/test_cli_run_exit_codes.py` (pipe/JSON/SIGPIPE/BrokenPipe), `tests_2.0/test_cli_run_wrapper.py`, `tests_2.0/live/test_vision_e2e_live.py` (5 vision CLI E2E tests), `tests_2.0/test_server_vision.py` (17 vision server unit tests), `tests_2.0/live/test_vision_server_e2e.py` (3 vision server E2E tests), `tests_2.0/test_portfolio_discovery.py` (10 tests), `tests_2.0/test_ram_calculation.py` (11 tests), `tests_2.0/live/test_portfolio_fixtures.py` (7 validation tests), `tests_2.0/show_portfolios.py` (diagnostic script). - -**New coverage in 2.0.4-beta.6:** -- **Vision Batch Processing (ADR-012 Phase 1c):** `--chunk N` flag for processing images in isolated batches - - Default: `--chunk 1` (incremental output, fresh VisionRunner per chunk) - - Server support: Unlimited images with safe chunking (MAX_SAFE_CHUNK_SIZE=5) - - Context-line in prompt: Batch info visible to model and user -- **Vision→Geo Pipe Integration Tests:** Smoke tests for complete pipeline (marker: `live_vision_pipe`) - - **3 tests** in `tests_2.0/live/test_pipe_vision_geo.py`: Vision batch processing, complete pipe workflow, chunk isolation - - Validates: Sessions 72-75 fixes (chunk isolation, pipe stdin + `--prompt`, server chunking) - - Uses: `tests_2.0/assets/geo-test/` (9 JPEGs with EXIF metadata) - - PASSED criteria: Process exits 0, output not empty, mentions expected terms (smoke test only, no quality metrics) - -For complete test file structure, see [Appendix](#complete-test-file-structure-201). +**Portfolio Discovery** (ADR-009) auto-discovers MLX models in user cache using `mlxk list --json`. Validates fixes across the full model portfolio with RAM-aware skipping. 
--- @@ -102,13 +63,13 @@ For complete test file structure, see [Appendix](#complete-test-file-structure-2 | Default suite | `pytest -v` | — | JSON-API (list/show/health), Human-Output, Model-Resolution, Health-Policy, Push Offline (`--check-only`, `--dry-run`), Spec/Schema checks | No | | Spec only | `pytest -m spec -v` | `spec` | Schema/contract tests, version sync, docs example validation | No | | Exclude spec | `pytest -m "not spec" -v` | `not spec` | Everything except spec/schema checks | No | -| Push offline | `pytest -k push -v` | — | Push offline tests (tests alpha feature: `--check-only`, `--dry-run`, error handling); no network, no credentials needed | No | +| Push offline | `pytest -k push -v` | — | Push offline tests (`--check-only`, `--dry-run`, error handling); no network, no credentials needed | No | | Live pipe mode | `MLXK2_ENABLE_PIPES=1 pytest -m live_e2e tests_2.0/live/test_cli_pipe_live.py -v` | `live_e2e`; Env: `HF_HOME`, `MLXK2_ENABLE_PIPES=1` | Stdin `-`, pipe auto-batch, JSON interactive error path, list→run pipe; first eligible model from portfolio discovery | No (uses local cache) | | Vision→Geo pipe | `MLXK2_ENABLE_PIPES=1 pytest -m live_vision_pipe -v` | `live_vision_pipe` (new marker); Env: `HF_HOME` (requires vision + text models), `MLXK2_ENABLE_PIPES=1`; Optional: `MLXK2_VISION_BATCH_SIZE=N` (default: 1) | **Smoke test for complete Vision→Geo pipeline.** Validates: Vision batch processing (`--chunk 1`), chunk isolation (no state leakage), pipe stdin + `--prompt` combination, geo inference. **PASSED criteria:** Process exits 0, output not empty, output mentions expected terms. Uses `tests_2.0/assets/geo-test/` (9 JPEGs with EXIF). 
| No (uses local cache) | -| Live push | `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m live_push -v` | `live_push` (subset of `wet`) + Env: `MLXK2_ENABLE_ALPHA_FEATURES=1`, `MLXK2_LIVE_PUSH=1`, `HF_TOKEN`, `MLXK2_LIVE_REPO`, `MLXK2_LIVE_WORKSPACE` | JSON push against the real Hub; on errors the test SKIPs (diagnostic) | Yes | +| Live push | `pytest -m live_push -v` | `live_push` (subset of `wet`) + Env: `MLXK2_LIVE_PUSH=1`, `HF_TOKEN`, `MLXK2_LIVE_REPO`, `MLXK2_LIVE_WORKSPACE` | JSON push against the real Hub; on errors the test SKIPs (diagnostic) | Yes | | Live list | `pytest -m live_list -v` | `live_list` (subset of `wet`) + Env: `HF_HOME` (user cache with models) | Tests list/health against user cache models | No (uses local cache) | -| Clone offline | `pytest -k clone -v` | — | Clone offline tests (tests alpha feature: APFS validation, temp cache, CoW workflow); no network needed | No | -| Live clone (ADR-007) | `MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m live_clone -v` | `live_clone` + Env: `MLXK2_ENABLE_ALPHA_FEATURES=1`, `MLXK2_LIVE_CLONE=1`, `HF_TOKEN`, `MLXK2_LIVE_CLONE_MODEL`, `MLXK2_LIVE_CLONE_WORKSPACE` | Real clone workflow: pull→temp cache→APFS same-volume clone→workspace (ADR-007 Phase 1 constraints: same volume + APFS required) | Yes | +| Clone offline | `pytest -k clone -v` | — | Clone offline tests (APFS validation, temp cache, CoW workflow); no network needed | No | +| Live clone (ADR-007) | `pytest -m live_clone -v` | `live_clone` + Env: `MLXK2_LIVE_CLONE=1`, `HF_TOKEN`, `MLXK2_LIVE_CLONE_MODEL`, `MLXK2_LIVE_CLONE_WORKSPACE` | Real clone workflow: pull→temp cache→APFS same-volume clone→workspace (ADR-007 Phase 1 constraints: same volume + APFS required) | Yes | | Live stop tokens (ADR-009) | `pytest -m live_stop_tokens -v` | `live_stop_tokens` (required); Optional: `HF_HOME` (enables portfolio discovery) | Issue #32: Validates stop token behavior with real models. 
**With HF_HOME:** Portfolio Discovery auto-discovers all MLX chat models (filter: MLX+healthy+runtime+chat), RAM-aware skip, empirical report. **Without HF_HOME:** Uses 3 predefined models (see "Optional Setup" section for model requirements). | No (uses local cache) | | Live run | `pytest -m live_run -v` | `live_run` + Env: `MLXK2_USER_HF_HOME` or `HF_HOME` (user cache with `mlx-community/Phi-3-mini-4k-instruct-4bit`) | Regression tests for Issue #37: Validates private/org MLX model framework detection in run command (renames Phi-3 to simulate private-org model) | No (uses local cache) | | Live E2E (ADR-011) | `HF_HOME=/path/to/cache pytest -m live_e2e -v` | `live_e2e` (required) + Env: `HF_HOME` (optional, enables Portfolio Discovery); Requires: `httpx` installed | **✅ Working:** Server/HTTP/CLI validation with real models. Portfolio Discovery auto-discovers all MLX chat models via `mlxk list --json` (filter: MLX+healthy+runtime+chat), parametrized tests (one server per model), RAM-aware skip. | No (uses local cache) | @@ -135,10 +96,10 @@ pytest -k "clone and not live" -v pytest -m "not spec" -v # Live Push only -MLXK2_ENABLE_ALPHA_FEATURES=1 MLXK2_LIVE_PUSH=1 HF_TOKEN=... MLXK2_LIVE_REPO=... MLXK2_LIVE_WORKSPACE=... pytest -m live_push -v +MLXK2_LIVE_PUSH=1 HF_TOKEN=... MLXK2_LIVE_REPO=... MLXK2_LIVE_WORKSPACE=... pytest -m live_push -v # Live Clone only -MLXK2_ENABLE_ALPHA_FEATURES=1 MLXK2_LIVE_CLONE=1 HF_TOKEN=... MLXK2_LIVE_CLONE_MODEL=... MLXK2_LIVE_CLONE_WORKSPACE=... pytest -m live_clone -v +MLXK2_LIVE_CLONE=1 HF_TOKEN=... MLXK2_LIVE_CLONE_MODEL=... MLXK2_LIVE_CLONE_WORKSPACE=... pytest -m live_clone -v # Live List only HF_HOME=/path/to/user/cache pytest -m live_list -v @@ -381,6 +342,75 @@ def test_my_feature(text_portfolio): **Why:** Default test run excludes ALL `live` tests via `pytest -m "not live"` (used in `test-multi-python.sh`). New live tests are automatically excluded without script changes. 
+### Fixture Guidelines (Schema v0.2.1 - Benchmark Modality Detection) + +**CRITICAL:** New live tests MUST use modality-specific fixtures for accurate benchmark reporting: + +```python +# ✅ CORRECT - Use modality-specific fixtures +def test_my_text_feature(text_model_key, text_model_info): + """Text inference test - automatically tagged as 'text' modality.""" + pass + +def test_my_vision_feature(vision_model_key, vision_model_info): + """Vision inference test - automatically tagged as 'vision' modality.""" + pass + +# ❌ DEPRECATED - Avoid legacy fixtures +def test_old_style(model_key): # Don't use - shows as "Unknown (legacy)" in reports + pass +``` + +**Available Fixtures:** + +| Fixture | Modality | Use Case | +|---------|----------|----------| +| `text_model_key` | Text | Parametrized text model tests | +| `text_model_info` | Text | Access model metadata (size, path) | +| `vision_model_key` | Vision | Parametrized vision model tests | +| `vision_model_info` | Vision | Access vision model metadata | + +**DEPRECATED Fixtures (do not use in new code):** + +| Deprecated | Replacement | Reason | +|------------|-------------|--------| +| `model_key` | `text_model_key` | No modality detection | +| `portfolio_models` | `text_portfolio` | Ambiguous modality | + +**How Modality Detection Works (Schema v0.2.1):** + +The pytest hooks in `tests_2.0/conftest.py` automatically detect inference modality: + +1. **Fixture-based detection:** Tests using `vision_model_key` → `inference_modality: "vision"` +2. **Fixture-based detection:** Tests using `text_model_key` → `inference_modality: "text"` +3. **Explicit override:** Pipe tests can set modality via `request.node.user_properties` +4. 
**Legacy fallback:** Tests without modality fixtures → `inference_modality: "unknown"` + +**Why This Matters:** + +Benchmark reports differentiate Vision vs Text inference for mixed-modality models: + +``` +Model Size Mode Tests Time RAM (GB) +pixtral-12b-8bit 12.6GB Vision 8 316.0s 17.5-29.1 +pixtral-12b-8bit 12.6GB Text 1 14.3s 20.3 +``` + +Without modality-specific fixtures, tests appear as "Unknown (legacy)" - making reports less useful. + +**Non-Parametrized Tests:** + +For tests that don't use parametrized fixtures but still need modality reporting: + +```python +@pytest.fixture(autouse=True) +def _report_text_modality(request): + """Explicitly tag non-parametrized tests as text inference.""" + request.node.user_properties.append(("inference_modality", "text")) +``` + +See `tests_2.0/live/test_cli_pipe_live.py` for an example. + ### Compatibility Rule (Technical Background) **Why separate runs?** @@ -829,7 +859,7 @@ find "$MLXK2_USER_HF_HOME/hub" -type f \ ### Copy-on-Write (CoW) Optimization -**New in 2.0.4-beta.1:** Test model copies use CoW on macOS/APFS for instant, disk-free clones. +Test model copies use CoW on macOS/APFS for instant, disk-free clones. **How it works:** - Volume detection: `_get_volume_root()` finds mount point, `_is_apfs_volume()` verifies APFS @@ -872,7 +902,7 @@ find "$MLXK2_USER_HF_HOME/hub" -type f \ ### Vision Model Health Tests (ADR-012 Phase 2) -**New in 2.0.4-beta.1:** Real vision model health validation with controlled mutations. +Real vision model health validation with controlled mutations. ```bash # Set user cache @@ -995,7 +1025,7 @@ mlxk pull mlx-community/Llama-3.2-3B-Instruct-4bit # ~4GB RAM ### E2E Tests with Portfolio Separation (ADR-011 + Portfolio Separation) -**Status:** ✅ Working (Portfolio Separation complete, CLAUDE.md) +**Status:** ✅ Working (Portfolio Separation complete) Auto-discovers and validates Server/HTTP/CLI interfaces with real models, separated into text and vision portfolios. 
@@ -1086,7 +1116,7 @@ pytest -m live_e2e --collect-only # Should work without errors # Llama-3.2-90B-Vision (46.4GB, 72.5% ratio) → ⏭️ SKIP (exceeds 70%) ``` -### max_tokens Strategy: Vision vs Text (Session 31) +### max_tokens Strategy: Vision vs Text **Problem:** Vision and text models have fundamentally different context management strategies. @@ -1108,16 +1138,11 @@ pytest -m live_e2e --collect-only # Should work without errors - **Example:** Llama-3.2-11B-Vision (128K context) → Default: 2048 max_tokens - **Implementation:** `get_effective_max_tokens_vision(runner, requested_max_tokens)` -**Future (Phase 1c - Batch Processing):** -- Vision: Processes 24 images → Batched stateless (each image independent) +**Batch Processing:** +- Vision: Processes multiple images → Batched stateless (each image independent) - Text: Receives ALL vision outputs → Full shift-window context for complex queries - Example: "Compare Image 1 and Image 15" requires text model with full history -**Test Updates (Session 31):** -- E2E Vision tests: Updated from `50-100` → `2048` tokens -- Reflects realistic server defaults (no artificial limits) -- Prevents test failures from truncated responses - ### Text Portfolio E2E Tests **Status:** ✅ Complete (Portfolio Separation) @@ -1191,7 +1216,7 @@ pytest -m live_e2e --collect-only # Should work without errors 4. **test_vision_to_text_model_switch_filters_images** (special integration test) - Tests Vision→Text model switching with conversation history - Server filters `image_url` content for text models - - Validates: Multimodal history filtering (Session 26, VISION-MULTIMODAL-HISTORY-ISSUE.md) + - Validates: Multimodal history filtering - **1 test** (uses both portfolios) **RAM Gating:** @@ -1345,7 +1370,7 @@ MLXK2_LIVE_PUSH=1 \ --- -### A5. Complete Test File Structure (2.0.4-beta.5) +### A5. 
Complete Test File Structure (2.0.4-beta.7) ``` scripts/ @@ -1391,7 +1416,7 @@ tests_2.0/ │ ├── test_vision_server_e2e.py # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX) │ └── test_vm_stat_parsing.py # vm_stat output parsing validation (macOS memory metrics) ├── test_adr004_error_logging.py # ADR-004 error logging and redaction (tokens, paths) -├── test_capabilities.py # Probe/Policy architecture (ADR-012, ADR-016, Session 18-19, 45 tests) +├── test_capabilities.py # Probe/Policy architecture (ADR-012, ADR-016, 45 tests) ├── test_cli_log_json_flag.py # CLI --log-json flag behavior and JSON log format ├── test_cli_push_args.py # Push CLI args and JSON error/output handling (offline) ├── test_cli_run_exit_codes.py # CLI exit codes + pipe/JSON regressions, stdin '-', non-TTY batch, interactive JSON error, SIGPIPE, BrokenPipeError @@ -1413,7 +1438,8 @@ tests_2.0/ ├── test_json_api_show.py # JSON API show contract (base/files/config) ├── test_legacy_formats.py # Legacy model format detection (Issue #37) ├── test_model_naming.py # Conversion rules, bijection, parsing -├── test_multimodal_filtering.py # Multimodal history filtering (Vision→Text model switching, Session 27) +├── test_model_resolution_workspace.py # Workspace path resolution tests (ADR-018, explicit path detection, prefix matching) +├── test_multimodal_filtering.py # Multimodal history filtering (Vision→Text model switching) ├── test_portfolio_discovery.py # Portfolio separation discovery tests (10 tests: text/vision filtering, RAM formulas) ├── test_push_dry_run.py # Push dry-run diff planning (added/modified/deleted) ├── test_push_extended.py # Extended push: no-op vs commit, branch/retry, .hfignore @@ -1435,7 +1461,8 @@ tests_2.0/ ├── test_stop_tokens_live.py # Stop token validation with real models (marker: live_stop_tokens, ADR-009) ├── test_token_limits.py # Dynamic token calculation; server vs run policies ├── test_vision_adapter.py # 
Vision HTTP adapter unit tests (46 tests: Base64 decoding, OpenAI format parsing, sequential images, image ID persistence) -├── test_vision_exif.py # EXIF extraction tests (ADR-017 Phase 1, 8 tests: GPS, DateTime, Camera, collapsible table, privacy controls) +├── test_vision_chunk_streaming.py # Vision chunk streaming tests (4 tests: SSE format, multi-chunk streaming, single-chunk routing, generator integration) +├── test_vision_exif.py # EXIF extraction tests (8 tests: GPS, DateTime, Camera, collapsible table, privacy controls) ├── test_workspace_sentinel.py # Workspace infrastructure tests (ADR-018 Phase 0a, 20 tests: sentinel primitives, atomic write, managed/unmanaged detection, health checks, CLI integration) └── test_convert_repair_index.py # Convert operation tests (ADR-018 Phase 1, 11 tests: rebuild_safetensors_index, cache sanctity, workspace sentinels, validation) ``` diff --git a/benchmarks/README.md b/benchmarks/README.md index 437ee63..c44fbc9 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -72,7 +72,7 @@ See `schemas/LEARNINGS-FOR-v1.0.md` for details. 
## Memory Timeline Visualization -**Tool:** `tools/memplot.py` | **Created:** Session 45 (2025-12-21) +**Tool:** `tools/memplot.py` ### Quick Start diff --git a/benchmarks/generate_benchmark_report.py b/benchmarks/generate_benchmark_report.py index bbed403..5ff4010 100644 --- a/benchmarks/generate_benchmark_report.py +++ b/benchmarks/generate_benchmark_report.py @@ -19,7 +19,7 @@ import json import sys from datetime import datetime, timezone from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional try: import jsonschema @@ -142,16 +142,35 @@ def calculate_statistics(data: List[dict]) -> Dict: degraded_ram = sum(1 for flags in quality_flags if "degraded_ram" in flags) degraded_zombies = sum(1 for flags in quality_flags if "degraded_zombies" in flags) - # Per-model statistics + # Per-model statistics (with inference modality breakdown) + # Filter: Only count actual inference tests (duration >= 0.5s) + # This excludes infrastructure tests like test_vision_model_info_fixture_works + inference_tests = [e for e in passed_with_model if e["duration"] >= 0.5] + model_stats = {} - for entry in passed_with_model: + for entry in inference_tests: model_id = entry["model"]["id"] if model_id not in model_stats: model_stats[model_id] = { "id": model_id, - "size_gb": entry["model"]["size_gb"], + "size_gb": entry["model"].get("size_gb", 0), # Default to 0 if missing (e.g., pipe tests) + # Total stats (legacy, always populated) "count": 0, "total_time": 0, + # Per-modality breakdown (NEW in v0.2.1) + "vision_count": 0, + "vision_time": 0.0, + "vision_ram_min": float("inf"), + "vision_ram_max": 0, + "text_count": 0, + "text_time": 0.0, + "text_ram_min": float("inf"), + "text_ram_max": 0, + "unknown_count": 0, + "unknown_time": 0.0, + "unknown_ram_min": float("inf"), + "unknown_ram_max": 0, + # System health (global, for backward compat) "ram_min": float("inf"), "ram_max": 0, "swap_max": 0, @@ -159,19 +178,50 @@ def 
calculate_statistics(data: List[dict]) -> Dict: } stats = model_stats[model_id] + duration = entry["duration"] + + # Update totals (always) stats["count"] += 1 - stats["total_time"] += entry["duration"] + stats["total_time"] += duration + + # Update modality-specific stats (NEW in v0.2.1) + modality = entry.get("metadata", {}).get("inference_modality", "unknown") + if modality == "vision": + stats["vision_count"] += 1 + stats["vision_time"] += duration + elif modality == "text": + stats["text_count"] += 1 + stats["text_time"] += duration + else: # "unknown" or any other value (backward compat) + stats["unknown_count"] += 1 + stats["unknown_time"] += duration + # Handle optional system_health (backward compatibility) + if "system_health" in entry: + ram_gb = entry["system_health"].get("ram_free_gb", 0) + # Update per-modality RAM stats + if modality == "vision": + stats["vision_ram_min"] = min(stats["vision_ram_min"], ram_gb) + stats["vision_ram_max"] = max(stats["vision_ram_max"], ram_gb) + elif modality == "text": + stats["text_ram_min"] = min(stats["text_ram_min"], ram_gb) + stats["text_ram_max"] = max(stats["text_ram_max"], ram_gb) + else: + stats["unknown_ram_min"] = min(stats["unknown_ram_min"], ram_gb) + stats["unknown_ram_max"] = max(stats["unknown_ram_max"], ram_gb) + + # Handle optional system_health - global stats (backward compatibility) if "system_health" in entry: stats["ram_min"] = min(stats["ram_min"], entry["system_health"].get("ram_free_gb", 0)) stats["ram_max"] = max(stats["ram_max"], entry["system_health"].get("ram_free_gb", 0)) stats["swap_max"] = max(stats["swap_max"], entry["system_health"].get("swap_used_mb", 0)) stats["zombies_max"] = max(stats["zombies_max"], entry["system_health"].get("zombie_processes", 0)) - # Per-test statistics + # Per-test statistics (use inference_tests to filter infrastructure tests) + # Group by (test_name, modality) to differentiate Vision/Text phases of same test import statistics test_stats = {} - for entry in 
passed_with_model: + for entry in inference_tests: # Extract test function name and normalize (remove parametrization) test_full = entry["test"].split("::")[-1] test_name = test_full.split("[")[0] # Remove [discovered_XX] part @@ -179,39 +229,46 @@ def calculate_statistics(data: List[dict]) -> Dict: model_id = entry["model"]["id"] model_short = model_id.replace("mlx-community/", "").split("-")[0] # Short name duration = entry["duration"] + modality = entry.get("metadata", {}).get("inference_modality", "unknown") - if test_name not in test_stats: - test_stats[test_name] = { + # Key: (test_name, modality) to separate Vision/Text phases + key = (test_name, modality) + + if key not in test_stats: + test_stats[key] = { "name": test_name, + "modality": modality, "models": set(), "runs": [], } - test_stats[test_name]["models"].add(model_id) - test_stats[test_name]["runs"].append({ + test_stats[key]["models"].add(model_id) + test_stats[key]["runs"].append({ "model": model_id, "model_short": model_short, "duration": duration }) - # Calculate aggregates per test - for test_name, stats in test_stats.items(): - durations = [r["duration"] for r in stats["runs"]] - stats["model_count"] = len(stats["models"]) - stats["median_time"] = statistics.median(durations) if durations else 0 + # Calculate aggregates per test (key is now tuple: (test_name, modality)) + for key, test_data in test_stats.items(): + durations = [r["duration"] for r in test_data["runs"]] + test_data["model_count"] = len(test_data["models"]) + test_data["median_time"] = statistics.median(durations) if durations else 0 # Find fastest and slowest - sorted_runs = sorted(stats["runs"], key=lambda r: r["duration"]) - stats["fastest"] = sorted_runs[0] if sorted_runs else None - stats["slowest"] = sorted_runs[-1] if sorted_runs else None + sorted_runs = sorted(test_data["runs"], key=lambda r: r["duration"]) + test_data["fastest"] = sorted_runs[0] if sorted_runs else None + test_data["slowest"] = sorted_runs[-1] if 
sorted_runs else None # Convert set to list for JSON serialization - stats["models"] = list(stats["models"]) + test_data["models"] = list(test_data["models"]) - # Hardware profile (from first entry, optional for backward compatibility) + # Hardware profile (scan for first entry with data, handles manual JSONL entries) hw_profile = {} - if data and "system" in data[0] and "hardware_profile" in data[0]["system"]: - hw_profile = data[0]["system"]["hardware_profile"] + for entry in data: + if "system" in entry and "hardware_profile" in entry["system"]: + hw_profile = entry["system"]["hardware_profile"] + break return { "total_tests": len(data), @@ -252,7 +309,7 @@ def generate_markdown(stats: Dict, input_file: Path, compare_file: Optional[Path """Generate Markdown report from statistics.""" version = stats["mlx_knife_version"] date = input_file.stem.split("-v")[0] # Extract date from filename - now = datetime.now(timezone.utc).isoformat() + now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S UTC") # Header md = f"""# Benchmark Report v{TEMPLATE_VERSION}: {version} @@ -371,31 +428,92 @@ Quality Flags (Thresholds: RAM <5 GB free, zombies >0): if compare_stats: md += f"""``` -{'Model':<42} {'Size':<7} {'Tests':<5} {'Time':<8} {'Old':<8} {'Δ':<8} {'Change':<10} {'RAM (GB)':<12} -{'='*42} {'='*7} {'='*5} {'='*8} {'='*8} {'='*8} {'='*10} {'='*12} +{'Model':<40} {'Size':<7} {'Mode':<6} {'Tests':<5} {'Time':<8} {'Old':<8} {'Δ':<8} {'Change':<10} {'RAM (GB)':<12} +{'='*40} {'='*7} {'='*6} {'='*5} {'='*8} {'='*8} {'='*8} {'='*10} {'='*12} """ else: md += f"""``` -{'Model':<50} {'Size':<8} {'Tests':<6} {'Time':<10} {'RAM (GB)':<20} -{'='*50} {'='*8} {'='*6} {'='*10} {'='*20} +{'Model':<50} {'Size':<8} {'Mode':<6} {'Tests':<6} {'Time':<10} {'RAM (GB)':<20} +{'='*50} {'='*8} {'='*6} {'='*6} {'='*10} {'='*20} """ for model in sorted_models: # Shorten model ID (remove mlx-community/ prefix) model_short = model['id'].replace('mlx-community/', '') - max_len = 40 if 
compare_stats else 48 + max_len = 38 if compare_stats else 48 if len(model_short) > max_len: model_short = model_short[:max_len-3] + "..." + # Global RAM range (for backward compat / fallback) ram_range = f"{model['ram_min']:.1f}-{model['ram_max']:.1f}" if compare_stats: old_model = compare_models.get(model['id']) - if old_model: + + # Separate rows per modality (same as non-comparison mode) + rows_written = 0 + + # Vision modality + if model['vision_count'] > 0: + v_ram_min = model['vision_ram_min'] + v_ram_max = model['vision_ram_max'] + if v_ram_min == float('inf'): + v_ram_range = "-" + elif v_ram_min == v_ram_max: + v_ram_range = f"{v_ram_min:.1f}" + else: + v_ram_range = f"{v_ram_min:.1f}-{v_ram_max:.1f}" + + # Get old vision stats (if available) + if old_model and old_model.get('vision_count', 0) > 0: + old_time = old_model['vision_time'] + delta = model['vision_time'] - old_time + change_pct = (delta / old_time * 100) if old_time > 0 else 0 + if change_pct > 5: + status = "⚠️" + elif change_pct < -1: + status = "✅" + else: + status = "" + change_str = f"{change_pct:+.1f}% {status}" + md += f"{model_short:<40} {model['size_gb']:>5.1f}GB {'Vision':<6} {model['vision_count']:<5} {model['vision_time']:>6.1f}s {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {v_ram_range:<12}\n" + else: + md += f"{model_short:<40} {model['size_gb']:>5.1f}GB {'Vision':<6} {model['vision_count']:<5} {model['vision_time']:>6.1f}s {'N/A':<8} {'N/A':<8} {'NEW':<10} {v_ram_range:<12}\n" + rows_written += 1 + + # Text modality + if model['text_count'] > 0: + t_ram_min = model['text_ram_min'] + t_ram_max = model['text_ram_max'] + if t_ram_min == float('inf'): + t_ram_range = "-" + elif t_ram_min == t_ram_max: + t_ram_range = f"{t_ram_min:.1f}" + else: + t_ram_range = f"{t_ram_min:.1f}-{t_ram_max:.1f}" + + # Get old text stats (if available) + if old_model and old_model.get('text_count', 0) > 0: + old_time = old_model['text_time'] + delta = model['text_time'] - old_time + change_pct = 
(delta / old_time * 100) if old_time > 0 else 0 + if change_pct > 5: + status = "⚠️" + elif change_pct < -1: + status = "✅" + else: + status = "" + change_str = f"{change_pct:+.1f}% {status}" + md += f"{model_short:<40} {model['size_gb']:>5.1f}GB {'Text':<6} {model['text_count']:<5} {model['text_time']:>6.1f}s {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {t_ram_range:<12}\n" + else: + md += f"{model_short:<40} {model['size_gb']:>5.1f}GB {'Text':<6} {model['text_count']:<5} {model['text_time']:>6.1f}s {'N/A':<8} {'N/A':<8} {'NEW':<10} {t_ram_range:<12}\n" + rows_written += 1 + + # Fallback for legacy data (no modality info) - rare in comparison mode + if rows_written == 0 and old_model: old_time = old_model['total_time'] delta = model['total_time'] - old_time change_pct = (delta / old_time * 100) if old_time > 0 else 0 - # Status indicator if change_pct > 5: status = "⚠️" elif change_pct < -1: @@ -403,37 +521,121 @@ Quality Flags (Thresholds: RAM <5 GB free, zombies >0): else: status = "" change_str = f"{change_pct:+.1f}% {status}" - md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {ram_range:<12}\n" - else: - md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {'N/A':<8} {'N/A':<8} {'NEW':<10} {ram_range:<12}\n" + md += f"{model_short:<40} {model['size_gb']:>5.1f}GB {'-':<6} {model['count']:<5} {model['total_time']:>6.1f}s {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {ram_range:<12}\n" + elif rows_written == 0: + # New model with no modality info + md += f"{model_short:<40} {model['size_gb']:>5.1f}GB {'-':<6} {model['count']:<5} {model['total_time']:>6.1f}s {'N/A':<8} {'N/A':<8} {'NEW':<10} {ram_range:<12}\n" else: - md += f"{model_short:<50} {model['size_gb']:>6.1f}GB {model['count']:<6} {model['total_time']:>8.1f}s {ram_range:<20}\n" + # Separate rows per modality (no "Mixed" ambiguity) + # Each 
modality gets its own line with specific stats + RAM + rows_written = 0 + + if model['vision_count'] > 0: + # Use modality-specific RAM range (single value if min==max) + v_ram_min = model['vision_ram_min'] + v_ram_max = model['vision_ram_max'] + if v_ram_min == float('inf'): + v_ram_range = "-" + elif v_ram_min == v_ram_max: + v_ram_range = f"{v_ram_min:.1f}" + else: + v_ram_range = f"{v_ram_min:.1f}-{v_ram_max:.1f}" + md += f"{model_short:<50} {model['size_gb']:>6.1f}GB {'Vision':<6} {model['vision_count']:<6} {model['vision_time']:>8.1f}s {v_ram_range:<20}\n" + rows_written += 1 + + if model['text_count'] > 0: + # Use modality-specific RAM range (single value if min==max) + t_ram_min = model['text_ram_min'] + t_ram_max = model['text_ram_max'] + if t_ram_min == float('inf'): + t_ram_range = "-" + elif t_ram_min == t_ram_max: + t_ram_range = f"{t_ram_min:.1f}" + else: + t_ram_range = f"{t_ram_min:.1f}-{t_ram_max:.1f}" + md += f"{model_short:<50} {model['size_gb']:>6.1f}GB {'Text':<6} {model['text_count']:<6} {model['text_time']:>8.1f}s {t_ram_range:<20}\n" + rows_written += 1 + + # Fallback for legacy data (no modality info) + if rows_written == 0: + md += f"{model_short:<50} {model['size_gb']:>6.1f}GB {'-':<6} {model['count']:<6} {model['total_time']:>8.1f}s {ram_range:<20}\n" md += "```\n\n" - # Model Categories + # Model Categories (with modality differentiation) large_models = [m for m in sorted_models if m['size_gb'] >= 20] medium_models = [m for m in sorted_models if 10 <= m['size_gb'] < 20] small_models = [m for m in sorted_models if m['size_gb'] < 10] + def format_category_stats(models_list, category_name): + """Format category statistics with Vision/Text breakdown.""" + if not models_list: + return "" + + # Collect Vision and Text stats + vision_models = [m for m in models_list if m.get('vision_count', 0) > 0] + text_models = [m for m in models_list if m.get('text_count', 0) > 0] + + output = f"{category_name}: {len(models_list)} models\n" + output += f" 
Avg size: {sum(m['size_gb'] for m in models_list) / len(models_list):.1f} GB\n" + + # Vision stats + if vision_models: + avg_vision_time = sum(m['vision_time']/m['vision_count'] for m in vision_models) / len(vision_models) + + # Collect RAM values (filter sentinel values) + vision_ram_mins = [m['vision_ram_min'] for m in vision_models if m['vision_ram_min'] != float('inf')] + vision_ram_maxs = [m['vision_ram_max'] for m in vision_models if m['vision_ram_max'] > 0] + + output += f" Vision Tests:\n" + output += f" Models tested: {len(vision_models)}\n" + output += f" Avg test time: {avg_vision_time:.1f}s\n" + + # Only output RAM range if data available + if vision_ram_mins and vision_ram_maxs: + all_vision_ram_min = min(vision_ram_mins) + all_vision_ram_max = max(vision_ram_maxs) + output += f" RAM range: {all_vision_ram_min:.1f}-{all_vision_ram_max:.1f} GB\n" + + # Text stats + if text_models: + avg_text_time = sum(m['text_time']/m['text_count'] for m in text_models) / len(text_models) + + # Collect RAM values (filter sentinel values) + text_ram_mins = [m['text_ram_min'] for m in text_models if m['text_ram_min'] != float('inf')] + text_ram_maxs = [m['text_ram_max'] for m in text_models if m['text_ram_max'] > 0] + + output += f" Text Tests:\n" + output += f" Models tested: {len(text_models)}\n" + output += f" Avg test time: {avg_text_time:.1f}s\n" + + # Only output RAM range if data available + if text_ram_mins and text_ram_maxs: + all_text_ram_min = min(text_ram_mins) + all_text_ram_max = max(text_ram_maxs) + output += f" RAM range: {all_text_ram_min:.1f}-{all_text_ram_max:.1f} GB\n" + + # Fallback for legacy data (no modality info) + if not vision_models and not text_models: + avg_time = sum(m['total_time']/m['count'] for m in models_list) / len(models_list) + avg_ram = sum(m['ram_min'] for m in models_list) / len(models_list) + output += f" Avg test time: {avg_time:.1f}s\n" + output += f" Avg min RAM: {avg_ram:.1f} GB\n" + + return output + md += "### Model 
Categories\n\n" - md += f"""``` -LARGE MODELS (≥20 GB): {len(large_models)} models - Avg size: {sum(m['size_gb'] for m in large_models) / len(large_models):.1f} GB - Avg test time: {sum(m['total_time']/m['count'] for m in large_models) / len(large_models):.1f}s - Avg min RAM: {sum(m['ram_min'] for m in large_models) / len(large_models):.1f} GB - -MEDIUM MODELS (10-20 GB): {len(medium_models)} models - Avg size: {sum(m['size_gb'] for m in medium_models) / len(medium_models):.1f} GB - Avg test time: {sum(m['total_time']/m['count'] for m in medium_models) / len(medium_models):.1f}s - Avg min RAM: {sum(m['ram_min'] for m in medium_models) / len(medium_models):.1f} GB - -SMALL MODELS (<10 GB): {len(small_models)} models - Avg size: {sum(m['size_gb'] for m in small_models) / len(small_models):.1f} GB - Avg test time: {sum(m['total_time']/m['count'] for m in small_models) / len(small_models):.1f}s - Avg min RAM: {sum(m['ram_min'] for m in small_models) / len(small_models):.1f} GB -``` -""" if large_models and medium_models and small_models else "" + if large_models or medium_models or small_models: + md += "```\n" + if large_models: + md += format_category_stats(large_models, "LARGE MODELS (≥20 GB)") + md += "\n" + if medium_models: + md += format_category_stats(medium_models, "MEDIUM MODELS (10-20 GB)") + md += "\n" + if small_models: + md += format_category_stats(small_models, "SMALL MODELS (<10 GB)") + md += "```\n" md += "\n---\n\n" @@ -444,35 +646,44 @@ SMALL MODELS (<10 GB): {len(small_models)} models # Sort tests by model count (descending) - most representative tests first sorted_tests = sorted(stats['tests'].values(), key=lambda t: t['model_count'], reverse=True) - # Build comparison lookup for tests + # Build comparison lookup for tests (key: (name, modality)) compare_tests = {} if compare_stats: - compare_tests = {t['name']: t for t in compare_stats['tests'].values()} + compare_tests = {(t['name'], t.get('modality', 'unknown')): t for t in 
compare_stats['tests'].values()} if compare_stats: md += f"""``` -{'Test Name':<40} {'Models':<7} {'Fastest':<20} {'Slowest':<20} {'Med':<6} {'Old':<6} {'Δ Med':<8} -{'='*40} {'='*7} {'='*20} {'='*20} {'='*6} {'='*6} {'='*8} +{'Test Name':<38} {'Mode':<6} {'Models':<7} {'Fastest':<18} {'Slowest':<18} {'Med':<6} {'Old':<6} {'Δ Med':<8} +{'='*38} {'='*6} {'='*7} {'='*18} {'='*18} {'='*6} {'='*6} {'='*8} """ else: md += f"""``` -{'Test Name':<50} {'Models':<7} {'Fastest':<25} {'Slowest':<25} {'Med Time'} -{'='*50} {'='*7} {'='*25} {'='*25} {'='*8} +{'Test Name':<44} {'Mode':<6} {'Models':<7} {'Fastest':<22} {'Slowest':<22} {'Med Time'} +{'='*44} {'='*6} {'='*7} {'='*22} {'='*22} {'='*8} """ for test in sorted_tests: # Shorten test name if needed - max_test_len = 38 if compare_stats else 48 + max_test_len = 36 if compare_stats else 42 test_short = test['name'] if len(test_short) > max_test_len: test_short = test_short[:max_test_len-3] + "..." + # Format modality (Vision/Text/-) + modality = test.get('modality', 'unknown') + if modality == 'vision': + mode_str = 'Vision' + elif modality == 'text': + mode_str = 'Text' + else: + mode_str = '-' + # Format fastest/slowest fastest = test['fastest'] slowest = test['slowest'] if fastest and slowest: - max_model_len = 18 if compare_stats else 23 + max_model_len = 16 if compare_stats else 20 fastest_str = f"{fastest['model_short']} ({fastest['duration']:.1f}s)" slowest_str = f"{slowest['model_short']} ({slowest['duration']:.1f}s)" if len(fastest_str) > max_model_len: @@ -483,16 +694,16 @@ SMALL MODELS (<10 GB): {len(small_models)} models med_time = test['median_time'] if compare_stats: - old_test = compare_tests.get(test['name']) + old_test = compare_tests.get((test['name'], test.get('modality', 'unknown'))) if old_test: old_med = old_test['median_time'] delta_pct = ((med_time - old_med) / old_med * 100) if old_med > 0 else 0 delta_str = f"{delta_pct:+.1f}%" - md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} 
{slowest_str:<20} {med_time:<5.1f}s {old_med:<5.1f}s {delta_str:<8}\n" + md += f"{test_short:<38} {mode_str:<6} {test['model_count']:<7} {fastest_str:<18} {slowest_str:<18} {med_time:<5.1f}s {old_med:<5.1f}s {delta_str:<8}\n" else: - md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} {slowest_str:<20} {med_time:<5.1f}s {'N/A':<6} {'NEW':<8}\n" + md += f"{test_short:<38} {mode_str:<6} {test['model_count']:<7} {fastest_str:<18} {slowest_str:<18} {med_time:<5.1f}s {'N/A':<6} {'NEW':<8}\n" else: - md += f"{test_short:<50} {test['model_count']:<7} {fastest_str:<25} {slowest_str:<25} {med_time:.1f}s\n" + md += f"{test_short:<44} {mode_str:<6} {test['model_count']:<7} {fastest_str:<22} {slowest_str:<22} {med_time:.1f}s\n" md += "```\n\n" diff --git a/benchmarks/schemas/MIGRATIONS.md b/benchmarks/schemas/MIGRATIONS.md index ac8c61f..f22714d 100644 --- a/benchmarks/schemas/MIGRATIONS.md +++ b/benchmarks/schemas/MIGRATIONS.md @@ -34,24 +34,68 @@ This document tracks schema evolution for MLX Knife test reports. 
--- +### 0.2.0 (2025-12-08) - Scheduling-Enhanced + +**Status:** Stable (used in 2.0.4-beta.3+) + +**Added fields:** +- `model.framework`: Model framework identifier (e.g., 'MLX', 'GGUF') +- `model.quantization`: Quantization format (e.g., '4bit', '8bit', 'fp16') +- `performance.model_load_time_s`: Model loading time (critical for scheduling) +- `performance.time_to_first_token_s`: User-perceived latency metric +- `performance.cleanup_time_s`: Resource release timing +- `performance.peak_ram_gb`: Peak RAM usage during inference +- `performance.stable_ram_gb`: Steady-state RAM after warmup +- `system.hardware_profile`: Detailed hardware profiling (Mac model, cores, GPU) +- `system_health`: System health metrics (swap, RAM, zombies, quality flags) +- `timeline`: Optional detailed execution timeline for bottleneck analysis + +**Design rationale:** +- Enables memory-based scheduling decisions (ADR-016) +- Supports hardware profiling for benchmark clustering +- Quality assessment flags for benchmark validation +- Backward compatible: v0.1.0 reports remain valid + +**Breaking changes:** None (additive only) + +**Migration:** N/A (automatic upgrade) + +--- + +### 0.2.1 (2026-01-09) - Inference Modality + +**Status:** Stable (used in 2.0.4-beta.7+) + +**Added fields:** +- `metadata.inference_modality`: Type of inference performed + - Values: `"vision"` | `"text"` | `"audio"` | `"video"` + - Purpose: Differentiate Vision/Text inference for multimodal models + - Example: Pixtral can do both Vision (with --image) and Text (without --image) + +**Design rationale:** +- Vision-capable models perform BOTH Vision and Text inference +- Benchmark reports need to differentiate these for accurate statistics +- Per-model stats can now show: "Vision: 136s (90%), Text: 15s (10%)" +- Future-proof for audio/video/multimodal inference types + +**Automatic detection:** +- Vision inference: Tests with `vision_model_key` fixture OR `--image` CLI arg +- Text inference: Tests with 
`text_model_key` fixture OR no `--image` arg +- Pipe tests: Explicit per-phase tagging (e.g., `[vision_phase]`, `[text_phase]`) + +**Backward compatible:** +- Old reports without `inference_modality` remain valid +- Tools gracefully degrade: show only total time for legacy entries +- Mixed data (old + new) shows "Unknown (legacy)" breakdown + +**Breaking changes:** None (additive only) + +**Migration:** N/A (automatic upgrade, optional field) + +--- + ## Future Versions (Planned) -### 0.2.0 (TBD - Phase 1, when model field stabilizes) - -**Proposed changes:** -- Make `model.id` required when `outcome == "passed"` (enforce for model tests) -- Add `model.framework_version` (mlx-lm version for reproducibility) -- Standardize `stop_tokens.workaround` enum (based on collected data) -- Add `test_type` enum (stop_tokens, performance, health, etc.) - -**Migration:** -- Scripts will backfill `model.framework_version` from git history -- `stop_tokens.workaround` will be normalized (free text → enum) -- Old reports remain valid (historical data preserved) - -**Breaking changes:** -- TBD based on Phase 0 learnings - --- ### 1.0.0 (TBD - Phase 3, community-ready) diff --git a/benchmarks/schemas/report-current.schema.json b/benchmarks/schemas/report-current.schema.json index 02d7e83..6b0a81c 120000 --- a/benchmarks/schemas/report-current.schema.json +++ b/benchmarks/schemas/report-current.schema.json @@ -1 +1 @@ -report-v0.2.schema.json \ No newline at end of file +report-v0.2.1.schema.json \ No newline at end of file diff --git a/benchmarks/schemas/report-v0.2.1.schema.json b/benchmarks/schemas/report-v0.2.1.schema.json new file mode 100644 index 0000000..dec45de --- /dev/null +++ b/benchmarks/schemas/report-v0.2.1.schema.json @@ -0,0 +1,269 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "MLX Knife Test Report v0.2.1 (Inference Modality)", + "description": "Schema v0.2.1: Adds inference_modality field for Vision/Text differentiation. 
Backward compatible with v0.2.0.", + "type": "object", + "required": ["schema_version", "timestamp", "mlx_knife_version", "test", "outcome"], + "properties": { + "schema_version": { + "type": "string", + "enum": ["0.2.1", "0.2.0", "0.1.0"], + "description": "Schema version. 0.2.1 adds inference_modality. 0.2.0 and 0.1.0 reports remain valid." + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of test execution (UTC recommended)" + }, + "mlx_knife_version": { + "type": "string", + "pattern": "^\\d+\\.\\d+\\.\\d+", + "description": "mlx-knife version that generated this report (SemVer, e.g., '2.0.4')" + }, + "test": { + "type": "string", + "description": "Test identifier (pytest nodeid format: path::test_name[params])" + }, + "outcome": { + "type": "string", + "enum": ["passed", "failed", "skipped"], + "description": "Test execution result" + }, + "duration": { + "type": "number", + "minimum": 0, + "description": "Test duration in seconds" + }, + "model": { + "type": "object", + "description": "Model under test (if applicable)", + "properties": { + "id": { + "type": "string", + "description": "HuggingFace model ID (org/name format)" + }, + "size_gb": { + "type": "number", + "minimum": 0, + "description": "Model size in gigabytes (disk)" + }, + "family": { + "type": "string", + "description": "Model family (e.g., 'phi-3', 'llama', 'qwen')" + }, + "variant": { + "type": "string", + "description": "Model variant (e.g., '4k-instruct', 'chat', 'base')" + }, + "framework": { + "type": "string", + "description": "Model framework (e.g., 'MLX', 'GGUF', 'PyTorch'). New in v0.2." + }, + "quantization": { + "type": "string", + "description": "Quantization format (e.g., '4bit', '8bit', 'fp16'). New in v0.2." 
+ } + } + }, + "performance": { + "type": "object", + "description": "Performance metrics for scheduling decisions", + "properties": { + "tokens_per_sec": { + "type": "number", + "minimum": 0, + "description": "Generation speed (tokens/second). Legacy v0.1 field." + }, + "ram_peak_mb": { + "type": "number", + "minimum": 0, + "description": "Peak RAM usage in megabytes. Legacy v0.1 field (deprecated, use peak_ram_gb)." + }, + "duration_s": { + "type": "number", + "minimum": 0, + "description": "Total inference duration in seconds. Legacy v0.1 field." + }, + "prompt_tokens": { + "type": "integer", + "minimum": 0, + "description": "Number of prompt tokens. Legacy v0.1 field." + }, + "completion_tokens": { + "type": "integer", + "minimum": 0, + "description": "Number of generated tokens. Legacy v0.1 field." + }, + "model_load_time_s": { + "type": "number", + "minimum": 0, + "description": "Model loading time in seconds. Critical for scheduling startup costs. New in v0.2." + }, + "time_to_first_token_s": { + "type": "number", + "minimum": 0, + "description": "Time to first token in seconds (user-perceived latency). New in v0.2." + }, + "cleanup_time_s": { + "type": "number", + "minimum": 0, + "description": "Cleanup/shutdown time in seconds. Critical for resource release scheduling. New in v0.2." + }, + "peak_ram_gb": { + "type": "number", + "minimum": 0, + "description": "Peak RAM usage in gigabytes during inference. Critical for memory-based scheduling. New in v0.2." + }, + "stable_ram_gb": { + "type": "number", + "minimum": 0, + "description": "Steady-state RAM usage in gigabytes (after warmup). New in v0.2." 
+ } + } + }, + "stop_tokens": { + "type": "object", + "description": "Stop token behavior (ADR-009 validation data)", + "properties": { + "configured": { + "type": "array", + "items": {"type": "string"}, + "description": "Stop tokens configured for the model" + }, + "detected": { + "type": "array", + "items": {"type": "string"}, + "description": "Stop tokens actually found in response" + }, + "workaround": { + "type": "string", + "description": "Workaround identifier (e.g., 'phi-3-dual-eos', 'none')" + }, + "leaked": { + "type": "boolean", + "description": "Whether stop tokens leaked into output (bug indicator)" + } + } + }, + "system": { + "type": "object", + "description": "System information for hardware profiling and scheduling", + "properties": { + "platform": { + "type": "string", + "description": "OS platform (e.g., 'darwin', 'linux')" + }, + "platform_version": { + "type": "string", + "description": "OS version (e.g., 'macOS 14.6', 'Ubuntu 22.04')" + }, + "python_version": { + "type": "string", + "description": "Python version (e.g., '3.11.5')" + }, + "mlx_version": { + "type": "string", + "description": "MLX framework version" + }, + "hardware": { + "type": "string", + "description": "Human-readable hardware identifier (e.g., 'M2 Max', 'M1'). Legacy v0.1 field." + }, + "ram_total_gb": { + "type": "number", + "description": "Total system RAM in GB" + }, + "hardware_profile": { + "type": "object", + "description": "Detailed hardware profile for precise scheduling. 
New in v0.2.", + "properties": { + "model_identifier": { + "type": "string", + "description": "System model identifier (e.g., 'Mac14,6' for M2 Max MacBook Pro 16-inch)" + }, + "chip": { + "type": "string", + "description": "Chip name (e.g., 'Apple M2 Max', 'Apple M3 Pro')" + }, + "chip_cores_performance": { + "type": "integer", + "minimum": 0, + "description": "Number of performance CPU cores" + }, + "chip_cores_efficiency": { + "type": "integer", + "minimum": 0, + "description": "Number of efficiency CPU cores" + }, + "gpu_cores": { + "type": "integer", + "minimum": 0, + "description": "Number of GPU cores (Metal)" + }, + "metal_version": { + "type": "string", + "description": "Metal API version (e.g., '3.1')" + } + } + } + } + }, + "system_health": { + "type": "object", + "description": "System health metrics for benchmark quality assessment. New in v0.2.", + "properties": { + "swap_gb": { + "type": "number", + "minimum": 0, + "description": "Swap usage in gigabytes at benchmark time" + }, + "ram_free_gb": { + "type": "number", + "minimum": 0, + "description": "Free RAM in gigabytes at benchmark start" + }, + "zombies_detected": { + "type": "integer", + "minimum": 0, + "description": "Number of zombie MLX processes detected before benchmark" + }, + "quality_flags": { + "type": "array", + "items": { + "type": "string", + "enum": ["clean", "degraded_swap", "degraded_zombies", "degraded_memory"] + }, + "description": "Quality assessment flags. 'clean' = ideal conditions, others indicate degraded benchmarks." + } + } + }, + "timeline": { + "type": "object", + "description": "Optional detailed execution timeline for bottleneck analysis. 
New in v0.2.", + "properties": { + "server_start_ms": {"type": "number", "minimum": 0}, + "model_load_start_ms": {"type": "number", "minimum": 0}, + "model_load_complete_ms": {"type": "number", "minimum": 0}, + "server_ready_ms": {"type": "number", "minimum": 0}, + "request_sent_ms": {"type": "number", "minimum": 0}, + "first_token_ms": {"type": "number", "minimum": 0}, + "stream_complete_ms": {"type": "number", "minimum": 0}, + "shutdown_request_ms": {"type": "number", "minimum": 0}, + "cleanup_complete_ms": {"type": "number", "minimum": 0} + } + }, + "metadata": { + "type": "object", + "description": "Extensible metadata for experimentation. New in v0.2.1: inference_modality field.", + "properties": { + "inference_modality": { + "type": "string", + "enum": ["vision", "text", "audio", "video"], + "description": "Type of inference performed (e.g., 'vision' for image inputs, 'text' for text-only). New in v0.2.1. Future-proof for multimodal." + } + }, + "additionalProperties": true + } + }, + "additionalProperties": false +} diff --git a/docs/ADR/ADR-018-Convert-Operation.md b/docs/ADR/ADR-018-Convert-Operation.md index fcc3323..668002c 100644 --- a/docs/ADR/ADR-018-Convert-Operation.md +++ b/docs/ADR/ADR-018-Convert-Operation.md @@ -1,17 +1,22 @@ # ADR-018: Convert Operation -**Status:** Partially Implemented +**Status:** Implemented (Phases 0a-0c + 1 complete in 2.0.4-beta.6) **Created:** 2025-12-18 -**Updated:** 2025-12-30 (Phase 0a+1 complete for 2.0.4-beta.5, Phase 0b+0c planned for beta.6) +**Updated:** 2026-01-10 (Gate status: clone/push production, convert experimental) **Context:** Users need to (a) quantize MLX workspaces locally without polluting the HF cache and (b) repair MLX/HF compliance issues (notably safetensors index/shard mismatches) in a deterministic way. 
**Phase Status:** - **Phase 0a:** Workspace infrastructure — ✅ Implemented (2.0.4-beta.5) -- **Phase 0b:** Resumable clone — 🚧 Planned (2.0.4-beta.6) -- **Phase 0c:** Workspace run/show/server support — 🚧 Planned (2.0.4-beta.6) +- **Phase 0b:** Resumable clone — ✅ Implemented (2.0.4-beta.6) +- **Phase 0c:** Workspace run/show/server support — ✅ Implemented (2.0.4-beta.6) - **Phase 1:** `--repair-index` — ✅ Implemented (2.0.4-beta.5) -**Note:** Phase 0b+0c complete the workspace infrastructure before 2.0.4 stable release. This enables full `clone → convert → run` workflow with resume support and no HF push requirement. +**Feature Gates (2.0.4-beta.7+):** +- `clone`, `push`: **Production** (no gate required) +- `convert`: **Experimental** (requires `MLXK2_ENABLE_ALPHA_FEATURES=1`) + - Rationale: `--quantize` not yet implemented, only `--repair-index` available + +**Note:** Complete workspace infrastructure shipped in 2.0.4-beta.6. Full `clone → convert → run/show/server` workflow with resume support, no HF push requirement. 
--- @@ -462,23 +467,21 @@ mlxk health ./ws-fixed # Should be healthy - **Files:** `mlxk2/operations/workspace.py` (NEW), `health.py` (extended), `clone.py` (integrated) - **Tests:** 20 new tests, all passing -- [ ] **Phase 0b (2.0.4-beta.6):** 🚧 Resumable clone +- [x] **Phase 0b (2.0.4-beta.6):** ✅ Resumable clone - Temp cache reuse with user prompt (analog to resumable pull) - Conditional cleanup based on workspace health - UX parity with pull operation - `--force-resume` flag for non-interactive use - - **Effort:** ~1 session + - **Status:** Complete (Sessions 67-70, beta.6) -- [ ] **Phase 0c (2.0.4-beta.6):** 🚧 Workspace run/show/server support +- [x] **Phase 0c (2.0.4-beta.6):** ✅ Workspace run/show/server support - Direct workspace execution: `mlxk run ./workspace "prompt"` - Workspace inspection: `mlxk show ./workspace` - Local dev server: `mlxk server --model ./workspace` - Central implementation in `resolve_model_for_operation()` + runners - Server: `/v1/models` shows workspace with `"owned_by": "workspace"` - - **Files:** `model_resolution.py`, `runner/__init__.py`, `vision_runner.py`, `show.py`, `server_base.py` (~85 LOC) - - **Tests:** +10-12 tests - - **Effort:** ~1 session - - **Benefit:** Complete local workflow without HF push + - **Files:** `model_resolution.py`, `runner/__init__.py`, `vision_runner.py`, `show.py`, `server_base.py` + - **Status:** Complete (Sessions 68-69, beta.6) - [x] **Phase 1 (2.0.4-beta.5):** ✅ `--repair-index` for safetensors index/shard mismatch - `rebuild_safetensors_index()` primitive diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index a5489d6..740e1d2 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -126,4 +126,4 @@ See module docstring for detailed API documentation. 
## Changelog -- **2025-12-07:** Initial version (Session 19, extracted from vision_server_leitplanken.md) +- **2025-12-07:** Initial version diff --git a/docs/SERVER-HANDBOOK.md b/docs/SERVER-HANDBOOK.md index 54c1559..b15a771 100644 --- a/docs/SERVER-HANDBOOK.md +++ b/docs/SERVER-HANDBOOK.md @@ -5,7 +5,7 @@ **Last Updated:** 2025-12-15 > **Audience:** Server operators, DevOps, API consumers -> **Not for:** Developers (see `ARCHITECTURE.md` and ADRs instead) +> **For implementation details:** See `ARCHITECTURE.md` and `docs/ADR/` (developer documentation) --- @@ -63,10 +63,20 @@ mlxk serve --host 0.0.0.0 --port 8000 } ], "max_tokens": 2048, - "temperature": 0.4 + "temperature": 0.4, + "chunk": 1 } ``` +**mlx-knife Extension Parameters:** +- `chunk` (integer, optional): Chunk size for vision processing (default: 1). Controls how many images are processed per inference session. Higher values may trigger OOM on resource-constrained systems. Maximum: 5 (enforced by server). + +**Chunk size precedence (first match wins):** +1. Request parameter `chunk` (highest priority) +2. Server startup: `mlxk server --chunk N` +3. Environment: `MLXK2_VISION_CHUNK_SIZE=N` +4. Default: 1 (maximum safety) + **Response:** ```json { @@ -114,6 +124,9 @@ mlxk serve --host 0.0.0.0 --port 8000 **List available models.** +Returns all cached models that are healthy and runtime-compatible. +Models are sorted with the preloaded model first (if any), then alphabetically. 
+ **Response:** ```json { @@ -122,13 +135,31 @@ mlxk serve --host 0.0.0.0 --port 8000 { "id": "mlx-community/Llama-3.2-3B-Instruct-4bit", "object": "model", - "created": 1702345678, - "owned_by": "mlx-community" + "owned_by": "mlx-knife-2.0", + "permission": [], + "context_length": 8192 } ] } ``` +**Fields:** +- `id`: Model identifier (HuggingFace name or workspace path) +- `object`: Always `"model"` (OpenAI-compatible) +- `owned_by`: `"mlx-knife-2.0"` for cached models, `"workspace"` for local directories +- `permission`: Empty array (OpenAI legacy field) +- `context_length`: Maximum context window in tokens (may be `null` if unavailable) + +**Why context_length matters:** + +MLX Knife uses **client-side context management** (unlike OpenAI's server-side history): +- **Vision models:** Fully stateless - client holds entire conversation history +- **Text models:** Shift-window (context_length / 2 reserved for history on server) +- **Clients need this** to manage conversation pruning and token budgets +- **Load balancing:** BROKE Cluster and similar tools use this for scheduling decisions + +Note: LM Studio provides a similar field named `max_context_length`. 
+ --- ### GET /health @@ -146,11 +177,10 @@ See `examples/vision_pipe.sh` for a practical Vision→Text pipeline example (CL **Supported:** - ✅ Base64 data URLs (`data:image/jpeg;base64,...`) - ✅ Multiple images (up to 5 per request) -- ✅ Formats: JPEG, PNG +- ✅ Formats: JPEG, PNG, GIF, WebP **Limits:** - **Per-image:** 20 MB max -- **Total:** 50 MB max per request - **Count:** 5 images max per request **Important Characteristics:** @@ -259,9 +289,10 @@ Request 3: Re-upload beach.jpg → Still Image 1 (hash match) - **Completion:** `data: [DONE]\n\n` #### Vision Models -- ⚠️ **Graceful degradation:** SSE emulation (batch result split into chunks) -- **Reason:** mlx-vlm doesn't guarantee streaming support -- **Behavior:** Returns full result via SSE format for client compatibility +- ✅ **Per-chunk streaming:** Real SSE events as each image chunk completes (2.0.4-beta.7+) +- **Multiple images:** Each chunk (1-5 images) streams as it finishes processing +- **Single image:** Behaves like batch mode (one SSE event) +- **Format:** OpenAI-compatible SSE with per-chunk deltas **Request:** ```json @@ -303,9 +334,11 @@ MLXK2_ENABLE_PIPES=1 # Unix pipe integration (2.0.4-beta.1) ### Supervised Mode (Default) **Behavior:** -- Server auto-restarts on crashes +- Handles Ctrl-C gracefully (clean shutdown with 5s timeout) +- Runs server in subprocess for improved signal handling - Logs go to stderr - `--log-json` produces 100% JSON output +- **Note:** No auto-restart on crashes (use systemd/supervisor for production) **Start:** ```bash @@ -359,7 +392,8 @@ python -m mlxk2.core.server_base **Vision Models:** - **Slower than text:** Vision Encoder adds overhead - **Per-image:** ~2-5 seconds baseline + generation time -- **Multiple images:** Linear scaling (no batching in 2.0.4-beta.1) +- **Multiple images:** Processed in chunks (default: 1, max: 5 via `--chunk`) +- **Streaming:** Each chunk delivers results immediately (see Streaming section above) ### Concurrent Requests - **Current:** 
Sequential processing (one request at a time) @@ -412,10 +446,10 @@ pip install mlx-lm "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290 ### Image Upload Fails (HTTP 400) **Common causes:** -- Image size > 20MB per image -- Total size > 50MB -- More than 5 images -- External URLs (not supported, use Base64) +- Image size > 20 MB per image +- More than 5 images per request +- Unsupported format (use JPEG, PNG, GIF, WebP) +- External URLs (not supported, use Base64 data URLs) - Invalid Base64 encoding **Solution:** Resize images, reduce count, or check encoding @@ -542,15 +576,15 @@ Clients MUST follow the OpenAI Chat Completions API format. MLX Knife is designe When switching from Vision to Text model mid-conversation: 1. **Client:** Continue sending full history (including previous image_url content) -2. **Server:** Automatically filters images for text models, preserves text context -3. **Result:** Text model sees `[Image 1: beach...]` placeholders instead of binary data +2. **Server:** Automatically filters images for text models, replaces with placeholders +3. **Result:** Text model sees `[n image(s) were attached]` instead of binary data **Example workflow:** ``` -1. Vision model: User sends beach.jpg → "Image 1 shows a beach" -2. Vision model: User sends mountain.jpg → "Image 2 shows a mountain" +1. Vision model: User sends 2 images → Model describes both +2. Vision model: User asks "What's different?" → Model compares 3. Switch to Text model: User asks "Which is better for vacation?" -4. Text model: Can reference "Image 1" and "Image 2" from context +4. 
Text model: Sees "[2 image(s) were attached]" in history, can reference the conversation ``` ### Image Deduplication diff --git a/docs/json-api-specification.md b/docs/json-api-specification.md index ab8c073..e07e87c 100644 --- a/docs/json-api-specification.md +++ b/docs/json-api-specification.md @@ -168,14 +168,14 @@ Notes: | `health` | Check model integrity and corruption | ✅ | - | | `pull` | Download models from HuggingFace | ✅ | - | | `rm` | Delete models from cache | ✅ | - | -| `clone` | Clone models to workspace directory | ✅ | `MLXK2_ENABLE_ALPHA_FEATURES=1` | -| `convert` | Repair vision model index files (--repair-index) | ✅ | `MLXK2_ENABLE_ALPHA_FEATURES=1` | -| `push` | Upload a local folder to Hugging Face (experimental) | ✅ | `MLXK2_ENABLE_ALPHA_FEATURES=1` | +| `clone` | Clone models to workspace directory | ✅ | - | +| `convert` | Workspace transformations (experimental: --repair-index) | ✅ | `MLXK2_ENABLE_ALPHA_FEATURES=1` | +| `push` | Upload a local folder to Hugging Face | ✅ | - | | `run` | Execute model inference | ✅ | - | | `serve`/`server` | OpenAI-compatible API server | ✅ | - | **Notes:** -- Commands marked with Alpha Feature require `MLXK2_ENABLE_ALPHA_FEATURES=1` environment variable to be available. +- Commands marked with `MLXK2_ENABLE_ALPHA_FEATURES=1` are experimental and require this environment variable. - **Workspace Path Support (ADR-018 Phase 0c):** Commands `show`, `run`, `serve`/`server`, and `health` now accept workspace paths (e.g., `./workspace` or `/absolute/path`) in addition to HuggingFace model IDs. Models in workspaces return `"cached": false` to distinguish them from cache-managed models. 
## Model Discovery & Metadata @@ -808,8 +808,6 @@ mlxk-json rm "locked-model" --json # Error: requires --force due t ### `mlxk-json clone --json` -**Requires:** `MLXK2_ENABLE_ALPHA_FEATURES=1` - **Usage:** ```bash mlxk-json clone "Phi-3-mini" ./workspace --json # Clone to workspace directory @@ -903,8 +901,6 @@ mlxk-json clone "microsoft/DialoGPT-small" ./workspace --json # Non-MLX model ### `mlxk-json push [--create] [--private] [--branch ] [--commit "..."] [--verbose] [--check-only] --json` -**Requires:** `MLXK2_ENABLE_ALPHA_FEATURES=1` - Behavior: - Requires `HF_TOKEN` env. - Default branch: `main` (subject to change). diff --git a/mlxk2/__init__.py b/mlxk2/__init__.py index 2084fc6..9fe41f7 100644 --- a/mlxk2/__init__.py +++ b/mlxk2/__init__.py @@ -7,4 +7,4 @@ import warnings # Issue parity with 1.1.0 (Issue #22) warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+') -__version__ = "2.0.4b6" +__version__ = "2.0.4b7" diff --git a/mlxk2/cli.py b/mlxk2/cli.py index 48bf06d..5110ef3 100644 --- a/mlxk2/cli.py +++ b/mlxk2/cli.py @@ -169,36 +169,36 @@ def main(): pull_parser.add_argument("--json", action="store_true", help="Output in JSON format") pull_parser.add_argument("--force-resume", action="store_true", help="Force resume of partial downloads without prompting") - # Clone command (alpha) - only show if alpha features enabled - if os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"): - clone_parser = subparsers.add_parser("clone", help="ALPHA: Clone a model to a local workspace") - clone_parser.add_argument("model", help="Model name to clone (org/repo[@revision])") - clone_parser.add_argument("target_dir", help="Target directory for workspace") - clone_parser.add_argument("--branch", help="Specific branch/revision to clone") - clone_parser.add_argument("--no-health-check", action="store_true", help="Skip health validation before copy") - clone_parser.add_argument("--quiet", action="store_true", help="Suppress progress output") - 
clone_parser.add_argument("--json", action="store_true", help="Output in JSON format") - clone_parser.add_argument("--force-resume", action="store_true", help="Force resume of partial downloads without prompting") + # Clone command - create local workspace from cached model + clone_parser = subparsers.add_parser("clone", help="Clone a model to a local workspace") + clone_parser.add_argument("model", help="Model name to clone (org/repo[@revision])") + clone_parser.add_argument("target_dir", help="Target directory for workspace") + clone_parser.add_argument("--branch", help="Specific branch/revision to clone") + clone_parser.add_argument("--no-health-check", action="store_true", help="Skip health validation before copy") + clone_parser.add_argument("--quiet", action="store_true", help="Suppress progress output") + clone_parser.add_argument("--json", action="store_true", help="Output in JSON format") + clone_parser.add_argument("--force-resume", action="store_true", help="Force resume of partial downloads without prompting") - # Convert command (ADR-018 Phase 1) - convert_parser = subparsers.add_parser( - "convert", - help="Convert workspace to workspace with transformations", - description="Transform model workspaces (repair-index, quantize, etc.)" - ) - convert_parser.add_argument("source", help="Source workspace path") - convert_parser.add_argument("target", help="Target workspace path") - convert_parser.add_argument( - "--repair-index", - action="store_true", - help="Rebuild model.safetensors.index.json from shards (fixes mlx-vlm #624)" - ) - convert_parser.add_argument( - "--skip-health", - action="store_true", - help="Skip health check on output (debug only)" - ) - convert_parser.add_argument("--json", action="store_true", help="Output in JSON format") + # Convert command (alpha) - only show if alpha features enabled + if os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"): + convert_parser = subparsers.add_parser( + "convert", + help="ALPHA: Convert workspace to workspace 
with transformations", + description="Transform model workspaces (repair-index, quantize, etc.)" + ) + convert_parser.add_argument("source", help="Source workspace path") + convert_parser.add_argument("target", help="Target workspace path") + convert_parser.add_argument( + "--repair-index", + action="store_true", + help="Rebuild model.safetensors.index.json from shards (fixes mlx-vlm #624)" + ) + convert_parser.add_argument( + "--skip-health", + action="store_true", + help="Skip health check on output (debug only)" + ) + convert_parser.add_argument("--json", action="store_true", help="Output in JSON format") # Remove command rm_parser = subparsers.add_parser("rm", help="Delete a model") @@ -264,25 +264,24 @@ def main(): add_help=False, ) - # Push command (alpha) - only show if alpha features enabled - if os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"): - push_parser = subparsers.add_parser("push", help="ALPHA: Upload a local folder to Hugging Face") - push_parser.add_argument("local_dir", help="Local folder to upload") - push_parser.add_argument("repo_id", help="Target repo as org/model") - push_parser.add_argument("--create", action="store_true", help="Create repository/branch if missing") - # Alpha.1 safety: require --private to avoid accidental public uploads - push_parser.add_argument( - "--private", - action="store_true", - required=True, - help="REQUIRED (alpha.1): Proceed only when targeting a private repo", - ) - push_parser.add_argument("--branch", default="main", help="Target branch (default: main)") - push_parser.add_argument("--commit", dest="commit_message", default="mlx-knife push", help="Commit message") - push_parser.add_argument("--verbose", action="store_true", help="Verbose details (human output)") - push_parser.add_argument("--check-only", action="store_true", help="Analyze workspace content; do not upload") - push_parser.add_argument("--dry-run", action="store_true", help="Compute changes against remote; do not upload") - 
push_parser.add_argument("--json", action="store_true", help="Output in JSON format") + # Push command - upload local folder to Hugging Face + push_parser = subparsers.add_parser("push", help="Upload a local folder to Hugging Face") + push_parser.add_argument("local_dir", help="Local folder to upload") + push_parser.add_argument("repo_id", help="Target repo as org/model") + push_parser.add_argument("--create", action="store_true", help="Create repository/branch if missing") + # Safety: require --private to avoid accidental public uploads + push_parser.add_argument( + "--private", + action="store_true", + required=True, + help="REQUIRED: Proceed only when targeting a private repo", + ) + push_parser.add_argument("--branch", default="main", help="Target branch (default: main)") + push_parser.add_argument("--commit", dest="commit_message", default="mlx-knife push", help="Commit message") + push_parser.add_argument("--verbose", action="store_true", help="Verbose details (human output)") + push_parser.add_argument("--check-only", action="store_true", help="Analyze workspace content; do not upload") + push_parser.add_argument("--dry-run", action="store_true", help="Compute changes against remote; do not upload") + push_parser.add_argument("--json", action="store_true", help="Output in JSON format") args = parser.parse_args() @@ -371,12 +370,6 @@ def main(): print_result(result, render_pull, args.json) elif args.command == "clone": - # Check if alpha features are enabled (should not reach here if not, but double-check) - if not os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"): - result = handle_error("CommandError", "Clone command requires MLXK2_ENABLE_ALPHA_FEATURES=1") - print_result(result, None, True) # Always JSON for this error - sys.exit(1) - # Handle branch parameter by modifying model spec model_spec = args.model if getattr(args, "branch", None): @@ -393,6 +386,12 @@ def main(): print_result(result, render_clone, args.json, quiet=getattr(args, "quiet", False)) elif 
args.command == "convert": + # Check if alpha features are enabled (should not reach here if not, but double-check) + if not os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"): + result = handle_error("CommandError", "Convert command requires MLXK2_ENABLE_ALPHA_FEATURES=1") + print_result(result, None, True) # Always JSON for this error + sys.exit(1) + from .operations.convert import convert_operation # Validate mode flags @@ -573,11 +572,6 @@ def main(): # Should never reach here (server runs indefinitely) result = {"status": "success"} elif args.command == "push": - # Check if alpha features are enabled (should not reach here if not, but double-check) - if not os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"): - result = handle_error("CommandError", "Push command requires MLXK2_ENABLE_ALPHA_FEATURES=1") - print_result(result, None, True) # Always JSON for this error - sys.exit(1) result = push_operation( local_dir=args.local_dir, repo_id=args.repo_id, diff --git a/mlxk2/core/model_resolution.py b/mlxk2/core/model_resolution.py index b37281f..b65f712 100644 --- a/mlxk2/core/model_resolution.py +++ b/mlxk2/core/model_resolution.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Tuple, Optional, List from .cache import get_current_model_cache, hf_to_cache_dir, cache_dir_to_hf -from ..operations.workspace import is_workspace_path +from ..operations.workspace import is_workspace_path, is_explicit_path def expand_model_name(model_name: str) -> str: @@ -95,14 +95,10 @@ def resolve_model_for_operation(model_spec: str) -> Tuple[Optional[str], Optiona 'Mistral-Small' → cache resolution (NOT workspace, even if local dir exists) 'ambig' → (None, None, ['model1', 'model2']) """ - # NEW: Check if model_spec is an EXPLICIT workspace path (ADR-018 Phase 0c) + # Check if model_spec is an EXPLICIT workspace path (ADR-018 Phase 0c) # Only paths starting with ./ ../ / or being . or .. 
are treated as workspace paths # This ensures "model-name" goes through cache resolution even if a local dir exists - is_explicit_path = ( - model_spec.startswith(('./', '../', '/')) or - model_spec in ('.', '..') - ) - if is_explicit_path and is_workspace_path(model_spec): + if is_explicit_path(model_spec) and is_workspace_path(model_spec): # Explicit workspace path - return absolute path, skip cache logic return (str(Path(model_spec).resolve()), None, None) diff --git a/mlxk2/core/server_base.py b/mlxk2/core/server_base.py index ce229a4..ca4bf8a 100644 --- a/mlxk2/core/server_base.py +++ b/mlxk2/core/server_base.py @@ -602,9 +602,14 @@ def get_effective_max_tokens(runner: MLXRunner, requested_max_tokens: Optional[i Text models use shift-window context management: - server_mode=True: context_length / 2 (reserve half for history) - server_mode=False: context_length (full context for CLI) + + Priority: requested_max_tokens > _default_max_tokens (from --max-tokens CLI) > dynamic calculation """ if requested_max_tokens is not None: return requested_max_tokens + elif _default_max_tokens is not None: + # Use server-wide default from CLI --max-tokens flag + return _default_max_tokens else: # Use runner's dynamic calculation with server_mode flag return runner._calculate_dynamic_max_tokens(server_mode=server_mode) @@ -621,10 +626,13 @@ def get_effective_max_tokens_vision(runner, requested_max_tokens: Optional[int]) - Vision models typically have large context (128K+), but generation is slow - 2048 tokens ≈ 1500 words, enough for detailed image descriptions - Future: Could use context_length from config if available, but 2048 is safe. 
+ Priority: requested_max_tokens > _default_max_tokens (from --max-tokens CLI) > 2048 default """ if requested_max_tokens is not None: return requested_max_tokens + elif _default_max_tokens is not None: + # Use server-wide default from CLI --max-tokens flag + return _default_max_tokens # Conservative default for vision (stateless, no history to reserve) # Vision inference is slow, so we don't want to generate 64K tokens by default @@ -637,19 +645,31 @@ def count_tokens(text: str) -> int: def _request_has_images(messages: List[ChatMessage]) -> bool: - """Check if any message contains image content (Vision API format). + """Check if LAST USER MESSAGE contains image content (Vision API format). + + OpenAI API semantics: Only images from the last user message are processed. + Historical images are preserved in context but not re-processed. + + This function determines routing (vision vs text path). + Must match actual vision processing behavior (ADR-012 Phase 3). Args: messages: List of ChatMessage objects Returns: - True if any message contains image_url content + True if the last user message contains image_url content """ - for msg in messages: - if isinstance(msg.content, list): - for item in msg.content: - if isinstance(item, dict) and item.get("type") == "image_url": - return True + # Find last user message (iterate backwards for efficiency) + for msg in reversed(messages): + if msg.role == "user": + # Check if THIS message has images + if isinstance(msg.content, list): + for item in msg.content: + if isinstance(item, dict) and item.get("type") == "image_url": + return True + # Found last user message, no images + return False + # No user messages at all (shouldn't happen with validation, but be defensive) return False @@ -807,8 +827,12 @@ async def list_models(): model_list = [] model_cache = get_current_model_cache() - # Find all model directories - models = [d for d in model_cache.iterdir() if d.name.startswith("models--")] + # Find all model directories 
(handle missing cache gracefully) + if not model_cache.exists(): + # Fresh installation or custom cache location - no models yet + models = [] + else: + models = [d for d in model_cache.iterdir() if d.name.startswith("models--")] for model_dir in models: model_name = cache_dir_to_hf(model_dir.name) @@ -1184,20 +1208,171 @@ def _process_vision_chunks_server( return "\n\n".join(all_results) +async def _stream_vision_chunks( + model_path, + model_name: str, + prompt: str, + images: List[tuple], + chunk_size: int, + image_id_map: Dict[str, int], + max_tokens: Optional[int], + temperature: float, + top_p: float, + repetition_penalty: float, + completion_id: str, + created: int, + model: str, +) -> AsyncGenerator[str, None]: + """Stream SSE events per vision chunk as they complete (OpenAI-compatible). + + Unlike _process_vision_chunks_server() which waits for all chunks, + this yields SSE events immediately after each chunk finishes. + Uses asyncio.to_thread() to keep the event loop responsive. 
+ + Args: + model_path: Path to model snapshot directory + model_name: Model name for VisionRunner + prompt: User prompt + images: Full list of (filename, bytes) tuples + chunk_size: Images per chunk + image_id_map: Pre-computed global image IDs (from conversation history) + max_tokens, temperature, top_p, repetition_penalty: Generation params + completion_id: Unique completion ID for SSE events + created: Timestamp for SSE events + model: Model name for SSE events + + Yields: + SSE event strings (data: {...}\n\n format) + """ + import asyncio + from .vision_runner import VisionRunner + + chunks = [images[i:i+chunk_size] for i in range(0, len(images), chunk_size)] + total_images = len(images) + + # Initial role event + initial_event = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [{ + "index": 0, + "delta": {"role": "assistant"}, + "finish_reason": None + }] + } + yield f"data: {json.dumps(initial_event)}\n\n" + + # Process each chunk and stream result immediately + for chunk_idx, chunk in enumerate(chunks, start=1): + logger.info( + f"Vision chunk {chunk_idx}/{len(chunks)} starting", + chunk=chunk_idx, + total_chunks=len(chunks), + images_in_chunk=len(chunk) + ) + + # Check shutdown before processing + if _shutdown_event.is_set(): + interrupt_event = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [{ + "index": 0, + "delta": {"content": "\n\n[Generation interrupted]"}, + "finish_reason": "stop" + }] + } + yield f"data: {json.dumps(interrupt_event)}\n\n" + yield "data: [DONE]\n\n" + return + + # Process chunk in thread pool (keeps event loop responsive) + # NOTE: Pass chunk_images as argument to avoid closure late-binding issues + def process_chunk(chunk_images): + with VisionRunner(model_path, model_name, verbose=False) as runner: + return runner.generate( + prompt=prompt, + images=chunk_images, + max_tokens=max_tokens, + 
temperature=temperature, + top_p=top_p, + repetition_penalty=repetition_penalty, + image_id_map=image_id_map, + total_images=total_images, + ) + + try: + chunk_result = await asyncio.to_thread(process_chunk, chunk) + except Exception as e: + logger.error(f"Vision chunk {chunk_idx}/{len(chunks)} failed: {e}") + error_event = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [{ + "index": 0, + "delta": {"content": f"\n\n[Error in chunk {chunk_idx}: {str(e)}]"}, + "finish_reason": "error" + }] + } + yield f"data: {json.dumps(error_event)}\n\n" + yield "data: [DONE]\n\n" + return + + # Content event for this chunk (with separator for multi-chunk) + separator = "\n\n" if chunk_idx < len(chunks) else "" + content_event = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [{ + "index": 0, + "delta": {"content": chunk_result + separator}, + "finish_reason": None + }] + } + yield f"data: {json.dumps(content_event)}\n\n" + + logger.info( + f"Vision chunk {chunk_idx}/{len(chunks)} streamed", + chunk=chunk_idx, + total_chunks=len(chunks), + output_length=len(chunk_result) + ) + + # Final event with finish_reason + final_event = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [{ + "index": 0, + "delta": {}, + "finish_reason": "stop" + }] + } + yield f"data: {json.dumps(final_event)}\n\n" + yield "data: [DONE]\n\n" + + async def _handle_vision_chat_completion(request: ChatCompletionRequest, runner: Any = None) -> ChatCompletionResponse: """Handle vision chat completion with images (ADR-012 Phase 3). - Non-streaming only. Uses VisionHTTPAdapter to parse OpenAI format - and VisionRunner for generation. Reuses cached model if available. + Supports per-chunk streaming for multi-image requests (stream=True yields + SSE events as each chunk completes). 
Single-chunk requests use batch mode + with optional SSE emulation. Args: request: Chat completion request runner: Pre-loaded model runner (optional, will load if not provided) """ - # Graceful degradation: ignore stream=true (mlx-vlm doesn't support streaming) - if request.stream: - logger.info("Vision request: stream=true ignored (not supported), using batch") - # Lazy import vision components (Python 3.9 compatibility) from ..tools.vision_adapter import VisionHTTPAdapter @@ -1254,7 +1429,7 @@ async def _handle_vision_chat_completion(request: ChatCompletionRequest, runner: created = int(time.time()) # Get chunk size (with env var override) - chunk_size = request.chunk if request.chunk != 1 else int(os.environ.get("MLXK2_VISION_BATCH_SIZE", "1")) + chunk_size = request.chunk if request.chunk != 1 else int(os.environ.get("MLXK2_VISION_CHUNK_SIZE", "1")) # Validate chunk size for Metal API stability from ..tools.vision_adapter import MAX_SAFE_CHUNK_SIZE @@ -1284,7 +1459,35 @@ async def _handle_vision_chat_completion(request: ChatCompletionRequest, runner: image_id_map=image_id_map, ) else: - # Multi-batch chunking - creates fresh runner per chunk + # Multi-chunk processing + if request.stream: + # True per-chunk streaming (yields SSE events as chunks complete) + logger.info( + f"Vision request: chunk streaming ({len(images)} images, chunk_size={chunk_size})", + model=request.model, + image_count=len(images), + chunk_size=chunk_size + ) + return StreamingResponse( + _stream_vision_chunks( + model_path=runner.model_path, + model_name=runner.model_name, + prompt=prompt, + images=images, + chunk_size=chunk_size, + image_id_map=image_id_map, + max_tokens=get_effective_max_tokens_vision(runner, request.max_tokens), + temperature=0.0, + top_p=request.top_p or 0.9, + repetition_penalty=request.repetition_penalty or 1.0, + completion_id=completion_id, + created=created, + model=request.model, + ), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache"} + ) 
+ # Non-streaming multi-chunk (batch mode) generated_text = _process_vision_chunks_server( model_path=runner.model_path, model_name=runner.model_name, @@ -1308,9 +1511,9 @@ async def _handle_vision_chat_completion(request: ChatCompletionRequest, runner: prompt_tokens = count_tokens(prompt) completion_tokens = count_tokens(generated_text) - # Graceful degradation: emulate SSE for stream=true + # Graceful degradation: emulate SSE for stream=true (single-chunk only) if request.stream: - logger.info("Vision request: emulating SSE stream (batch response as single event)") + logger.info("Vision request: emulating SSE stream (single-chunk batch response)") return StreamingResponse( _emulate_sse_stream(completion_id, created, request.model, generated_text), media_type="text/event-stream", @@ -1558,6 +1761,18 @@ def run_server( """Run the MLX Knife server 2.0.""" import os + # Suppress transformers/tokenizers noise (Session 89 + Session 90 fix) + # ENV variables already set by serve.py subprocess, but set logging programmatically + # IMPORTANT: Do NOT import transformers in global scope (breaks huggingface_hub downloads) + try: + from transformers import logging as transformers_logging + import logging as python_logging + transformers_logging.set_verbosity_error() + python_logging.getLogger("transformers.tokenization_utils").setLevel(python_logging.ERROR) + python_logging.getLogger("transformers.tokenization_utils_base").setLevel(python_logging.ERROR) + except ImportError: + pass # transformers not installed (optional dependency for vision) + # Import uvicorn lazily to keep module import light when server isn't used try: import uvicorn # type: ignore diff --git a/mlxk2/core/vision_runner.py b/mlxk2/core/vision_runner.py index 4de67ff..c1de325 100644 --- a/mlxk2/core/vision_runner.py +++ b/mlxk2/core/vision_runner.py @@ -14,7 +14,7 @@ import tempfile from dataclasses import dataclass from datetime import datetime from pathlib import Path -from typing import Dict, Iterable, 
Optional, Sequence, Tuple +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple from ..operations.workspace import is_workspace_path @@ -234,6 +234,8 @@ class VisionRunner: Prepends metadata (GPS coordinates, datetime, camera) to the user prompt so the model can use this information in its response. + Uses _extract_all_image_metadata() for Single Source of Truth. + Feature flag: MLXK2_VISION_METADATA_CONTEXT=0 to disable (default: enabled) Args: @@ -252,53 +254,35 @@ class VisionRunner: if not images: return prompt - # Extract EXIF for all images + # Single Source of Truth: Extract metadata once + metadata_list = VisionRunner._extract_all_image_metadata(images, image_id_map) + metadata_lines = [] - # Per-image metadata - for idx, (filename, img_bytes) in enumerate(images, 1): - # Determine image ID - if image_id_map: - content_hash = hashlib.sha256(img_bytes).hexdigest()[:8] - img_id = image_id_map.get(content_hash, idx) - else: - img_id = idx + # DO NOT add chunk context - causes models to hallucinate missing images + # Problem: "chunk 2/5" tells model 5 total exist → hallucinates others - # Add chunk context line before first image (if chunking active) - if idx == 1 and total_images and total_images > len(images): - # Calculate chunk info - chunk_size = len(images) - if image_id_map: - # Find all IDs in current chunk to determine range - chunk_ids = [] - for fn, ib in images: - ch = hashlib.sha256(ib).hexdigest()[:8] - if ch in image_id_map: - chunk_ids.append(image_id_map[ch]) - - if chunk_ids: - start_id = min(chunk_ids) - end_id = max(chunk_ids) - batch_num = (start_id - 1) // chunk_size + 1 - total_batches = (total_images + chunk_size - 1) // chunk_size - metadata_lines.append( - f"[Processing batch {batch_num}/{total_batches}: " - f"Images {start_id}-{end_id} (chunk_size={chunk_size}, {total_images} total)]" - ) - - # Extract EXIF - exif = VisionRunner._extract_exif(img_bytes) + # Per-image metadata in bracket format + # Strategy: Use 
LOCAL numbering within chunk to prevent hallucinations + # - Single image (chunk_size=1): No number, just EXIF + # - Multiple images: Local numbers (1, 2, 3...) not global + for local_idx, meta in enumerate(metadata_list, 1): + exif = meta['exif'] # Build metadata string for this image - meta_parts = [f"Image {img_id}"] + meta_parts = [] + + # Only add image reference if multiple images in this chunk + if len(metadata_list) > 1: + meta_parts.append(f"Image {local_idx}") # Local numbering if exif: if exif.gps_lat is not None and exif.gps_lon is not None: - # Format GPS coordinates + # Format GPS coordinates (4 decimals = ~11m precision for street-level accuracy) lat_dir = "N" if exif.gps_lat >= 0 else "S" lon_dir = "E" if exif.gps_lon >= 0 else "W" meta_parts.append( - f"GPS: {abs(exif.gps_lat):.2f}°{lat_dir}, {abs(exif.gps_lon):.2f}°{lon_dir}" + f"GPS: {abs(exif.gps_lat):.4f}°{lat_dir}, {abs(exif.gps_lon):.4f}°{lon_dir}" ) if exif.datetime: @@ -310,7 +294,9 @@ class VisionRunner: camera = exif.camera.replace("(", "").replace(")", "").replace("generation", "gen") meta_parts.append(f"Camera: {camera}") - if len(meta_parts) > 1: # Only add if we have metadata beyond image ID + # Add metadata line if we have any metadata + # (either image number for multi-image, or EXIF data, or both) + if meta_parts: metadata_lines.append("[" + " | ".join(meta_parts) + "]") # Prepend metadata to prompt @@ -320,6 +306,54 @@ class VisionRunner: else: return prompt + @staticmethod + def _extract_all_image_metadata( + images: Sequence[Tuple[str, bytes]], + image_id_map: Optional[Dict[str, int]] = None, + ) -> List[Dict[str, Any]]: + """Extract metadata for all images (Single Source of Truth). 
+ + Central function that extracts all metadata once, used by both: + - Bracket format (_augment_prompt_with_metadata) + - HTML table format (_add_filename_mapping) + + Args: + images: List of (filename, bytes) tuples + image_id_map: Optional mapping of content_hash -> image_id for stable numbering + + Returns: + List of metadata dicts, one per image: + { + 'image_id': int, + 'filename': str, + 'content_hash': str, + 'exif': ExifData or None, + } + """ + metadata_list = [] + + for idx, (filename, img_bytes) in enumerate(images, 1): + # Calculate content hash + content_hash = hashlib.sha256(img_bytes).hexdigest()[:8] + + # Determine image ID (stable across requests if image_id_map provided) + if image_id_map: + img_id = image_id_map.get(content_hash, idx) + else: + img_id = idx + + # Extract EXIF (respects MLXK2_EXIF_METADATA flag) + exif = VisionRunner._extract_exif(img_bytes) + + metadata_list.append({ + 'image_id': img_id, + 'filename': filename, + 'content_hash': content_hash, + 'exif': exif, + }) + + return metadata_list + @staticmethod def _extract_exif(image_bytes: bytes) -> Optional[ExifData]: """ @@ -378,15 +412,41 @@ class VisionRunner: exif.gps_lat = lat exif.gps_lon = lon - # Extract DateTime (tag 36867 = DateTimeOriginal, 306 = DateTime) - dt_original = exif_data.get(36867) or exif_data.get(306) - if dt_original: - try: - # EXIF format: "2023:12:06 12:19:21" - dt = datetime.strptime(str(dt_original), "%Y:%m:%d %H:%M:%S") - exif.datetime = dt.isoformat() # Convert to ISO 8601 - except Exception: - pass + # Extract DateTime + # Priority: + # 1. GPS-Timestamp (Tag 7+29) - precise UTC, cannot be misconfigured + # 2. DateTimeOriginal (Tag 36867) - fallback if no GPS + # 3. 
Tag 306 (DateTime) - NEVER use, gets modified by image editors + # + # Phase 1: Date only (time component not displayed in metadata table) + # Always UTC for consistency + dt = None + + # Try GPS timestamp first (Tag 29 = GPSDateStamp, Tag 7 = GPSTimeStamp) + if gps_info: + gps_date = gps_dict.get("GPSDateStamp") # "2025:05:11" + gps_time = gps_dict.get("GPSTimeStamp") # (12, 40, 22) + + if gps_date and gps_time: + try: + # Combine GPS date + time (always UTC) + gps_dt_str = f"{gps_date} {int(gps_time[0]):02d}:{int(gps_time[1]):02d}:{int(gps_time[2]):02d}" + dt = datetime.strptime(gps_dt_str, "%Y:%m:%d %H:%M:%S") + except Exception: + pass # Fall through to DateTimeOriginal + + # Fallback: DateTimeOriginal (Tag 36867) + if not dt: + dt_original = exif_data.get(36867) + if dt_original: + try: + # EXIF format: "2023:12:06 12:19:21" + dt = datetime.strptime(str(dt_original), "%Y:%m:%d %H:%M:%S") + except Exception: + pass + + if dt: + exif.datetime = dt.isoformat() # Convert to ISO 8601 # Extract Camera model (tag 272 = Model) camera = exif_data.get(272) @@ -458,26 +518,20 @@ class VisionRunner: Returns: Result with prepended filename mapping (metadata before model output) """ - # Extract EXIF data (optional, controlled by feature flag) + # Single Source of Truth: Extract metadata once + metadata_list = VisionRunner._extract_all_image_metadata(images, image_id_map) + + # Check if EXIF is enabled exif_enabled = os.environ.get("MLXK2_EXIF_METADATA") != "0" - exif_list = [] - if exif_enabled: - for _, raw_bytes in images: - exif_list.append(VisionRunner._extract_exif(raw_bytes)) - # Build table rows + # Build table rows from metadata rows = [] - for i, (filename, raw_bytes) in enumerate(images, 1): - if image_id_map: - # Use history-based stable IDs - content_hash = hashlib.sha256(raw_bytes).hexdigest()[:8] - img_id = image_id_map.get(content_hash, i) # Fallback to sequential - else: - # CLI mode: request-scoped sequential IDs - img_id = i + for meta in metadata_list: + 
img_id = meta['image_id'] + content_hash = meta['content_hash'] + filename = meta['filename'] + exif = meta['exif'] - # Compute hashed filename for display - content_hash = hashlib.sha256(raw_bytes).hexdigest()[:8] hashed_name = f"image_{content_hash}.jpeg" # Build row with optional EXIF columns @@ -485,8 +539,6 @@ class VisionRunner: if exif_enabled: # EXIF mode enabled: Always show Original + metadata columns - exif = exif_list[i - 1] if i <= len(exif_list) else None - # Original filename (always show when exif_enabled) row += f" | {Path(filename).name}" @@ -495,7 +547,7 @@ class VisionRunner: if exif.gps_lat is not None and exif.gps_lon is not None: lat_dir = "N" if exif.gps_lat >= 0 else "S" lon_dir = "E" if exif.gps_lon >= 0 else "W" - row += f" | 📍 {abs(exif.gps_lat):.2f}°{lat_dir}, {abs(exif.gps_lon):.2f}°{lon_dir}" + row += f" | 📍 {abs(exif.gps_lat):.4f}°{lat_dir}, {abs(exif.gps_lon):.4f}°{lon_dir}" else: row += " | -" @@ -547,7 +599,7 @@ class VisionRunner: chunk_size = len(images) batch_num = (start_id - 1) // chunk_size + 1 total_batches = (total_images + chunk_size - 1) // chunk_size - mapping += f"📸 Batch {batch_num}/{total_batches}: Images {start_id}-{end_id}\n\n" + mapping += f"📸 Chunk {batch_num}/{total_batches}: Images {start_id}-{end_id}\n\n" else: # Fallback if chunk_ids calculation fails mapping += f"📸 Image Metadata ({count} image{'s' if count != 1 else ''})\n\n" diff --git a/mlxk2/operations/clone.py b/mlxk2/operations/clone.py index 3ef6e5b..80c7621 100644 --- a/mlxk2/operations/clone.py +++ b/mlxk2/operations/clone.py @@ -199,14 +199,14 @@ def clone_operation(model_spec: str, target_dir: str, health_check: bool = True, # Phase 6b: Write workspace sentinel (ADR-018 Phase 0a) # Sentinel written AFTER clone success, BEFORE declaring operation complete - from datetime import datetime + from datetime import datetime, timezone # Extract commit hash if available from pull result commit_hash = pull_result["data"].get("commit_hash") metadata = { 
"mlxk_version": __version__, - "created_at": datetime.utcnow().isoformat() + "Z", + "created_at": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), "source_repo": resolved_model, "source_revision": commit_hash, "managed": True, diff --git a/mlxk2/operations/convert.py b/mlxk2/operations/convert.py index 31e6240..9b31e14 100644 --- a/mlxk2/operations/convert.py +++ b/mlxk2/operations/convert.py @@ -21,7 +21,7 @@ Philosophy: import json import logging import subprocess -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Dict, Any @@ -253,7 +253,7 @@ def convert_operation( target_metadata = { "mlxk_version": __version__, - "created_at": datetime.utcnow().isoformat() + "Z", + "created_at": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), "source_repo": src_metadata.get("source_repo", str(src)), "source_revision": src_metadata.get("source_revision"), "managed": True, diff --git a/mlxk2/operations/list.py b/mlxk2/operations/list.py index 23df761..41079dd 100644 --- a/mlxk2/operations/list.py +++ b/mlxk2/operations/list.py @@ -1,9 +1,52 @@ """List models operation for MLX-Knife 2.0.""" +from pathlib import Path from typing import Dict, Any, Optional, Tuple from ..core.cache import get_current_model_cache, cache_dir_to_hf from .common import build_model_object +from .workspace import find_matching_workspaces, is_explicit_path + + +def _compute_display_name(workspace_path: Path, pattern: str) -> str: + """Compute display name for workspace based on input pattern. + + If pattern is relative (./..., ../...), return relative path. + If pattern is absolute (/...), return absolute path. 
+ + Args: + workspace_path: Resolved absolute path to workspace + pattern: Original input pattern + + Returns: + Display name matching input pattern style + """ + if pattern.startswith('/'): + # Absolute pattern → absolute output + return str(workspace_path) + + # Relative pattern → relative output + try: + pattern_path = Path(pattern).expanduser() + pattern_resolved = pattern_path.resolve() + + # Case 1: Exact workspace match (pattern points to this workspace) + # Display name is the workspace directory name + if pattern_resolved == workspace_path: + return workspace_path.name + + # Case 2: Directory scan (pattern is parent directory containing workspace) + # Display name is relative to that directory + if pattern_path.exists() and pattern_path.is_dir(): + return str(workspace_path.relative_to(pattern_resolved)) + + # Case 3: Prefix match (pattern is partial name) + # Display name is relative to parent directory + search_dir = pattern_path.parent.resolve() + return str(workspace_path.relative_to(search_dir)) + except ValueError: + # Can't compute relative path, fall back to absolute + return str(workspace_path) def _latest_snapshot(model_path) -> Tuple[Optional[str], Optional[object]]: @@ -20,13 +63,44 @@ def _latest_snapshot(model_path) -> Tuple[Optional[str], Optional[object]]: def list_models(pattern: str = None) -> Dict[str, Any]: """List all models in cache with JSON output. - + Args: - pattern: Optional pattern to filter models (case-insensitive substring match) + pattern: Optional pattern to filter models (case-insensitive substring match), + or a workspace path pattern to list local models. 
+ + Workspace patterns (start with ./, ../, or /): + - Exact: "./my-model" → list single workspace + - Prefix: "./gemma-" → list all workspaces starting with "gemma-" """ + # Check for workspace path patterns first (ADR-018 Phase 0c compatibility) + # Explicit paths (./foo, ../foo, /foo) are always treated as workspace patterns + if pattern and is_explicit_path(pattern): + workspace_matches = find_matching_workspaces(pattern) + models = [] + for workspace_path in workspace_matches: + model_obj = build_model_object( + str(workspace_path), # hf_name = absolute path for workspaces + workspace_path, # model_root + workspace_path # selected_path (no snapshots in workspace) + ) + # Add display_name for human output (respects input pattern style) + model_obj["display_name"] = _compute_display_name(workspace_path, pattern) + models.append(model_obj) + # Return workspace results (may be empty if no matches) + # Do NOT fall through to cache search for explicit paths + return { + "status": "success", + "command": "list", + "data": { + "models": models, + "count": len(models) + }, + "error": None + } + models = [] model_cache = get_current_model_cache() - + if not model_cache.exists(): return { "status": "success", diff --git a/mlxk2/operations/push.py b/mlxk2/operations/push.py index 92c334c..3dfbb8c 100644 --- a/mlxk2/operations/push.py +++ b/mlxk2/operations/push.py @@ -19,6 +19,7 @@ import json as _json # Import APFS check from clone operation and cache utilities from mlxk2.operations.clone import _is_apfs_filesystem from mlxk2.core.cache import get_current_cache_root +from mlxk2.operations.workspace import is_explicit_path, find_matching_workspaces DEFAULT_PUSH_BRANCH = "main" @@ -78,9 +79,39 @@ def push_operation( } return result - # 2) Local folder + # 2) Local folder - with explicit path pattern support p = Path(local_dir) - if not p.exists() or not p.is_dir(): + + # If path exists as a directory, use it directly (backward compatible) + # Only use pattern matching for 
non-existent paths (prefix patterns) + if p.exists() and p.is_dir(): + # Direct path - use as-is (even without config.json for backward compat) + pass + elif is_explicit_path(local_dir): + # Explicit path pattern that doesn't exist as directory + # Could be: prefix pattern (./gemma-) or directory scan (.) + matches = find_matching_workspaces(local_dir) + if len(matches) == 0: + result["status"] = "error" + result["error"] = { + "type": "workspace_not_found", + "message": f"No workspace found matching: {local_dir}", + } + return result + elif len(matches) > 1: + # Ambiguous pattern - multiple workspaces match + match_names = [m.name for m in matches] + result["status"] = "error" + result["error"] = { + "type": "ambiguous_workspace", + "message": f"Ambiguous pattern '{local_dir}' matches {len(matches)} workspaces: {', '.join(match_names)}. Please specify exact path.", + "matches": [str(m) for m in matches], + } + return result + else: + # Exactly one match - use it + p = matches[0] + else: result["status"] = "error" result["error"] = { "type": "workspace_not_found", diff --git a/mlxk2/operations/run.py b/mlxk2/operations/run.py index 92169f1..6bd0b60 100644 --- a/mlxk2/operations/run.py +++ b/mlxk2/operations/run.py @@ -270,6 +270,20 @@ def run_model( Returns: Generated text on success, "Error: ..." 
string on failure (both modes) """ + # Suppress transformers/tokenizers noise (Session 89 + Session 90 fix) + # Set ENV variables for subprocess/tokenizer + os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" + os.environ["TOKENIZERS_PARALLELISM"] = "false" + # IMPORTANT: Do NOT import transformers in global scope (breaks huggingface_hub downloads) + try: + from transformers import logging as transformers_logging + import logging as python_logging + transformers_logging.set_verbosity_error() + python_logging.getLogger("transformers.tokenization_utils").setLevel(python_logging.ERROR) + python_logging.getLogger("transformers.tokenization_utils_base").setLevel(python_logging.ERROR) + except ImportError: + pass # transformers not installed (optional dependency for vision) + json_mode = json_output # Pre-flight check: Verify runtime compatibility before attempting to load # This is a "best effort" check - if the model is in cache, verify it's compatible @@ -414,7 +428,7 @@ def run_model( try: # Get chunk size (with env var override) - chunk_size = chunk if chunk != 1 else int(os.environ.get("MLXK2_VISION_BATCH_SIZE", "1")) + chunk_size = chunk if chunk != 1 else int(os.environ.get("MLXK2_VISION_CHUNK_SIZE", "1")) # Validate chunk size for Metal API stability from ..tools.vision_adapter import MAX_SAFE_CHUNK_SIZE diff --git a/mlxk2/operations/serve.py b/mlxk2/operations/serve.py index 0e4e5d6..0315d8e 100644 --- a/mlxk2/operations/serve.py +++ b/mlxk2/operations/serve.py @@ -26,6 +26,12 @@ def _run_supervised_uvicorn(host: str, port: int, log_level: str, reload: bool = env["MLXK2_HOST"] = host env["MLXK2_PORT"] = str(port) env["MLXK2_LOG_LEVEL"] = log_level + + # Suppress transformers/tokenizers noise in server subprocess (Session 89 + Session 90 fix) + # IMPORTANT: Set in subprocess ENV, NOT in global __init__.py (breaks huggingface_hub downloads) + env["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" + env["TOKENIZERS_PARALLELISM"] = "false" # Prevent fork warning in 
uvicorn/multiprocessing + if reload: env["MLXK2_RELOAD"] = "1" @@ -127,8 +133,10 @@ def start_server( os.environ["MLXK2_LOG_LEVEL"] = log_level if model: os.environ["MLXK2_PRELOAD_MODEL"] = model + if max_tokens is not None: + os.environ["MLXK2_MAX_TOKENS"] = str(max_tokens) if chunk != 1: - os.environ["MLXK2_VISION_BATCH_SIZE"] = str(chunk) + os.environ["MLXK2_VISION_CHUNK_SIZE"] = str(chunk) if verbose: print("Starting MLX Knife Server 2.0...") @@ -143,7 +151,12 @@ def start_server( if supervise: # Delegate to subprocess-managed uvicorn (env vars already set above) - _ = _run_supervised_uvicorn(host=host, port=port, log_level=log_level, reload=reload) + exit_code = _run_supervised_uvicorn(host=host, port=port, log_level=log_level, reload=reload) + # Propagate failure exit codes to caller (for CI/CD) + # Python's Popen.wait() returns negative values for signal deaths (-SIGTERM=-15, -SIGKILL=-9) + # Any non-zero exit code indicates failure and should be propagated + if exit_code != 0: + sys.exit(exit_code) return # Default: run uvicorn in-process diff --git a/mlxk2/operations/workspace.py b/mlxk2/operations/workspace.py index 364a7b7..4e4014d 100644 --- a/mlxk2/operations/workspace.py +++ b/mlxk2/operations/workspace.py @@ -142,6 +142,40 @@ def read_workspace_metadata(workspace_path: Path) -> Dict[str, Any]: return {} +def is_explicit_path(pattern: str) -> bool: + """Check if pattern is an explicit filesystem path (not an HF model ID). + + Only paths with explicit path markers are treated as filesystem paths. + This ensures "model-name" goes through cache resolution even if a local dir exists. 
+ + Args: + pattern: The pattern string to check + + Returns: + True if pattern is an explicit path, False otherwise + + Examples: + >>> is_explicit_path("./gemma-3n") + True + >>> is_explicit_path("../parent/model") + True + >>> is_explicit_path("/abs/path/model") + True + >>> is_explicit_path(".") + True + >>> is_explicit_path("mlx-community/Phi-3") + False # HF model ID + >>> is_explicit_path("my-model") + False # Ambiguous, treated as HF ID + """ + if not pattern or not isinstance(pattern, str): + return False + return ( + pattern.startswith(('./', '../', '/')) or + pattern in ('.', '..') + ) + + def is_workspace_path(path) -> bool: """Check if path points to a workspace directory (managed or unmanaged). @@ -168,3 +202,77 @@ def is_workspace_path(path) -> bool: return p.exists() and (p / "config.json").exists() except (TypeError, OSError): return False + + +def find_matching_workspaces(pattern: str) -> list: + """Find all workspace directories matching an explicit path pattern. + + Supports three modes: + 1. Exact match: Pattern points to existing workspace directory + 2. Directory scan: Pattern is existing directory (not workspace) → find all workspaces inside + 3. Prefix match: Pattern is partial path → find directories starting with prefix + + Args: + pattern: Explicit path pattern (e.g., "./gemma-" or "/path/to/model" or ".") + Must start with ./, ../, / or be . or .. + + Returns: + List of Path objects for matching workspaces (directories with config.json). + Empty list if pattern is not an explicit path or no matches found. 
+ + Examples: + >>> find_matching_workspaces("./gemma-3n-E2B-it-4bit") + [PosixPath('/path/to/gemma-3n-E2B-it-4bit')] # Exact match + + >>> find_matching_workspaces(".") + [PosixPath('/path/to/model1'), PosixPath('/path/to/model2')] # Directory scan + + >>> find_matching_workspaces("./gemma-") + [PosixPath('/path/to/gemma-3n-E2B-it-4bit'), + PosixPath('/path/to/gemma-3n-E2B-it-FIXED-4bit')] # Prefix match + + >>> find_matching_workspaces("mlx-community/Phi-3") + [] # Not an explicit path + """ + if not is_explicit_path(pattern): + return [] + + try: + p = Path(pattern).expanduser() + + # Case 1: Exact match - pattern is already a complete workspace + if is_workspace_path(p): + return [p.resolve()] + + # Case 2: Directory scan - pattern is existing directory (not a workspace) + # Find all workspaces inside this directory + if p.exists() and p.is_dir(): + matches = [] + for entry in p.iterdir(): + if entry.is_dir() and (entry / "config.json").exists(): + matches.append(entry.resolve()) + matches.sort(key=lambda x: x.name) + return matches + + # Case 3: Prefix match - find directories starting with pattern + parent = p.parent + prefix = p.name + + if not parent.exists() or not parent.is_dir(): + return [] + + # Find all directories in parent that start with prefix + matches = [] + for entry in parent.iterdir(): + if entry.is_dir() and entry.name.startswith(prefix): + # Only include if it's a valid workspace (has config.json) + if (entry / "config.json").exists(): + matches.append(entry.resolve()) + + # Sort by name for consistent output + matches.sort(key=lambda p: p.name) + return matches + + except (TypeError, OSError) as e: + logger.debug(f"Error finding workspaces for pattern '{pattern}': {e}") + return [] diff --git a/mlxk2/output/human.py b/mlxk2/output/human.py index 4e064cc..557986c 100644 --- a/mlxk2/output/human.py +++ b/mlxk2/output/human.py @@ -151,7 +151,9 @@ def render_list(data: Dict[str, Any], show_health: bool, show_all: bool, verbose rows: 
List[List[str]] = [] for m in filtered: - name = str(m.get("name", "-")) + # Use display_name for human output if available (workspace paths) + # Otherwise fall back to name (cache models) + name = str(m.get("display_name") or m.get("name", "-")) if not verbose and name.startswith("mlx-community/"): # Compact name without the default org prefix name = name.split("/", 1)[1] diff --git a/pyproject.toml b/pyproject.toml index 7723681..cf098b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ dev = [ "mypy>=1.5.0", ] vision = [ - "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c536165df2b3b4aece3a795b2e414349f935e750", # Vision Language Models support (ADR-012, requires Python 3.10+; beta.4 uses git commit with Pixtral pad_token fix, will switch to v0.3.10 PyPI when released) + "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@fc8c92e31983a52761f37d503f903ec40bebbd62", # Vision Language Models support (ADR-012, requires Python 3.10+; beta.7 uses upstream main (MXFP4 support), will switch to v0.3.10 PyPI when released) ] [tool.setuptools] diff --git a/pytest.ini b/pytest.ini index 7174926..4d0d592 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,7 @@ testpaths = tests_2.0 python_files = test_*.py python_classes = Test* python_functions = test_* +addopts = -m "not live" markers = spec: JSON API contract tests (current spec only) live: Umbrella marker for ALL tests requiring real models/network (excluded from default run) diff --git a/scripts/benchmark-memmon.sh b/scripts/benchmark-memmon.sh new file mode 100755 index 0000000..c9d1de4 --- /dev/null +++ b/scripts/benchmark-memmon.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Benchmark Memory Monitoring Script +# Created: 2026-01-13 (Session 95) +# Purpose: Pre-Phase for ADR-013 Community Model Quality Database +# +# Evolution Roadmap: +# ================== +# Phase 0 (COMPLETE): Fork of wet-memmon with all wet tests +# - Baseline: All 161 wet tests (98 with model, 63 infrastructure) +# - Purpose: 
Establish benchmark methodology with memmon integration +# - Output: JSONL + memplot for analysis +# +# Phase 1 (CURRENT): Pure inference tests only +# - Goal: Filter out infrastructure/fixture tests +# - Target: ~94 real inference tests (duration >= 0.5s) +# - Method: Use @pytest.mark.benchmark_inference filter +# - Selection criteria: +# * Model loaded + inference performed +# * Meaningful prompt + response validation +# * Representative of real-world usage +# * Not infrastructure (portfolio discovery, fixture validation) +# +# Phase 2 (Refinement): Curate benchmark test set +# - Goal: Select subset of high-value tests for benchmarking +# - Criteria: +# * Stop token detection (critical quality metric) +# * Performance representative (tokens/second) +# * Vision vs Text coverage +# * Model size distribution (small/medium/large) +# - Target: ~30-40 tests (balanced portfolio) +# +# Phase 3 (Template): Independent benchmark suite +# - Goal: Create standalone mlxk-benchmark package (ADR-013 Phase 1) +# - Features: +# * Dedicated benchmark command (mlxk benchmark) +# * Standardized prompts (deterministic, temperature=0) +# * JSON report generation (schema v1.0) +# * Community contribution workflow +# - Separation: Benchmark ≠ E2E tests (different purposes) +# +# Related: +# - ADR-013: Community Model Quality Database (PROPOSED) +# - wet-memmon.sh: Parent script (Session 56-57, memory debugging) +# - wet-umbrella.sh: E2E test suite (161 tests) +# +# Usage: +# ./scripts/benchmark-memmon.sh SIGNATURE +# +# Example: +# ./scripts/benchmark-memmon.sh baseline-v1 +# +# Output: +# benchmarks/reports/YYYY-MM-DD-benchmark-memory-SIGNATURE.jsonl +# benchmarks/reports/YYYY-MM-DD-benchmark-benchmark-SIGNATURE.jsonl +# (Note: 'benchmark-benchmark' naming will be refined in Phase 1) + +if [ -z "$1" ]; then + echo "Usage: $0 " + echo "Example: $0 baseline-v1" + echo "" + echo "Creates benchmark reports with memory monitoring (Pre-Phase for ADR-013)" + exit 1 +fi + +SIGNATURE="$1" +DATE=$(date +%Y-%m-%d) + +echo 
"=== Benchmark Memory Monitoring ===" +echo "Phase: 1 (Filtered - pure inference only)" +echo "Signature: ${SIGNATURE}" +echo "Date: ${DATE}" +echo "" +echo "Output files:" +echo " - benchmarks/reports/${DATE}-benchmark-memory-${SIGNATURE}.jsonl" +echo " - benchmarks/reports/${DATE}-benchmark-benchmark-${SIGNATURE}.jsonl" +echo "" +echo "Running tests with memory monitoring..." +echo "" + +# Run filtered inference tests with memory monitoring +# Phase 1: Use benchmark_inference marker to filter pure inference tests (~94 tests) +# Phase 0 (baseline): Use -m wet for all 161 tests (no longer default) +env MLXK2_ENABLE_PIPES=1 python -u benchmarks/tools/memmon.py \ + --output benchmarks/reports/${DATE}-benchmark-memory-${SIGNATURE}.jsonl -- \ + pytest -m "wet and benchmark_inference" -v -s --tb=no --report-output=benchmarks/reports/${DATE}-benchmark-benchmark-${SIGNATURE}.jsonl -o addopts="" + +echo "" +echo "=== Benchmark Complete ===" +echo "" +echo "Next steps:" +echo "1. Review report: benchmarks/reports/BENCHMARK-v1.0-2.0.4b7-${DATE}-benchmark-benchmark-${SIGNATURE}.md" +echo "2. Generate markdown report:" +echo " python benchmarks/generate_benchmark_report.py benchmarks/reports/${DATE}-benchmark-benchmark-${SIGNATURE}.jsonl" +echo "3. 
Analyze memory timeline:" +echo " python benchmarks/tools/memplot.py benchmarks/reports/${DATE}-benchmark-memory-${SIGNATURE}.jsonl" +echo "" +echo "Phase 1 complete: Filtered pure inference tests (~94 tests)" +echo "Next: Phase 2 (Curation) - select 30-40 high-value tests for dedicated benchmarking" +echo "See BENCHMARK-EVOLUTION.md for roadmap: benchmarks/BENCHMARK-EVOLUTION.md" diff --git a/scripts/test-wet-memmom.sh b/scripts/test-wet-memmom.sh index 0bde0a1..274ce43 100755 --- a/scripts/test-wet-memmom.sh +++ b/scripts/test-wet-memmom.sh @@ -31,6 +31,6 @@ fi SIGNATURE="$1" DATE=$(date +%Y-%m-%d) -python -u benchmarks/tools/memmon.py \ +env MLXK2_ENABLE_PIPES=1 python -u benchmarks/tools/memmon.py \ --output benchmarks/reports/${DATE}-wet-memory-${SIGNATURE}.jsonl -- \ - venv310/bin/pytest -m wet -v -s --tb=no --report-output=benchmarks/reports/${DATE}-wet-benchmark-${SIGNATURE}.jsonl + pytest -m wet -v -s --tb=no --report-output=benchmarks/reports/${DATE}-wet-benchmark-${SIGNATURE}.jsonl -o addopts="" diff --git a/scripts/test-wet-umbrella.sh b/scripts/test-wet-umbrella.sh index 1d017be..d99426f 100755 --- a/scripts/test-wet-umbrella.sh +++ b/scripts/test-wet-umbrella.sh @@ -14,25 +14,26 @@ PYTEST_OPTS="--tb=no --capture=sys" # Run 1: Compatible live tests (User Cache READ + Workspace) echo "" echo "📦 Phase 1: User Cache READ tests (wet umbrella)..." -MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m wet -v $PYTEST_OPTS +# Override addopts to allow live tests (pytest.ini has -m "not live" for default run) +pytest -m wet -v $PYTEST_OPTS -o addopts="" # Run 2: Isolated Cache WRITE - Pull (incompatible with Portfolio) echo "" echo "📥 Phase 2: Isolated Cache WRITE - Pull tests..." -MLXK2_TEST_RESUMABLE_DOWNLOAD=1 pytest -m live_pull -v $PYTEST_OPTS +MLXK2_TEST_RESUMABLE_DOWNLOAD=1 pytest -m live_pull -v $PYTEST_OPTS -o addopts="" # Run 3: Isolated Cache WRITE - Clone (incompatible with Portfolio) echo "" echo "🔄 Phase 3: Isolated Cache WRITE - Clone tests..." 
# Note: live_clone tests are opt-in (require env vars), will skip if not configured -MLXK2_ENABLE_ALPHA_FEATURES=1 pytest -m live_clone -v $PYTEST_OPTS +pytest -m live_clone -v $PYTEST_OPTS -o addopts="" # Run 4: Vision→Geo Pipe Integration echo "" echo "🖼️ Phase 4: Vision→Geo Pipe tests..." # Note: Requires vision model (e.g., pixtral) + text model (e.g., Qwen3-Next) # Will skip if models not found in cache (graceful degradation) -MLXK2_ENABLE_PIPES=1 pytest -m live_vision_pipe -v $PYTEST_OPTS +MLXK2_ENABLE_PIPES=1 pytest -m live_vision_pipe -v $PYTEST_OPTS -o addopts="" echo "" echo "✅ All real tests completed!" diff --git a/tests_2.0/assets/geo-test/coll2_1.jpeg b/tests_2.0/assets/geo-test/coll2_1.jpeg index 6651ef6..77a6aa0 100644 Binary files a/tests_2.0/assets/geo-test/coll2_1.jpeg and b/tests_2.0/assets/geo-test/coll2_1.jpeg differ diff --git a/tests_2.0/assets/geo-test/coll2_2.jpeg b/tests_2.0/assets/geo-test/coll2_2.jpeg index d8a7fab..f535dc3 100644 Binary files a/tests_2.0/assets/geo-test/coll2_2.jpeg and b/tests_2.0/assets/geo-test/coll2_2.jpeg differ diff --git a/tests_2.0/assets/geo-test/coll2_3.jpeg b/tests_2.0/assets/geo-test/coll2_3.jpeg index 674b4ae..fb629d9 100644 Binary files a/tests_2.0/assets/geo-test/coll2_3.jpeg and b/tests_2.0/assets/geo-test/coll2_3.jpeg differ diff --git a/tests_2.0/assets/geo-test/coll2_4.jpeg b/tests_2.0/assets/geo-test/coll2_4.jpeg index b2100d2..d263d6d 100644 Binary files a/tests_2.0/assets/geo-test/coll2_4.jpeg and b/tests_2.0/assets/geo-test/coll2_4.jpeg differ diff --git a/tests_2.0/assets/geo-test/coll2_5.jpeg b/tests_2.0/assets/geo-test/coll2_5.jpeg index 33e13bc..8ff638e 100644 Binary files a/tests_2.0/assets/geo-test/coll2_5.jpeg and b/tests_2.0/assets/geo-test/coll2_5.jpeg differ diff --git a/tests_2.0/assets/geo-test/coll2_6.jpeg b/tests_2.0/assets/geo-test/coll2_6.jpeg index c8c6fdd..6a49e20 100644 Binary files a/tests_2.0/assets/geo-test/coll2_6.jpeg and b/tests_2.0/assets/geo-test/coll2_6.jpeg differ 
diff --git a/tests_2.0/assets/geo-test/coll2_7.jpeg b/tests_2.0/assets/geo-test/coll2_7.jpeg index 1739ab1..0d33d5e 100644 Binary files a/tests_2.0/assets/geo-test/coll2_7.jpeg and b/tests_2.0/assets/geo-test/coll2_7.jpeg differ diff --git a/tests_2.0/assets/geo-test/coll2_8.jpeg b/tests_2.0/assets/geo-test/coll2_8.jpeg index ef42806..b708398 100644 Binary files a/tests_2.0/assets/geo-test/coll2_8.jpeg and b/tests_2.0/assets/geo-test/coll2_8.jpeg differ diff --git a/tests_2.0/assets/geo-test/coll2_9.jpeg b/tests_2.0/assets/geo-test/coll2_9.jpeg index dabfa73..edac29e 100644 Binary files a/tests_2.0/assets/geo-test/coll2_9.jpeg and b/tests_2.0/assets/geo-test/coll2_9.jpeg differ diff --git a/tests_2.0/conftest.py b/tests_2.0/conftest.py index 0ef3014..9fce7f1 100644 --- a/tests_2.0/conftest.py +++ b/tests_2.0/conftest.py @@ -1210,7 +1210,10 @@ def pytest_collection_modifyitems(config, items): # Wet marker for compatible tests if (test_markers & LIVE_MARKERS_FOR_WET) or is_in_live_dir: # EXCLUDE Isolated Cache WRITE tests (incompatible with Portfolio Discovery!) 
- if "live_pull" not in test_markers and "live_clone" not in test_markers: + # EXCLUDE Vision Pipe tests (opt-in only, run via -m live_vision_pipe) + if ("live_pull" not in test_markers and + "live_clone" not in test_markers and + "live_vision_pipe" not in test_markers): item.add_marker(pytest.mark.wet) @@ -1230,7 +1233,14 @@ def pytest_addoption(parser): def pytest_configure(config): - """Initialize report file if --report-output is specified.""" + """Initialize report file and register custom markers.""" + # Register benchmark_inference marker (ADR-013 Phase 1) + config.addinivalue_line( + "markers", + "benchmark_inference: Pure inference tests suitable for benchmarking (ADR-013 Phase 1)" + ) + + # Initialize benchmark report file if --report-output is specified from pathlib import Path config.report_file = None if report_path := config.getoption("--report-output"): @@ -1434,8 +1444,13 @@ def pytest_runtest_makereport(item, call): Reports are written as JSONL (one JSON object per line) to allow streaming and easy appending across test runs. 
- Schema version: 0.2.0 (Phase 0.5 - System Health + Hardware Profile) - See: ADR-013 Phase 0.5 implementation + Schema version: 0.2.1 (Inference Modality) + See: benchmarks/schemas/MIGRATIONS.md + + Changelog from 0.2.0 → 0.2.1: + - Added: metadata.inference_modality (vision/text/audio/video) + - Automatic detection via fixtures and user_properties + - Backward compatible: All 0.2.0 fields preserved Changelog from 0.1.0 → 0.2.0: - Added: system.hardware_profile (Mac model, cores) @@ -1458,7 +1473,7 @@ def pytest_runtest_makereport(item, call): # Build report data (required fields) data = { - "schema_version": "0.2.0", + "schema_version": "0.2.1", "timestamp": datetime.now(timezone.utc).isoformat(), "mlx_knife_version": __version__, "test": item.nodeid, @@ -1486,6 +1501,27 @@ def pytest_runtest_makereport(item, call): # Everything else goes to metadata data.setdefault("metadata", {})[key] = value + # ADR-013 Phase 1: Automatic inference_modality detection (v0.2.1) + # Differentiates Vision/Text inference for multimodal models (e.g., Pixtral) + inference_modality = None + + # Priority 1: Explicit override via user_properties (pipe tests use this) + if "metadata" in data and "inference_modality" in data["metadata"]: + inference_modality = data["metadata"]["inference_modality"] + + # Priority 2: Detect from pytest fixtures (parametrized tests) + elif not inference_modality: + # Vision tests: use vision_model_key fixture + if hasattr(item, "fixturenames") and "vision_model_key" in item.fixturenames: + inference_modality = "vision" + # Text tests: use text_model_key fixture + elif hasattr(item, "fixturenames") and "text_model_key" in item.fixturenames: + inference_modality = "text" + + # Set inference_modality if detected + if inference_modality: + data.setdefault("metadata", {})["inference_modality"] = inference_modality + # ADR-013 Phase 0.5: Collect system health metrics (v0.2.0) # Enables automatic regression quality assessment system_health = 
_get_macos_system_health() diff --git a/tests_2.0/live/conftest.py b/tests_2.0/live/conftest.py index 98ec8f8..39c08fc 100644 --- a/tests_2.0/live/conftest.py +++ b/tests_2.0/live/conftest.py @@ -413,6 +413,9 @@ def _auto_report_vision_model(request): "family": "pixtral", "variant": "12b-8bit", })) + # Explicit inference_modality for CLI vision tests (v0.2.1) + # Required because these tests don't use vision_model_key fixture + request.node.user_properties.append(("inference_modality", "vision")) def _parse_model_family(model_id: str) -> tuple[str, str]: diff --git a/tests_2.0/live/test_cli_e2e.py b/tests_2.0/live/test_cli_e2e.py index c1bdb41..02a1d1f 100644 --- a/tests_2.0/live/test_cli_e2e.py +++ b/tests_2.0/live/test_cli_e2e.py @@ -37,7 +37,7 @@ from .test_utils import ( MAX_TOKENS, TEST_TEMPERATURE, ) -# portfolio_models fixture is provided by conftest.py +# text_portfolio fixture is provided by conftest.py (Portfolio Separation) # Opt-in markers pytestmark = [pytest.mark.live, pytest.mark.live_e2e, pytest.mark.slow] @@ -70,25 +70,27 @@ class TestRunCommandBasic: """ @pytest.mark.live_e2e - def test_run_command(self, portfolio_models, model_key, report_benchmark): + @pytest.mark.benchmark_inference + def test_run_command(self, text_portfolio, text_model_key, report_benchmark): """Validate `mlxk run` with model. - Parametrized test (one instance per model in portfolio). + Parametrized test (one instance per model in text portfolio). + Uses text_model_key for automatic inference_modality detection (v0.2.1). 
Tests: - Exit code 0 on success - No visible stop tokens in output - Output is non-empty """ - model_info = portfolio_models[model_key] + model_info = text_portfolio[text_model_key] model_id = model_info["id"] # RAM gating - should_skip, skip_reason = should_skip_model(model_key, portfolio_models) + should_skip, skip_reason = should_skip_model(text_model_key, text_portfolio) if should_skip: pytest.skip(skip_reason) - print(f"\nTesting {model_key}: {model_id}") + print(f"\nTesting {text_model_key}: {model_id}") args = ["run", model_id, TEST_PROMPT, "--max-tokens", str(MAX_TOKENS), "--temperature", str(TEST_TEMPERATURE)] stdout, stderr, exit_code = _run_mlxk_subprocess(args, timeout=90) @@ -114,7 +116,7 @@ class TestRunCommandBasic: f"Output: {stdout!r}" ) - print(f"✓ {model_key}: Passed (output: {len(stdout)} chars)") + print(f"✓ {text_model_key}: Passed (output: {len(stdout)} chars)") # Benchmark reporting (ADR-013 Phase 0) report_benchmark(stop_tokens={ @@ -133,10 +135,12 @@ class TestRunCommandJSON: """ @pytest.mark.live_e2e - def test_run_json_output(self, portfolio_models, model_key, report_benchmark): + @pytest.mark.benchmark_inference + def test_run_json_output(self, text_portfolio, text_model_key, report_benchmark): """Validate `mlxk run --json` output format. - Parametrized test (one instance per model in portfolio). + Parametrized test (one instance per model in text portfolio). + Uses text_model_key for automatic inference_modality detection (v0.2.1). 
Tests: - JSON envelope structure @@ -144,15 +148,15 @@ class TestRunCommandJSON: - data.response contains output - No visible stop tokens """ - model_info = portfolio_models[model_key] + model_info = text_portfolio[text_model_key] model_id = model_info["id"] # RAM gating - should_skip, skip_reason = should_skip_model(model_key, portfolio_models) + should_skip, skip_reason = should_skip_model(text_model_key, text_portfolio) if should_skip: pytest.skip(skip_reason) - print(f"\nTesting {model_key}: {model_id}") + print(f"\nTesting {text_model_key}: {model_id}") args = ["run", model_id, TEST_PROMPT, "--max-tokens", str(MAX_TOKENS), "--temperature", str(TEST_TEMPERATURE), "--json"] stdout, stderr, exit_code = _run_mlxk_subprocess(args, timeout=90) @@ -184,7 +188,7 @@ class TestRunCommandJSON: f"Response: {response!r}" ) - print(f"✓ {model_key}: Passed (JSON output: {len(response)} chars)") + print(f"✓ {text_model_key}: Passed (JSON output: {len(response)} chars)") # Benchmark reporting (ADR-013 Phase 0) report_benchmark(stop_tokens={ @@ -257,23 +261,24 @@ class TestRunCommandStopTokens: """Specific stop token filtering validation.""" @pytest.mark.live_e2e - def test_run_no_visible_stop_tokens_mxfp4(self, portfolio_models): + def test_run_no_visible_stop_tokens_mxfp4(self, text_portfolio): """Validate MXFP4 model has no visible stop tokens via CLI. Specific regression test for Issue #32 at CLI level. + Uses text_portfolio (MXFP4 is a text model quantization format). 
""" # Find MXFP4 model in portfolio (or skip) mxfp4_model = None - for model_key, model_info in portfolio_models.items(): + for model_key, model_info in text_portfolio.items(): if "mxfp4" in model_key.lower() or "gpt-oss" in model_info["id"].lower(): # Check RAM - should_skip, skip_reason = should_skip_model(model_key, portfolio_models) + should_skip, skip_reason = should_skip_model(model_key, text_portfolio) if not should_skip: mxfp4_model = model_info["id"] break if mxfp4_model is None: - pytest.skip("MXFP4 model not available in portfolio or exceeds RAM") + pytest.skip("MXFP4 model not available in text portfolio or exceeds RAM") print(f"\nTesting MXFP4: {mxfp4_model}") diff --git a/tests_2.0/live/test_cli_pipe_live.py b/tests_2.0/live/test_cli_pipe_live.py index d34e8a2..03c1f2a 100644 --- a/tests_2.0/live/test_cli_pipe_live.py +++ b/tests_2.0/live/test_cli_pipe_live.py @@ -19,13 +19,23 @@ from .test_utils import should_skip_model, MAX_TOKENS, TEST_TEMPERATURE pytestmark = [pytest.mark.live, pytest.mark.live_e2e, pytest.mark.slow] -def _pick_first_eligible_model(portfolio_models: Dict[str, Dict[str, Any]]) -> Dict[str, Any]: - """Select the first model that passes RAM gating.""" - for key, info in portfolio_models.items(): - should_skip, _ = should_skip_model(key, portfolio_models) +@pytest.fixture(autouse=True) +def _report_text_modality(request): + """Report text inference modality for benchmark reports (v0.2.1). + + All pipe tests in this file are text inference (no vision). + Required because these tests don't use text_model_key fixture. 
+ """ + request.node.user_properties.append(("inference_modality", "text")) + + +def _pick_first_eligible_model(text_portfolio: Dict[str, Dict[str, Any]]) -> Dict[str, Any]: + """Select the first text model that passes RAM gating.""" + for key, info in text_portfolio.items(): + should_skip, _ = should_skip_model(key, text_portfolio) if not should_skip: return info - pytest.skip("No suitable models found in portfolio (RAM gating)") + pytest.skip("No suitable text models found in portfolio (RAM gating)") def _run_cli(args: list[str], stdin: str | None = None, timeout: int = 120) -> Tuple[str, str, int]: @@ -44,10 +54,10 @@ class TestPipeModeSingleModel: """Exercise pipe workflows against one dynamically discovered model.""" @pytest.fixture(scope="class") - def model_id(self, portfolio_models): + def model_id(self, text_portfolio): if not os.getenv("MLXK2_ENABLE_PIPES"): pytest.skip("Pipe mode gated by MLXK2_ENABLE_PIPES=1") - model = _pick_first_eligible_model(portfolio_models) + model = _pick_first_eligible_model(text_portfolio) return model["id"] def test_stdin_dash_appends_trailing_text(self, model_id): diff --git a/tests_2.0/live/test_clone_live.py b/tests_2.0/live/test_clone_live.py index 2946f48..2d50fd7 100644 --- a/tests_2.0/live/test_clone_live.py +++ b/tests_2.0/live/test_clone_live.py @@ -4,7 +4,6 @@ Runs only when explicitly selected via markers/env, per TESTING.md mini‑matrix Validates ADR-007 Phase 1 compliance: real pull→temp cache→APFS same-volume clone→workspace. 
Enable with ALL required env vars: -- MLXK2_ENABLE_ALPHA_FEATURES=1 (clone is alpha) - MLXK2_LIVE_CLONE=1 (enable live test) - HF_TOKEN= (for model access) - MLXK2_LIVE_CLONE_MODEL= (e.g., "mlx-community/bge-small-en-v1.5-4bit") @@ -34,7 +33,6 @@ import pytest # Environment validation -alpha_enabled = os.environ.get("MLXK2_ENABLE_ALPHA_FEATURES") == "1" live_enabled = os.environ.get("MLXK2_LIVE_CLONE") == "1" hf_token_present = bool(os.environ.get("HF_TOKEN")) model = os.environ.get("MLXK2_LIVE_CLONE_MODEL") @@ -44,9 +42,9 @@ pytestmark = [ pytest.mark.live, pytest.mark.live_clone, pytest.mark.skipif( - not (alpha_enabled and live_enabled and hf_token_present and model and workspace), + not (live_enabled and hf_token_present and model and workspace), reason=( - "Live clone disabled. Set MLXK2_ENABLE_ALPHA_FEATURES=1, MLXK2_LIVE_CLONE=1, " + "Live clone disabled. Set MLXK2_LIVE_CLONE=1, " "HF_TOKEN, MLXK2_LIVE_CLONE_MODEL, and MLXK2_LIVE_CLONE_WORKSPACE to enable." ), ), diff --git a/tests_2.0/live/test_pipe_vision_geo.py b/tests_2.0/live/test_pipe_vision_geo.py index ad69477..83cabe3 100644 --- a/tests_2.0/live/test_pipe_vision_geo.py +++ b/tests_2.0/live/test_pipe_vision_geo.py @@ -97,7 +97,8 @@ class TestVisionGeoPipeline: """Get vision model (hardcoded for now - pixtral only viable model).""" # TODO: Use vision_portfolio when more vision models are viable # Currently only pixtral works reliably (blacklist filters others) - return "pixtral" + # Use full ID for consistency in benchmark reports (not "pixtral" shorthand) + return "mlx-community/pixtral-12b-8bit" @pytest.fixture(scope="class") def text_model_id(self, text_portfolio): @@ -126,11 +127,11 @@ class TestVisionGeoPipeline: assert len(GEO_IMAGES) == 9, f"Expected 9 images, found {len(GEO_IMAGES)}" - def test_vision_batch_processing_chunk_1(self, check_prerequisites, vision_model_id): + def test_vision_batch_processing_chunk_1(self, check_prerequisites, vision_model_id, request): """Test vision batch 
processing with chunk=1 (incremental output). - Validates: ADR-012 Phase 1c, Sessions 73-75 fixes - PASSED: Process succeeds, output not empty, multiple images mentioned + Validates: ADR-012 Phase 1c, Sessions 73-75 fixes, Session 93 chunk streaming + PASSED: Process succeeds, output not empty, all chunks processed """ image_paths = [str(p) for p in GEO_IMAGES] @@ -140,11 +141,7 @@ class TestVisionGeoPipeline: "--image", *image_paths, "--chunk", "1", "--max-tokens", "12000", - "--prompt", ( - "Describe each image in best possible detail. " - "Don't repeat unimportant camera information. " - "Number images according to metadata image number." - ), + "--prompt", "Describe each image in best possible detail.", ] stdout, stderr, code = _run_cli(args, timeout=600) @@ -153,19 +150,25 @@ class TestVisionGeoPipeline: assert code == 0, f"Vision phase failed: exit={code}\nstderr={stderr}" assert stdout.strip(), "Vision output is empty" - # Heuristic: Output should mention multiple images (smoke test) - image_mentions = sum(1 for i in range(1, 10) if f"Image {i}" in stdout or f"image {i}" in stdout.lower()) - assert image_mentions >= 5, f"Only {image_mentions}/9 images mentioned (expected most/all)" + # Session 93: With chunk=1, no image numbers in metadata (hallucination fix) + # Instead, verify all chunks were processed by checking chunk markers + chunk_markers = sum(1 for i in range(1, 10) if f"Chunk {i}/9" in stdout) + assert chunk_markers == 9, f"Only {chunk_markers}/9 chunks found (expected all chunks processed)" - def test_vision_to_geo_pipe(self, check_prerequisites, vision_model_id, text_model_id): + def test_vision_to_geo_pipe(self, check_prerequisites, vision_model_id, text_model_id, request): """Test complete Vision→Geo pipeline. 
Validates: Session 73 pipe stdin + --prompt, complete integration PASSED: Both phases succeed, geo output mentions location concepts """ + import time + import json + from datetime import datetime, timezone + image_paths = [str(p) for p in GEO_IMAGES] # Phase 1: Vision descriptions + vision_start = time.time() vision_args = [ "run", vision_model_id, @@ -180,11 +183,28 @@ class TestVisionGeoPipeline: ] vision_stdout, vision_stderr, vision_code = _run_cli(vision_args, timeout=600) + vision_end = time.time() + + # Log Vision phase as sub-test + if request.config.report_file: + vision_entry = { + "schema_version": "0.2.1", + "timestamp": datetime.fromtimestamp(vision_end, timezone.utc).isoformat(), + "mlx_knife_version": __import__("mlxk2").__version__, + "test": f"{request.node.nodeid}[vision_phase]", + "outcome": "passed" if vision_code == 0 else "failed", + "duration": vision_end - vision_start, + "model": {"id": vision_model_id, "size_gb": 12.6, "family": "pixtral"}, + "metadata": {"inference_modality": "vision"}, + } + request.config.report_file.write(json.dumps(vision_entry) + "\n") + request.config.report_file.flush() assert vision_code == 0, f"Vision phase failed: {vision_stderr}" assert vision_stdout.strip(), "Vision output is empty" # Phase 2: Geo inference via pipe + text_start = time.time() geo_args = [ "run", text_model_id, @@ -197,6 +217,27 @@ class TestVisionGeoPipeline: ] geo_stdout, geo_stderr, geo_code = _run_cli(geo_args, stdin=vision_stdout, timeout=300) + text_end = time.time() + + # Log Text phase as sub-test + # Note: size_gb lookup from portfolio would be ideal, but hardcoded for Mixtral-8x7B as fallback + # TODO: Extract size_gb from portfolio when available (Session 80 follow-up) + if request.config.report_file: + # Best-effort size_gb lookup (Mixtral-8x7B is 24.5GB, but might vary by quantization) + text_size_gb = 24.5 if "mixtral" in text_model_id.lower() else 0 + + text_entry = { + "schema_version": "0.2.1", + "timestamp": 
datetime.fromtimestamp(text_end, timezone.utc).isoformat(), + "mlx_knife_version": __import__("mlxk2").__version__, + "test": f"{request.node.nodeid}[text_phase]", + "outcome": "passed" if geo_code == 0 else "failed", + "duration": text_end - text_start, + "model": {"id": text_model_id, "size_gb": text_size_gb}, + "metadata": {"inference_modality": "text"}, + } + request.config.report_file.write(json.dumps(text_entry) + "\n") + request.config.report_file.flush() assert geo_code == 0, f"Geo phase failed: exit={geo_code}\nstderr={geo_stderr}" assert geo_stdout.strip(), "Geo output is empty" @@ -211,7 +252,7 @@ class TestVisionGeoPipeline: assert has_location_terms, f"Geo output lacks location terms (pipe may have failed):\n{geo_stdout[:300]}" - def test_vision_chunk_isolation_no_hallucination(self, check_prerequisites, vision_model_id): + def test_vision_chunk_isolation_no_hallucination(self, check_prerequisites, vision_model_id, request): """Test chunk isolation with chunk=1 (Session 73 regression test). 
Validates: Fresh VisionRunner per chunk, no state leakage @@ -235,7 +276,7 @@ class TestVisionGeoPipeline: assert code == 0, f"exit={code}\nstderr={stderr}" assert stdout.strip(), "Output is empty" - # Smoke test: Both batches should be visible (chunk workflow functioning) - # NOTE: We don't verify isolation quality - just that 2 batches were processed - assert "batch 1/2" in stdout.lower(), "Batch 1/2 not found (chunking failed?)" - assert "batch 2/2" in stdout.lower(), "Batch 2/2 not found (chunking failed?)" + # Smoke test: Both chunks should be visible (chunk workflow functioning) + # NOTE: We don't verify isolation quality - just that 2 chunks were processed + assert "chunk 1/2" in stdout.lower(), "Chunk 1/2 not found (chunking failed?)" + assert "chunk 2/2" in stdout.lower(), "Chunk 2/2 not found (chunking failed?)" diff --git a/tests_2.0/live/test_server_e2e.py b/tests_2.0/live/test_server_e2e.py index 61354f7..64afc79 100644 --- a/tests_2.0/live/test_server_e2e.py +++ b/tests_2.0/live/test_server_e2e.py @@ -37,7 +37,7 @@ from .test_utils import ( TEST_PROMPT, MAX_TOKENS, ) -# portfolio_models fixture is provided by conftest.py +# text_portfolio fixture is provided by conftest.py (Portfolio Separation) # Server request timeout (increased from 30s to 45s in Session 22) # Accounts for: baseline (15s) + probe/policy overhead (2.7s) + generation + safety margin @@ -126,6 +126,7 @@ class TestChatCompletionsBatch: """ @pytest.mark.live_e2e + @pytest.mark.benchmark_inference def test_chat_completions_batch(self, text_portfolio, text_model_key, report_benchmark): """Validate non-streaming chat completions. @@ -217,6 +218,7 @@ class TestChatCompletionsStreaming: """ @pytest.mark.live_e2e + @pytest.mark.benchmark_inference def test_chat_completions_streaming(self, text_portfolio, text_model_key, report_benchmark): """Validate SSE streaming chat completions. 
diff --git a/tests_2.0/live/test_streaming_parity.py b/tests_2.0/live/test_streaming_parity.py index 4500e42..e03e3e6 100644 --- a/tests_2.0/live/test_streaming_parity.py +++ b/tests_2.0/live/test_streaming_parity.py @@ -39,7 +39,7 @@ from .test_utils import ( TEST_PROMPT, MAX_TOKENS, ) -# portfolio_models fixture is provided by conftest.py +# text_portfolio fixture is provided by conftest.py (Portfolio Separation) # Opt-in markers pytestmark = [ @@ -53,8 +53,8 @@ pytestmark = [ ] -def _select_parity_test_models(portfolio: Dict[str, Dict[str, Any]]) -> List[str]: - """Select 2-3 representative models from portfolio for parity testing. +def _select_parity_test_keys(portfolio: Dict[str, Dict[str, Any]]) -> set: + """Select 2-3 representative model keys from portfolio for parity testing. Strategy: - Only small models (<6GB RAM) for fast testing @@ -63,10 +63,10 @@ def _select_parity_test_models(portfolio: Dict[str, Dict[str, Any]]) -> List[str - Limit to 3 models max (parity tests are slow) Args: - portfolio: Model portfolio from portfolio_models fixture + portfolio: Text model portfolio from text_portfolio fixture Returns: - List of model_keys to test (empty if no suitable models) + Set of model_keys to test (empty if no suitable models) """ # Filter: small models only candidates = { @@ -75,7 +75,7 @@ def _select_parity_test_models(portfolio: Dict[str, Dict[str, Any]]) -> List[str } if not candidates: - return [] + return set() # Exclude reasoning models (known Issue #20 regression - will fix in ADR-010) # Reasoning models have batch/stream inconsistency: @@ -88,71 +88,33 @@ def _select_parity_test_models(portfolio: Dict[str, Dict[str, Any]]) -> List[str } if not candidates: - return [] + return set() # Sort by RAM (smallest first) and select up to 3 sorted_models = sorted(candidates.items(), key=lambda x: x[1]["ram_needed_gb"]) - selected = [key for key, _ in sorted_models[:3]] + selected = {key for key, _ in sorted_models[:3]} return selected -def 
pytest_generate_tests(metafunc): - """Custom parametrization for parity tests. - - Parametrizes parity_model_key over 2-3 selected models from portfolio. - This hook runs at collection time. - """ - if "parity_model_key" in metafunc.fixturenames: - # Check if live_e2e marker is requested - selected_markers = metafunc.config.getoption("-m") or "" - if "live_e2e" not in selected_markers: - # Parametrize with dummy value (tests will be skipped) - metafunc.parametrize("parity_model_key", ["_skipped"]) - return - - # Import portfolio discovery (same as conftest.py) - from .test_utils import discover_mlx_models_in_user_cache, TEST_MODELS - - discovered = discover_mlx_models_in_user_cache() - - if discovered: - # Build portfolio from discovered models - portfolio = {} - for i, model in enumerate(discovered): - key = f"discovered_{i:02d}" - portfolio[key] = { - "id": model["model_id"], - "ram_needed_gb": model["ram_needed_gb"], - "expected_issue": None, - "description": f"Discovered: {model['model_id']}" - } - else: - # Fallback to hardcoded test models - portfolio = TEST_MODELS - - # Select 2-3 models for parity testing - selected = _select_parity_test_models(portfolio) - - if not selected: - # No suitable models - parametrize with dummy for graceful skip - metafunc.parametrize("parity_model_key", ["_no_suitable_models"]) - else: - metafunc.parametrize("parity_model_key", selected) +# Note: No custom pytest_generate_tests - using parent conftest.py's hook +# for text_model_key parametrization. Tests filter to parity subset internally. class TestRunnerStreamingParity: """MLXRunner direct streaming vs. batch parity. - Tests are parametrized over selected models from portfolio (2-3 models). + Tests are parametrized over text models, filtered to 2-3 parity subset. + Uses text_model_key for automatic inference_modality detection (v0.2.1). Each test runs independently for clean isolation. 
""" @pytest.mark.live_e2e - def test_runner_streaming_batch_identical(self, _use_real_mlx_modules, portfolio_models, parity_model_key): + def test_runner_streaming_batch_identical(self, _use_real_mlx_modules, text_portfolio, text_model_key): """Validate MLXRunner streaming and batch produce identical output. - Parametrized test (one instance per selected parity model). + Parametrized test - runs for all text models, filters to parity subset. + Uses text_model_key for automatic inference_modality detection (v0.2.1). Issue #20: Previously, batch output had visible stop tokens while streaming did not. This validates the ADR-009 fix at Runner level. @@ -161,22 +123,21 @@ class TestRunnerStreamingParity: """ from mlxk2.core.runner import MLXRunner - # Handle graceful skips - if parity_model_key == "_skipped": - pytest.skip("Run with -m live_e2e to enable parity tests") - if parity_model_key == "_no_suitable_models": - pytest.skip("No suitable models for parity testing (<6GB, non-reasoning)") + # Parity subset filtering - only run for 2-3 representative models + parity_keys = _select_parity_test_keys(text_portfolio) + if text_model_key not in parity_keys: + pytest.skip(f"Not in parity subset (testing {len(parity_keys)} models)") # Get model info from portfolio - model_info = portfolio_models[parity_model_key] + model_info = text_portfolio[text_model_key] model_id = model_info["id"] # RAM gating - should_skip, skip_reason = should_skip_model(parity_model_key, portfolio_models) + should_skip, skip_reason = should_skip_model(text_model_key, text_portfolio) if should_skip: pytest.skip(skip_reason) - print(f"\nTesting {parity_model_key}: {model_id}") + print(f"\nTesting {text_model_key}: {model_id}") with MLXRunner(model_id, verbose=False) as runner: # Batch generation (temperature=0 for deterministic output) @@ -204,40 +165,41 @@ class TestRunnerStreamingParity: f"Stream ({len(stream_output)} chars): {stream_output!r}" ) - print(f"✓ {parity_model_key}: Parity verified 
({len(batch_output)} chars)") + print(f"✓ {text_model_key}: Parity verified ({len(batch_output)} chars)") class TestServerStreamingParity: """Server API streaming vs. batch parity. - Tests are parametrized over selected models from portfolio (2-3 models). + Tests are parametrized over text models, filtered to 2-3 parity subset. + Uses text_model_key for automatic inference_modality detection (v0.2.1). Each test runs independently for clean isolation. """ @pytest.mark.live_e2e - def test_server_api_streaming_batch_identical(self, portfolio_models, parity_model_key): + def test_server_api_streaming_batch_identical(self, text_portfolio, text_model_key): """Validate Server API streaming and batch produce identical output. - Parametrized test (one instance per selected parity model). + Parametrized test - runs for all text models, filters to parity subset. + Uses text_model_key for automatic inference_modality detection (v0.2.1). Tests parity at HTTP API level (closest to production usage). """ - # Handle graceful skips - if parity_model_key == "_skipped": - pytest.skip("Run with -m live_e2e to enable parity tests") - if parity_model_key == "_no_suitable_models": - pytest.skip("No suitable models for parity testing (<6GB, non-reasoning)") + # Parity subset filtering - only run for 2-3 representative models + parity_keys = _select_parity_test_keys(text_portfolio) + if text_model_key not in parity_keys: + pytest.skip(f"Not in parity subset (testing {len(parity_keys)} models)") # Get model info from portfolio - model_info = portfolio_models[parity_model_key] + model_info = text_portfolio[text_model_key] model_id = model_info["id"] # RAM gating - should_skip, skip_reason = should_skip_model(parity_model_key, portfolio_models) + should_skip, skip_reason = should_skip_model(text_model_key, text_portfolio) if should_skip: pytest.skip(skip_reason) - print(f"\nTesting {parity_model_key}: {model_id}") + print(f"\nTesting {text_model_key}: {model_id}") with LocalServer(model_id, 
port=8765) as server_url: # Batch request (temperature=0 for deterministic output) @@ -280,35 +242,40 @@ class TestServerStreamingParity: f"Stream ({len(stream_output)} chars): {stream_output!r}" ) - print(f"✓ {parity_model_key}: Parity verified ({len(batch_output)} chars)") + print(f"✓ {text_model_key}: Parity verified ({len(batch_output)} chars)") class TestCrossInterfaceParity: """Parity across different interfaces (Runner vs Server).""" @pytest.mark.live_e2e - def test_runner_vs_server_consistency(self, _use_real_mlx_modules, portfolio_models): + def test_runner_vs_server_consistency(self, _use_real_mlx_modules, text_portfolio): """Validate MLXRunner and Server API produce consistent output. Tests that direct Runner usage and Server HTTP API yield the same results (validates no server-specific transformations). + Uses text_portfolio for text model selection. Requires real MLX modules (not stubs) since we use MLXRunner directly. """ from mlxk2.core.runner import MLXRunner # Select smallest available model for fastest testing - selected = _select_parity_test_models(portfolio_models) + selected = _select_parity_test_keys(text_portfolio) if not selected: pytest.skip("No suitable models for cross-interface testing (<6GB, non-reasoning)") - # Use first (smallest) model - test_model_key = selected[0] - model_info = portfolio_models[test_model_key] + # Use first (smallest) model from sorted portfolio + sorted_models = sorted( + [(k, text_portfolio[k]) for k in selected], + key=lambda x: x[1]["ram_needed_gb"] + ) + test_model_key = sorted_models[0][0] + model_info = text_portfolio[test_model_key] model_id = model_info["id"] # RAM check - should_skip, skip_reason = should_skip_model(test_model_key, portfolio_models) + should_skip, skip_reason = should_skip_model(test_model_key, text_portfolio) if should_skip: pytest.skip(skip_reason) diff --git a/tests_2.0/live/test_vision_e2e_live.py b/tests_2.0/live/test_vision_e2e_live.py index 1f88e4b..557cdf4 100644 --- 
a/tests_2.0/live/test_vision_e2e_live.py +++ b/tests_2.0/live/test_vision_e2e_live.py @@ -38,6 +38,7 @@ class TestVisionDeterministicQueries: to validate actual image understanding rather than hallucination. """ + @pytest.mark.benchmark_inference def test_chess_position_e6(self): """Test reading specific chess position (e6 = black king).""" result = subprocess.run( @@ -59,6 +60,7 @@ class TestVisionDeterministicQueries: # Expected: "black king" on e6 - either "black king" or just "black" (if truncated) assert "black" in output or "king" in output, f"Expected 'black' or 'king' in output: {result.stdout}" + @pytest.mark.benchmark_inference def test_contract_name_extraction(self): """Test OCR: extract name from contract document.""" result = subprocess.run( @@ -80,6 +82,7 @@ class TestVisionDeterministicQueries: assert "John" in output, f"Expected 'John' in output: {result.stdout}" assert "Smith" in output, f"Expected 'Smith' in output: {result.stdout}" + @pytest.mark.benchmark_inference def test_mug_color_identification(self): """Test color recognition: blue mug.""" result = subprocess.run( @@ -100,6 +103,7 @@ class TestVisionDeterministicQueries: # Expected: "blue" assert "blue" in output, f"Expected 'blue' in output: {result.stdout}" + @pytest.mark.benchmark_inference def test_chart_axis_label_reading(self): """Test chart OCR: read Y-axis label.""" result = subprocess.run( @@ -120,6 +124,7 @@ class TestVisionDeterministicQueries: # Expected: "tokens/s" or "tokens per second" assert "token" in output, f"Expected 'token' in output: {result.stdout}" + @pytest.mark.benchmark_inference def test_large_image_support(self): """Test that 2.7MB image (T2.png) is accepted (10MB limit).""" image_path = Path("tests_2.0/assets/T2.png") diff --git a/tests_2.0/live/test_vision_server_e2e.py b/tests_2.0/live/test_vision_server_e2e.py index bbf2340..6e862f0 100644 --- a/tests_2.0/live/test_vision_server_e2e.py +++ b/tests_2.0/live/test_vision_server_e2e.py @@ -64,6 +64,7 @@ class 
TestVisionServerE2E: """ @pytest.mark.live_e2e + @pytest.mark.benchmark_inference def test_single_image_chat_completion(self, vision_portfolio, vision_model_key): """Vision model should describe an image sent via Base64. @@ -123,6 +124,7 @@ class TestVisionServerE2E: print(f"\n✅ Vision response: {content[:200]}...") @pytest.mark.live_e2e + @pytest.mark.benchmark_inference def test_streaming_graceful_degradation(self, vision_portfolio, vision_model_key): """Vision request with stream=True should gracefully degrade via SSE emulation. @@ -187,12 +189,19 @@ class TestVisionServerE2E: print(f"\n✅ SSE emulation response: {full_content[:100]}...") @pytest.mark.live_e2e - def test_text_request_still_works_on_vision_model(self, vision_portfolio, vision_model_key): + @pytest.mark.benchmark_inference + def test_text_request_still_works_on_vision_model(self, vision_portfolio, vision_model_key, request): """Text-only requests should still work on vision model server. Parametrized test (one instance per VISION model in portfolio). Tests that vision models can handle pure text requests (no images). + + Note: Uses vision_model_key fixture but does TEXT inference (no --image). + Explicit modality override required for correct benchmark classification. 
""" + # Override auto-detection: vision_model_key fixture but TEXT inference + request.node.user_properties.append(("inference_modality", "text")) + model_info = vision_portfolio[vision_model_key] model_id = model_info["id"] diff --git a/tests_2.0/test_cli_push_args.py b/tests_2.0/test_cli_push_args.py index 6d80cfd..d27774b 100644 --- a/tests_2.0/test_cli_push_args.py +++ b/tests_2.0/test_cli_push_args.py @@ -31,7 +31,6 @@ def _run_cli(argv: list[str], capsys): def test_cli_push_missing_args_json_error(capsys, monkeypatch): # Missing required positional args but with --json should emit JSON error - monkeypatch.setenv("MLXK2_ENABLE_ALPHA_FEATURES", "1") stdout, stderr = _run_cli(["mlxk2", "push", "--private", "--json"], capsys) # JSON mode: all output to stdout (for scripting) data = json.loads(stdout) @@ -42,7 +41,6 @@ def test_cli_push_missing_args_json_error(capsys, monkeypatch): def test_cli_push_workspace_missing_json_error(tmp_path, monkeypatch, capsys): # Provide missing workspace; ensure JSON error and specific error type - monkeypatch.setenv("MLXK2_ENABLE_ALPHA_FEATURES", "1") monkeypatch.setenv("HF_TOKEN", "dummy") missing = str(tmp_path / "nope") stdout, stderr = _run_cli(["mlxk2", "push", "--private", missing, "user/repo", "--json"], capsys) @@ -89,7 +87,6 @@ def test_cli_push_no_changes_json_output(tmp_path, monkeypatch, capsys): ws = tmp_path / "ws" ws.mkdir() (ws / "x.txt").write_text("x") - monkeypatch.setenv("MLXK2_ENABLE_ALPHA_FEATURES", "1") monkeypatch.setenv("HF_TOKEN", "dummy") _install_fake_hf(monkeypatch, mode="no_changes") @@ -108,7 +105,6 @@ def test_cli_push_with_changes_json_output(tmp_path, monkeypatch, capsys): ws = tmp_path / "ws" ws.mkdir() (ws / "x.txt").write_text("x") - monkeypatch.setenv("MLXK2_ENABLE_ALPHA_FEATURES", "1") monkeypatch.setenv("HF_TOKEN", "dummy") _install_fake_hf(monkeypatch, mode="with_changes") diff --git a/tests_2.0/test_json_api_list.py b/tests_2.0/test_json_api_list.py index aecef78..6baa3ac 100644 --- 
a/tests_2.0/test_json_api_list.py +++ b/tests_2.0/test_json_api_list.py @@ -109,3 +109,248 @@ def test_list_empty_cache(isolated_cache): assert result["status"] == "success" assert result["data"]["models"] == [] assert result["data"]["count"] == 0 + + +class TestListWorkspacePrefix: + """Test list_models() with workspace path patterns (Session 103).""" + + def test_list_workspace_exact_match(self, tmp_path): + """Test list_models with exact workspace path.""" + import os + + ws = tmp_path / "my-model" + ws.mkdir() + (ws / "config.json").write_text('{"model_type": "llama"}') + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern="./my-model") + + assert result["status"] == "success" + assert result["data"]["count"] == 1 + assert len(result["data"]["models"]) == 1 + assert "my-model" in result["data"]["models"][0]["name"] + finally: + os.chdir(old_cwd) + + def test_list_workspace_prefix_match(self, tmp_path): + """Test list_models with workspace prefix pattern.""" + import os + + # Create multiple workspaces with common prefix + for name in ["gemma-3n-4bit", "gemma-3n-FIXED-4bit", "gemma-3n-8bit"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{"model_type": "gemma"}') + + # Create non-matching workspace + other = tmp_path / "llama-3" + other.mkdir() + (other / "config.json").write_text('{"model_type": "llama"}') + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern="./gemma-") + + assert result["status"] == "success" + assert result["data"]["count"] == 3 + # All matches should contain gemma + for m in result["data"]["models"]: + assert "gemma" in m["name"] + finally: + os.chdir(old_cwd) + + def test_list_workspace_prefix_no_match(self, tmp_path): + """Test list_models with non-matching prefix returns empty list.""" + import os + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern="./nonexistent-") + + assert result["status"] == "success" + assert 
result["data"]["count"] == 0 + assert result["data"]["models"] == [] + finally: + os.chdir(old_cwd) + + def test_list_workspace_does_not_fall_through_to_cache(self, tmp_path, isolated_cache): + """Test explicit path patterns don't fall through to cache search.""" + import os + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + # ./gemma- should NOT match cache models even if they contain "gemma" + # It should only search local workspaces + result = list_models(pattern="./gemma-") + + assert result["status"] == "success" + # Should be empty (no local workspaces) not cache models + assert result["data"]["count"] == 0 + finally: + os.chdir(old_cwd) + + def test_list_workspace_sorted_by_name(self, tmp_path): + """Test workspace results are sorted by name.""" + import os + + for name in ["model-c", "model-a", "model-b"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{}') + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern="./model-") + + names = [m["name"] for m in result["data"]["models"]] + # Should be sorted (names contain full path, but sorted by name) + assert "model-a" in names[0] + assert "model-b" in names[1] + assert "model-c" in names[2] + finally: + os.chdir(old_cwd) + + def test_list_workspace_absolute_path(self, tmp_path): + """Test list_models with absolute workspace path.""" + ws = tmp_path / "model" + ws.mkdir() + (ws / "config.json").write_text('{}') + + result = list_models(pattern=str(ws)) + + assert result["status"] == "success" + assert result["data"]["count"] == 1 + + def test_list_workspace_has_all_fields(self, tmp_path): + """Test workspace model objects have all required fields.""" + import os + + ws = tmp_path / "my-model" + ws.mkdir() + (ws / "config.json").write_text('{"model_type": "llama"}') + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern="./my-model") + + m = result["data"]["models"][0] + # Check required fields exist + required = ["name", 
"hash", "size_bytes", "last_modified", + "framework", "model_type", "capabilities", "health", "cached"] + for field in required: + assert field in m, f"Missing field: {field}" + + # Workspace-specific: hash should be None, cached should be False + assert m["hash"] is None + assert m["cached"] is False + finally: + os.chdir(old_cwd) + + def test_list_workspace_display_name_relative(self, tmp_path): + """Test display_name is relative for relative input patterns.""" + import os + + ws = tmp_path / "my-model" + ws.mkdir() + (ws / "config.json").write_text('{}') + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern="./my-model") + + m = result["data"]["models"][0] + # name should be absolute (for programmatic use) + assert m["name"].startswith("/") + # display_name should be relative + assert m["display_name"] == "my-model" + finally: + os.chdir(old_cwd) + + def test_list_workspace_display_name_absolute(self, tmp_path): + """Test display_name is absolute for absolute input patterns.""" + ws = tmp_path / "my-model" + ws.mkdir() + (ws / "config.json").write_text('{}') + + # Use absolute path pattern + result = list_models(pattern=str(ws)) + + m = result["data"]["models"][0] + # Both name and display_name should be absolute + assert m["name"].startswith("/") + assert m["display_name"].startswith("/") + assert m["display_name"] == str(ws) + + def test_list_workspace_display_name_prefix_match(self, tmp_path): + """Test display_name works correctly with prefix matching.""" + import os + + for name in ["gemma-a", "gemma-b"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{}') + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern="./gemma-") + + # Both should have relative display_name + display_names = [m["display_name"] for m in result["data"]["models"]] + assert "gemma-a" in display_names + assert "gemma-b" in display_names + # None should be absolute + assert not any(dn.startswith("/") for 
dn in display_names) + finally: + os.chdir(old_cwd) + + def test_list_workspace_directory_scan(self, tmp_path): + """Test listing all workspaces in a directory (. pattern).""" + import os + + # Create multiple workspaces + for name in ["model-a", "model-b"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{}') + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + result = list_models(pattern=".") + + assert result["status"] == "success" + assert result["data"]["count"] == 2 + # display_names should be just the directory names + display_names = [m["display_name"] for m in result["data"]["models"]] + assert "model-a" in display_names + assert "model-b" in display_names + finally: + os.chdir(old_cwd) + + def test_list_workspace_directory_scan_absolute(self, tmp_path): + """Test listing all workspaces with absolute directory path.""" + # Create workspaces + for name in ["model-a", "model-b"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{}') + + # Use absolute path + result = list_models(pattern=str(tmp_path)) + + assert result["status"] == "success" + assert result["data"]["count"] == 2 + # display_names should be absolute (because input was absolute) + for m in result["data"]["models"]: + assert m["display_name"].startswith("/") + # name should also be absolute + assert m["name"].startswith("/") diff --git a/tests_2.0/test_push_workspace_check.py b/tests_2.0/test_push_workspace_check.py index 5d7665c..5b36c8c 100644 --- a/tests_2.0/test_push_workspace_check.py +++ b/tests_2.0/test_push_workspace_check.py @@ -73,3 +73,98 @@ def test_check_only_lfs_pointer_detected(tmp_path): diag = res["data"]["workspace_health"] assert diag["healthy"] is False assert any(a["code"] == "lfs_pointer_detected" for a in diag["anomalies"]) + + +class TestPushAmbiguousWorkspace: + """Tests for ambiguous workspace pattern handling (Session 103).""" + + def test_push_ambiguous_prefix_pattern(self, tmp_path): + """Ambiguous prefix pattern 
should return clear error with matches list.""" + import os + + # Create multiple workspaces with common prefix + for name in ["gemma-3n-4bit", "gemma-3n-8bit", "gemma-3n-FIXED-4bit"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{"model_type": "gemma"}') + (ws / "model.safetensors").write_text("data") + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + res = push_operation("./gemma-", "org/model", branch=DEFAULT_PUSH_BRANCH, check_only=True) + + assert res["status"] == "error" + assert res["error"]["type"] == "ambiguous_workspace" + assert "3 workspaces" in res["error"]["message"] + assert "matches" in res["error"] + assert len(res["error"]["matches"]) == 3 + finally: + os.chdir(old_cwd) + + def test_push_prefix_single_match_succeeds(self, tmp_path): + """Prefix pattern with single match should work.""" + import os + + ws = tmp_path / "unique-model-4bit" + ws.mkdir() + (ws / "config.json").write_text('{"model_type": "llama"}') + (ws / "model.safetensors").write_text("data") + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + res = push_operation("./unique-", "org/model", branch=DEFAULT_PUSH_BRANCH, check_only=True) + + # Should succeed (check_only doesn't need HF_TOKEN) + assert res["status"] == "success" + assert res["data"]["workspace_health"]["healthy"] is True + finally: + os.chdir(old_cwd) + + def test_push_prefix_no_match(self, tmp_path): + """Prefix pattern with no matches should return workspace_not_found.""" + import os + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + res = push_operation("./nonexistent-", "org/model", branch=DEFAULT_PUSH_BRANCH, check_only=True) + + assert res["status"] == "error" + assert res["error"]["type"] == "workspace_not_found" + assert "No workspace found" in res["error"]["message"] + finally: + os.chdir(old_cwd) + + def test_push_exact_path_still_works(self, tmp_path): + """Exact workspace path should still work as before.""" + ws = tmp_path / "my-model" + ws.mkdir() + (ws / 
"config.json").write_text('{"model_type": "llama"}') + (ws / "model.safetensors").write_text("data") + + res = push_operation(str(ws), "org/model", branch=DEFAULT_PUSH_BRANCH, check_only=True) + + assert res["status"] == "success" + assert res["data"]["workspace_health"]["healthy"] is True + + def test_push_dot_pushes_current_directory(self, tmp_path): + """'push .' pushes the current directory directly (not directory scan).""" + import os + + # tmp_path itself as a workspace + (tmp_path / "config.json").write_text('{"model_type": "llama"}') + (tmp_path / "model.safetensors").write_text("data" * 100) # Needs some size + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + res = push_operation(".", "org/model", branch=DEFAULT_PUSH_BRANCH, check_only=True) + + # Should succeed - pushes current directory directly (not ambiguous error) + assert res["status"] == "success" + # workspace_health should exist (check_only performs health analysis) + assert "workspace_health" in res["data"] + finally: + os.chdir(old_cwd) diff --git a/tests_2.0/test_resumable_pull.py b/tests_2.0/test_resumable_pull.py index ec19a53..1a84460 100644 --- a/tests_2.0/test_resumable_pull.py +++ b/tests_2.0/test_resumable_pull.py @@ -179,13 +179,23 @@ snapshot_download( print(f"[DEBUG] Before resume - HF_HOME: {os.environ.get('HF_HOME')}") print(f"[DEBUG] Before resume - Cache dir exists: {cache_dir.exists()}") - # CRITICAL CHECK: Is Phase 1 subprocess still running? 
- subprocess_status = proc.poll() - if subprocess_status is None: - print(f"[WARNING] Phase 1 subprocess (PID {proc.pid}) STILL RUNNING during Phase 3!") - print(f"[WARNING] This could cause race conditions with downloads") - else: - print(f"[DEBUG] Phase 1 subprocess terminated with code: {subprocess_status}") + # CRITICAL: Ensure Phase 1 subprocess is fully terminated before resuming + # Race condition: SIGTERM is sent but subprocess cleanup takes time + # If resume starts too early, huggingface_hub sees partial files and skips download + if proc.poll() is None: + print(f"[DEBUG] Waiting for Phase 1 subprocess (PID {proc.pid}) to terminate...") + try: + proc.wait(timeout=15) + except subprocess.TimeoutExpired: + proc.kill() + pytest.fail("Phase 1 subprocess did not terminate within 15s") + + # Additional wait for filesystem to settle (file locks, etc.) + # Without this, huggingface_hub may see incomplete state and skip download + # Testing shows marker-run needs longer wait (possibly pytest interference) + print(f"[DEBUG] Phase 1 subprocess terminated with code: {proc.poll()}") + print("[DEBUG] Waiting 10s for filesystem to settle...") + time.sleep(10) result = pull_operation(model, force_resume=True) diff --git a/tests_2.0/test_stop_tokens_live.py b/tests_2.0/test_stop_tokens_live.py index 0dc9229..e5dda2e 100644 --- a/tests_2.0/test_stop_tokens_live.py +++ b/tests_2.0/test_stop_tokens_live.py @@ -58,13 +58,14 @@ def _use_real_mlx_modules(): sys.path = [p for p in sys.path if p != stub_path_str] path_removed = True - # Remove stub modules from sys.modules so real modules can be imported - removed_modules: Dict[str, Any] = {} + # Clear transformers modules FIRST (depends on huggingface_hub, uses lazy imports) + # Must happen before any getattr() calls on sys.modules to avoid triggering lazy imports + removed_transformers_modules: Dict[str, Any] = {} for module_name, module in list(sys.modules.items()): - module_file = getattr(module, "__file__", "") or "" - if 
module_file and stub_path_str in module_file: - removed_modules[module_name] = module + if module_name == "transformers" or module_name.startswith("transformers."): + removed_transformers_modules[module_name] = module sys.modules.pop(module_name, None) + # Also clear any previously installed huggingface_hub shims removed_hf_modules: Dict[str, Any] = {} for module_name, module in list(sys.modules.items()): @@ -72,6 +73,15 @@ def _use_real_mlx_modules(): removed_hf_modules[module_name] = module sys.modules.pop(module_name, None) + # Remove stub modules from sys.modules so real modules can be imported + # (AFTER transformers/huggingface_hub cleanup to avoid lazy import triggers) + removed_modules: Dict[str, Any] = {} + for module_name, module in list(sys.modules.items()): + module_file = getattr(module, "__file__", "") or "" + if module_file and stub_path_str in module_file: + removed_modules[module_name] = module + sys.modules.pop(module_name, None) + # Require real mlx / mlx-lm; skip entire module if not available missing_runtime = False if ( @@ -90,6 +100,8 @@ def _use_real_mlx_modules(): sys.modules[name] = mod for name, mod in removed_hf_modules.items(): sys.modules[name] = mod + for name, mod in removed_transformers_modules.items(): + sys.modules[name] = mod if path_removed and stub_path_str not in sys.path: sys.path.insert(0, stub_path_str) pytest.skip( @@ -102,6 +114,8 @@ def _use_real_mlx_modules(): if name not in sys.modules}) sys.modules.update({name: mod for name, mod in removed_hf_modules.items() if name not in sys.modules}) + sys.modules.update({name: mod for name, mod in removed_transformers_modules.items() + if name not in sys.modules}) if path_removed and stub_path_str not in sys.path: sys.path.insert(0, stub_path_str) pytest.skip( @@ -117,6 +131,8 @@ def _use_real_mlx_modules(): sys.modules[name] = module for name, module in removed_hf_modules.items(): sys.modules[name] = module + for name, module in removed_transformers_modules.items(): + 
sys.modules[name] = module # Ensure stub path is back at the front for unit tests if path_removed and stub_path_str not in sys.path: diff --git a/tests_2.0/test_vision_chunk_streaming.py b/tests_2.0/test_vision_chunk_streaming.py new file mode 100644 index 0000000..96d1f92 --- /dev/null +++ b/tests_2.0/test_vision_chunk_streaming.py @@ -0,0 +1,229 @@ +""" +Unit tests for vision chunk streaming SSE format. + +Tests the new per-chunk streaming feature where multi-image vision requests +with stream=True yield SSE events as each chunk completes, rather than +waiting for all chunks to finish. +""" + +import json +from typing import Iterator +from unittest.mock import patch, MagicMock + +from fastapi.testclient import TestClient + +from mlxk2.core.server_base import app + + +def _iter_sse_lines(resp) -> Iterator[str]: + """Iterate non-empty SSE lines as strings from a streaming response.""" + for raw in resp.iter_lines(): + if not raw: + continue + if isinstance(raw, bytes): + line = raw.decode("utf-8", errors="ignore") + else: + line = raw + if line.strip(): + yield line + + +def _parse_sse_events(resp) -> list: + """Parse SSE events into list of dicts (skips [DONE]).""" + events = [] + for line in _iter_sse_lines(resp): + if line.strip() == "data: [DONE]": + continue + if line.startswith("data: "): + try: + events.append(json.loads(line[len("data: "):])) + except json.JSONDecodeError: + pass + return events + + +class TestVisionChunkStreamingSSEFormat: + """Tests for vision per-chunk SSE streaming format (mocked endpoint).""" + + def test_multi_chunk_streams_multiple_content_events(self): + """Multi-chunk vision request should emit SSE event per chunk.""" + # This test validates the SSE format by mocking _stream_vision_chunks directly + from fastapi import FastAPI + from fastapi.responses import StreamingResponse + + test_app = FastAPI() + + async def mock_stream_gen(): + yield 'data: 
{"id":"test","object":"chat.completion.chunk","created":1234,"model":"test","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + yield 'data: {"id":"test","object":"chat.completion.chunk","created":1234,"model":"test","choices":[{"index":0,"delta":{"content":"Chunk 1 output\\n\\n"},"finish_reason":null}]}\n\n' + yield 'data: {"id":"test","object":"chat.completion.chunk","created":1234,"model":"test","choices":[{"index":0,"delta":{"content":"Chunk 2 output\\n\\n"},"finish_reason":null}]}\n\n' + yield 'data: {"id":"test","object":"chat.completion.chunk","created":1234,"model":"test","choices":[{"index":0,"delta":{"content":"Chunk 3 output"},"finish_reason":null}]}\n\n' + yield 'data: {"id":"test","object":"chat.completion.chunk","created":1234,"model":"test","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n' + yield "data: [DONE]\n\n" + + @test_app.post("/test-stream") + async def test_endpoint(): + return StreamingResponse( + mock_stream_gen(), + media_type="text/event-stream" + ) + + client = TestClient(test_app) + + with client.stream("POST", "/test-stream") as resp: + assert resp.status_code == 200 + + events = _parse_sse_events(resp) + + # Should have: role event + 3 content events + final event = 5 events + assert len(events) == 5, f"Expected 5 events, got {len(events)}" + + # First event should have role + assert events[0]["choices"][0]["delta"].get("role") == "assistant" + + # Content events should have content + content_events = [e for e in events if e["choices"][0]["delta"].get("content")] + assert len(content_events) == 3, f"Expected 3 content events, got {len(content_events)}" + + # Final event should have finish_reason + assert events[-1]["choices"][0]["finish_reason"] == "stop" + + def test_single_chunk_uses_emulated_sse(self): + """Single-chunk requests should use existing SSE emulation (batch response).""" + client = TestClient(app) + + mock_runner = MagicMock() + mock_runner.model_path = "/mock/path" + 
mock_runner.model_name = "mock-vision" + mock_runner.generate.return_value = "Single chunk response" + + with patch('mlxk2.core.server_base.get_or_load_model', return_value=mock_runner), \ + patch('mlxk2.core.server_base.isinstance', side_effect=lambda obj, cls: True): + # 1 image = single chunk, uses _emulate_sse_stream + payload = { + "model": "mock-vision-model", + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": "Describe this image"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="}}, + ] + }], + "stream": True, + } + + with client.stream("POST", "/v1/chat/completions", json=payload) as resp: + # Should return 200 (either streaming or error is acceptable here + # since we're testing the routing, not the full integration) + assert resp.status_code in [200, 400, 500] + + def test_sse_format_compliance(self): + """SSE events should follow OpenAI format.""" + client = TestClient(app) + + with patch('mlxk2.core.server_base._stream_vision_chunks') as mock_stream: + async def mock_stream_gen(*args, **kwargs): + yield 'data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"test","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + yield 'data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"test","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]}\n\n' + yield 'data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1234567890,"model":"test","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n' + yield "data: [DONE]\n\n" + + mock_stream.return_value = mock_stream_gen() + + mock_runner = MagicMock() + mock_runner.model_path = "/mock/path" + mock_runner.model_name = "mock-vision" + + with patch('mlxk2.core.server_base.get_or_load_model', return_value=mock_runner), \ + 
patch('mlxk2.core.server_base.isinstance', side_effect=lambda obj, cls: True): + payload = { + "model": "mock-vision-model", + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": "Test"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="}}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="}}, + ] + }], + "stream": True, + "chunk": 1, + } + + with client.stream("POST", "/v1/chat/completions", json=payload) as resp: + events = _parse_sse_events(resp) + + for event in events: + # Required fields per OpenAI spec + assert "id" in event, "Missing 'id' field" + assert "object" in event, "Missing 'object' field" + assert event["object"] == "chat.completion.chunk" + assert "choices" in event, "Missing 'choices' field" + assert len(event["choices"]) > 0 + + choice = event["choices"][0] + assert "index" in choice, "Missing 'index' in choice" + assert "delta" in choice, "Missing 'delta' in choice" + + +class TestVisionChunkStreamingIntegration: + """Integration tests that exercise the actual streaming function.""" + + def test_stream_vision_chunks_generator_format(self): + """Test _stream_vision_chunks yields valid SSE format.""" + import asyncio + from mlxk2.core.server_base import _stream_vision_chunks + + # Mock VisionRunner + class MockVisionRunner: + def __init__(self, *args, **kwargs): + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + def generate(self, **kwargs): + return "Test output" + + async def run_generator(): + events = [] + # Patch at the source module where VisionRunner is defined + with patch('mlxk2.core.vision_runner.VisionRunner', MockVisionRunner): + gen = _stream_vision_chunks( + model_path="/mock/path", + model_name="mock-model", + prompt="Test prompt", + 
images=[("img1.jpg", b"fake1"), ("img2.jpg", b"fake2")], + chunk_size=1, + image_id_map={}, + max_tokens=100, + temperature=0.0, + top_p=0.9, + repetition_penalty=1.0, + completion_id="test-123", + created=1234567890, + model="test-model", + ) + async for event in gen: + events.append(event) + return events + + events = asyncio.run(run_generator()) + + # Should have: role + 2 content events + final + [DONE] + assert len(events) >= 4, f"Expected at least 4 events, got {len(events)}: {events}" + + # First event should be role + assert events[0].startswith("data: ") + first = json.loads(events[0][6:].strip()) + assert first["choices"][0]["delta"].get("role") == "assistant" + + # Last event should be [DONE] + assert events[-1].strip() == "data: [DONE]" + + # Second-to-last should have finish_reason + final = json.loads(events[-2][6:].strip()) + assert final["choices"][0]["finish_reason"] == "stop" diff --git a/tests_2.0/test_vision_exif.py b/tests_2.0/test_vision_exif.py index eb1c675..d0f5cc7 100644 --- a/tests_2.0/test_vision_exif.py +++ b/tests_2.0/test_vision_exif.py @@ -69,6 +69,38 @@ class TestExifExtraction: exif = VisionRunner._extract_exif(image_bytes) assert exif is None + @pytest.mark.skipif(sys.version_info < (3, 10), reason="PIL required (mlx-vlm needs Python 3.10+)") + def test_datetime_tag_306_ignored(self): + """Tag 306 (DateTime) should be ignored. + + Priority: GPS-Timestamp (Tag 7+29) → DateTimeOriginal (Tag 36867) → Tag 306 NEVER. + + Rationale: Tag 306 is updated by image editors to modification time. + GPS-Timestamp and DateTimeOriginal preserve the actual capture time. 
+ """ + from io import BytesIO + from PIL import Image + + # Create image with ONLY Tag 306 (DateTime), no Tag 36867 (DateTimeOriginal) + img = Image.new("RGB", (10, 10), color="green") + + # Manually set EXIF with Tag 306 but NOT 36867 + from PIL import Image as PILImage + exif_ifd = img.getexif() + exif_ifd[306] = "2026:01:06 19:12:02" # Tag 306 = DateTime (modification time) + # Explicitly NOT setting 36867 (DateTimeOriginal) + + buf = BytesIO() + img.save(buf, format="JPEG", exif=exif_ifd) + image_bytes = buf.getvalue() + + # Should return None for datetime (Tag 306 ignored) + exif = VisionRunner._extract_exif(image_bytes) + + # EXIF object may exist (for other metadata), but datetime should be None + if exif: + assert exif.datetime is None, "Tag 306 should be ignored, datetime must be None" + def test_collapsible_table_without_exif(self): """Table should be collapsible without EXIF data.""" result = "A beach." @@ -145,7 +177,7 @@ class TestExifExtraction: images = [("test.jpg", b"\x00\x01")] output = VisionRunner._add_filename_mapping(result, images) - assert "📍 32.79°N, 16.92°W" in output + assert "📍 32.7900°N, 16.9200°W" in output # Test case 2: Southern + Eastern (hypothetical) mock_exif = ExifData(gps_lat=-10.5, gps_lon=20.3, datetime=None, camera=None) @@ -155,7 +187,7 @@ class TestExifExtraction: images = [("test.jpg", b"\x00\x01")] output = VisionRunner._add_filename_mapping(result, images) - assert "📍 10.50°S, 20.30°E" in output + assert "📍 10.5000°S, 20.3000°E" in output class TestImageIdMapWithExif: diff --git a/tests_2.0/test_workspace_sentinel.py b/tests_2.0/test_workspace_sentinel.py index c23f1a6..e1790e6 100644 --- a/tests_2.0/test_workspace_sentinel.py +++ b/tests_2.0/test_workspace_sentinel.py @@ -17,6 +17,8 @@ from mlxk2.operations.workspace import ( write_workspace_sentinel, is_managed_workspace, is_workspace_path, + is_explicit_path, + find_matching_workspaces, read_workspace_metadata, SENTINEL_FILENAME ) @@ -59,6 +61,204 @@ class 
TestIsWorkspacePath: assert is_workspace_path([]) is False +class TestIsExplicitPath: + """Test is_explicit_path() helper function.""" + + def test_explicit_path_relative_dot_slash(self): + """Test ./ prefix is explicit path.""" + assert is_explicit_path("./model") is True + assert is_explicit_path("./gemma-3n") is True + assert is_explicit_path("./") is True + + def test_explicit_path_relative_dot_dot_slash(self): + """Test ../ prefix is explicit path.""" + assert is_explicit_path("../model") is True + assert is_explicit_path("../parent/model") is True + + def test_explicit_path_absolute(self): + """Test / prefix is explicit path.""" + assert is_explicit_path("/abs/path/model") is True + assert is_explicit_path("/model") is True + + def test_explicit_path_dot_only(self): + """Test . and .. alone are explicit paths.""" + assert is_explicit_path(".") is True + assert is_explicit_path("..") is True + + def test_not_explicit_path_hf_model_id(self): + """Test HF model IDs are NOT explicit paths.""" + assert is_explicit_path("mlx-community/Phi-3") is False + assert is_explicit_path("microsoft/phi-2") is False + + def test_not_explicit_path_bare_name(self): + """Test bare names without path prefix are NOT explicit paths.""" + assert is_explicit_path("my-model") is False + assert is_explicit_path("gemma-3n-E2B") is False + + def test_not_explicit_path_invalid_input(self): + """Test handles invalid input gracefully.""" + assert is_explicit_path(None) is False + assert is_explicit_path("") is False + assert is_explicit_path(123) is False + + +class TestFindMatchingWorkspaces: + """Test find_matching_workspaces() prefix matching.""" + + def test_find_exact_match(self, tmp_path): + """Test exact match returns single workspace.""" + ws = tmp_path / "my-model" + ws.mkdir() + (ws / "config.json").write_text('{"model_type": "llama"}') + + # Use ./ prefix to make it explicit path + matches = find_matching_workspaces(f"./{ws.name}") + # Must be in correct directory for ./ to work + 
import os + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + matches = find_matching_workspaces("./my-model") + assert len(matches) == 1 + assert matches[0].name == "my-model" + finally: + os.chdir(old_cwd) + + def test_find_prefix_match(self, tmp_path): + """Test prefix match returns multiple workspaces.""" + # Create multiple workspaces with common prefix + for name in ["gemma-3n-4bit", "gemma-3n-FIXED-4bit", "gemma-3n-8bit"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{"model_type": "gemma"}') + + # Create non-matching workspace + other = tmp_path / "llama-3" + other.mkdir() + (other / "config.json").write_text('{"model_type": "llama"}') + + import os + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + matches = find_matching_workspaces("./gemma-") + assert len(matches) == 3 + assert all("gemma" in m.name for m in matches) + # Should NOT include llama-3 + assert not any("llama" in m.name for m in matches) + finally: + os.chdir(old_cwd) + + def test_find_prefix_match_sorted(self, tmp_path): + """Test prefix match returns sorted results.""" + for name in ["model-c", "model-a", "model-b"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{}') + + import os + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + matches = find_matching_workspaces("./model-") + names = [m.name for m in matches] + assert names == ["model-a", "model-b", "model-c"] + finally: + os.chdir(old_cwd) + + def test_find_no_match(self, tmp_path): + """Test returns empty list when no workspaces match.""" + import os + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + matches = find_matching_workspaces("./nonexistent-") + assert matches == [] + finally: + os.chdir(old_cwd) + + def test_find_skips_non_workspaces(self, tmp_path): + """Test skips directories without config.json.""" + # Valid workspace + valid = tmp_path / "gemma-valid" + valid.mkdir() + (valid / "config.json").write_text('{}') + + # Directory without config.json (not a 
workspace) + invalid = tmp_path / "gemma-invalid" + invalid.mkdir() + (invalid / "other.txt").write_text("not a workspace") + + import os + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + matches = find_matching_workspaces("./gemma-") + assert len(matches) == 1 + assert matches[0].name == "gemma-valid" + finally: + os.chdir(old_cwd) + + def test_find_absolute_path(self, tmp_path): + """Test works with absolute paths.""" + ws = tmp_path / "model" + ws.mkdir() + (ws / "config.json").write_text('{}') + + matches = find_matching_workspaces(str(ws)) + assert len(matches) == 1 + + def test_find_not_explicit_path(self): + """Test returns empty list for non-explicit paths.""" + # HF model ID is not explicit path + matches = find_matching_workspaces("mlx-community/model") + assert matches == [] + + # Bare name is not explicit path + matches = find_matching_workspaces("my-model") + assert matches == [] + + def test_find_directory_scan(self, tmp_path): + """Test directory scan (existing directory, not workspace) finds all workspaces inside.""" + import os + + # Create multiple workspaces in tmp_path + for name in ["model-a", "model-b", "model-c"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{}') + + # Create a non-workspace directory + other = tmp_path / "not-a-workspace" + other.mkdir() + + old_cwd = os.getcwd() + try: + os.chdir(tmp_path) + # Pattern "." 
should find all 3 workspaces + matches = find_matching_workspaces(".") + assert len(matches) == 3 + names = [m.name for m in matches] + assert "model-a" in names + assert "model-b" in names + assert "model-c" in names + # Should NOT include non-workspace directory + assert "not-a-workspace" not in names + finally: + os.chdir(old_cwd) + + def test_find_directory_scan_absolute(self, tmp_path): + """Test directory scan with absolute path.""" + for name in ["ws1", "ws2"]: + ws = tmp_path / name + ws.mkdir() + (ws / "config.json").write_text('{}') + + # Use absolute path to directory + matches = find_matching_workspaces(str(tmp_path)) + assert len(matches) == 2 + + class TestWorkspaceSentinel: """Test workspace sentinel write/read primitives."""