diff --git a/.gitignore b/.gitignore
index 1487cdf..a33ade5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ openwebui311/bin/
*_report.json
test-img-collection/
small-img-collection
+benchmarks/reports/*.html
# Benchmark reports (ADR-013 Phase 0)
# These reports ARE tracked in git for historical data
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49effbc..df79049 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,97 @@
# Changelog
-## [2.0.4-beta.1] - WIP
+## [2.0.4-beta.3] - 2025-12-23
+
+### Added
+
+- **Benchmark Infrastructure v1.0 (ADR-013 Phase 0):**
+ - Template-based report generator: `benchmarks/generate_benchmark_report.py`
+ - Per-model statistics, per-test statistics, system health summary
+ - Schema validation: `benchmarks/validate_reports.py`
+ - Documentation: `benchmarks/README.md`, `benchmarks/TESTING.md`
+ - Quality tracking: Schema v0.2.0 with system_health (swap, RAM, zombies, quality_flags)
+ - Page-size fix: Corrected Apple Silicon 16KB page size (RAM values were 4x too low)
+ - Files: `tests_2.0/live/conftest.py`, `test_utils.py`, `test_vm_stat_parsing.py`
+
+- **Memory Timeline Visualization:**
+ - Interactive HTML visualizer: `benchmarks/tools/memplot.py` (500+ lines)
+ - Memory monitor enhanced: `benchmarks/tools/memmon.py` (memory pressure capture)
+ - Visual legend: Activity Monitor colors, memory pressure, test regions, model markers
+ - Documentation: Complete interpretation guide in `benchmarks/README.md`
+ - Schema learnings: Server test attribution problem + log-parsing solution documented
+ - File: `benchmarks/schemas/LEARNINGS-FOR-v1.0.md`
+
+### Fixed
+
+- **Server model switch log timing:** "Switched to model" now emitted only after successful load (past tense reflects completed action)
+ - File: `mlxk2/core/server_base.py:230`
+
+- **Unified model filter (Server + CLI):** Both `/v1/models` and `mlxk list` now use `build_model_object()` as single source of truth
+ - Filter: `healthy AND runtime_compatible` (no more code duplication)
+ - Framework gate: Non-MLX models (PyTorch, GGUF) now correctly marked `runtime_compatible=false`
+ - WebUI clients get consistent, runnable model lists
+ - Files: `mlxk2/core/server_base.py`, `mlxk2/output/human.py`, `mlxk2/operations/common.py`
+
+- **transformers 5.0 compatibility for vision models:** Removed `fix_mistral_regex` parameter from mlx-vlm load call
+ - transformers 5.0.0rc1 changed tokenizer initialization - `fix_mistral_regex` no longer accepted as kwarg
+ - Error was: `TypeError: _patch_mistral_regex() got multiple values for keyword argument 'fix_mistral_regex'`
+ - Removed deprecated parameter from vision model loading - all vision models now work with transformers 5.0
+ - File: `mlxk2/core/vision_runner.py:101`
+
+- **huggingface-hub 1.x compatibility:** Updated preflight test mocks for hub 1.x exception API changes
+ - Hub 1.x changed exception signatures: `GatedRepoError/RepositoryNotFoundError` now require `response` parameter
+ - Added `_create_mock_response()` helper to create proper httpx.Response objects for test mocks
+ - **Test-only changes** - preflight production code works unchanged with hub 0.x and 1.x
+ - **Result:** mlx-knife now fully compatible with mlx 0.30.x, mlx-lm 0.30.0, transformers 5.0, hub 1.x
+ - All 494 unit tests pass, vision models functional with newest dependencies
+ - Files: `tests_2.0/test_issue_30_preflight.py`, `mlxk2/core/vision_runner.py`
+
+- **EXIF GPS 0° coordinate handling:** Fixed truthiness checks in `VisionRunner._extract_exif` that incorrectly dropped valid GPS coordinates
+ - Equator (0° latitude) and Prime Meridian (0° longitude) now correctly preserved
+ - Changed latitude/longitude negation checks from `if lat` to `if lat is not None`
+ - Changed EXIF retention check from `not any([...])` to `all(x is None for x in [...])`
+ - Ensures 0.0 is treated as valid coordinate, not as missing data
+ - File: `mlxk2/core/vision_runner.py:259-262, 283`
+
+- **Framework/Type detection for non-mlx-community models (Issue #48):**
+ - `detect_framework()`: Now reads front-matter internally and checks config.json `quantization` key (MLX-specific)
+ - `detect_model_type()`: Added `probe` parameter and checks for `chat_template.json` file (reliable chat indicator)
+ - Removed redundant PR #42 code from server_base.py (cleaner architecture)
+ - Fixes: Locally converted quantized models now correctly show "MLX" + "chat" instead of "PyTorch" + "base"
+ - Files: `mlxk2/operations/common.py:118-157, 180-208`, `mlxk2/core/server_base.py:114-120`
+
+- **Video model detection and exclusion:**
+ - Video models (require PyTorch/Torchvision) now excluded from vision capability detection
+ - mlx-vlm only supports image vision models, not video models
+ - Video indicators: `video_preprocessor_config.json`, `temporal_patch_size`, `AutoVideoProcessor`
+ - Video models fall back to mlx-lm for text-only (consistent with vision architecture)
+ - Example: `mlx-community/MiMo-VL-7B-RL-bf16` now classified as "chat" (not "chat+vision")
+ - Files: `mlxk2/operations/common.py:211-266`, `mlxk2/core/capabilities.py:169-238`
+
+### Documentation
+
+- **mlx-vlm beta.3 install guidance:** Recommend upstream commit `c4ea290e47e2155b67d94c708c662f8ab64e1b37` until mlx-vlm 0.3.10 is released
+ - Files: `README.md`, `docs/SERVER-HANDBOOK.md`
+
+## [2.0.4-beta.2] - 2025-12-16
+
+**PyPI-only release** - Fixes Git dependency issue for PyPI compatibility. Not tagged on GitHub.
+
+### Fixed
+
+- **PyPI compatibility:** Changed `mlx-vlm` dependency from Git URL to PyPI version `mlx-vlm>=0.3.9`
+ - PyPI does not allow Git dependencies
+ - mlx-vlm 0.3.9 is available on PyPI
+ - File: `pyproject.toml:69`
+
+### Documentation
+
+- **Installation instructions:** Added Vision-specific installation to README.md
+ - Clear separation: Text models (Python 3.9+) vs Vision models (Python 3.10+)
+ - Installation command: `pip install mlx-knife[vision]`
+ - Updated all version references from 2.0.4-beta.1 → 2.0.4-beta.2
+
+## [2.0.4-beta.1] - 2025-12-16
**Focus:** Unix Pipe Integration + Vision Support + Memory-Aware Loading + Python 3.14
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 18e6e53..3d2a54f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -129,18 +129,15 @@ For detailed testing options, troubleshooting, and advanced workflows, see **[TE
### Before Submitting PRs
-Please ensure all tests pass locally:
-```bash
-# Complete test workflow
-ruff check mlxk2/ --fix # Fix code style
-mypy mlxk2/ # Check types
-pytest -v # Run all 2.0 tests
-```
+**All tests must pass:**
+- ✅ Code quality: `ruff check mlxk2/ --fix && mypy mlxk2/`
+- ✅ Unit tests: `pytest tests_2.0/ -v` (always required)
+- ✅ Live E2E tests: Required for model/inference changes
-Since we don't have CI/CD (MLX requires Apple Silicon), we rely on contributors to verify their changes locally. Please mention in your PR:
-- Which Python version you tested with
-- Which Mac model you tested on (M1/M2/M3)
-- Test results summary
+**PR requirements:**
+- State your Python version + Mac chip in PR description
+- For model/inference changes: Document which live tests you ran
+- **Important:** Unit tests alone are NOT sufficient - see **[TESTING.md](TESTING.md)** for why and how
## Python Version Requirements
diff --git a/README.md b/README.md
index 3c3862c..804ebb2 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@
-**Current Version: 2.0.4-beta.2** (Stable: 2.0.3)
+**Current Version: 2.0.4-beta.3** (Stable: 2.0.3)
-[](https://github.com/mzau/mlx-knife/releases)
+[](https://github.com/mzau/mlx-knife/releases)
[](https://www.apache.org/licenses/LICENSE-2.0)
[](https://www.python.org/downloads/)
[](https://support.apple.com/en-us/HT211814)
@@ -20,7 +20,7 @@
- **Model Information**: Detailed model metadata including quantization info
- **Download Models**: Pull models from HuggingFace with progress tracking
- **Run Models**: Native MLX execution with streaming and chat modes
-- **Vision Models**: Image analysis (Python 3.10+, alpha)
+- **Vision Models**: Image analysis (Python 3.10+, beta)
- **Unix Pipes**: Chain models via stdin/stdout - no temp files (beta)
- **Health Checks**: Verify model integrity and MLX runtime compatibility
- **Cache Management**: Clean up and organize your model storage
@@ -67,7 +67,7 @@ This license applies **only** to the `mlx-knife` code and **does not extend** to
MLX Knife has been comprehensively tested and verified on:
✅ **Python 3.9.6 - 3.14** - Text LLMs fully supported (mlx-lm 0.28.4+)
-✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+)
+✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
**Note:** Vision features require Python 3.10+. Native macOS Python 3.9.6 users need to upgrade (e.g., via Homebrew).
@@ -85,12 +85,17 @@ pip install mlx-knife
pip install mlx-knife[vision]
# Verify installation
-mlxk --version # → mlxk 2.0.3 (stable) or 2.0.4-beta.2 (dev)
+mlxk --version # → mlxk 2.0.3 (stable) or 2.0.4-beta.3 (dev)
```
**Python Requirements:**
- **Text models:** Python 3.9-3.14
-- **Vision models:** Python 3.10-3.14 (requires `mlx-vlm>=0.3.9`)
+- **Vision models:** Python 3.10-3.14 (requires `mlx-vlm>=0.3.9`; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
+
+**Beta.3 note:** Until mlx-vlm 0.3.10 is released, install the upstream commit before mlx-knife if you need the fix:
+```bash
+pip install "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37"
+```
### Development Installation
@@ -106,7 +111,7 @@ pip install -e ".[dev,test]"
pip install -e ".[dev,test,vision]"
# Verify installation
-mlxk --version # → mlxk 2.0.4-beta.2
+mlxk --version # → mlxk 2.0.4-beta.3
# Run tests and quality checks (before committing)
pytest -v
@@ -182,6 +187,100 @@ open index.html
| 🔒 `pipe mode` | **Beta feature** - Unix pipes with `mlxk run - ...`; requires `MLXK2_ENABLE_PIPES=1` |
+## Multi-Modal Support
+
+MLX Knife supports multiple input modalities beyond text. All multi-modal features share a **common output pattern**: model responses are followed by collapsible metadata tables for transparency and traceability.
+
+### Vision (Beta)
+
+Image analysis via the `--image` flag (CLI and server). Requires Python 3.10+.
+
+#### Requirements
+
+- **Python 3.10+** (mlx-vlm dependency)
+- **Installation:** `pip install mlx-knife[vision]`
+- **Backend:** mlx-vlm 0.3.9+ from PyPI
+- **Beta.3 note:** For upstream bugfixes, install commit `c4ea290e47e2155b67d94c708c662f8ab64e1b37` before mlx-knife:
+ ```bash
+ pip install "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37"
+ pip install mlx-knife[vision]
+ ```
+
+#### Usage
+
+```bash
+# Image analysis with custom prompt
+mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" \
+ --image photo.jpg "Describe what you see in detail"
+
+# Multiple images (space-separated or glob)
+mlxk run vision-model --image img1.jpg img2.jpg img3.jpg "Compare these images"
+mlxk run vision-model --image photos/*.jpg "Which images show outdoor scenes?"
+
+# Auto-prompt (default: "Describe the image.")
+mlxk run vision-model --image cat.jpg
+
+# Text-only on vision model (no --image flag)
+mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" "What is 2+2?"
+```
+
+#### Metadata Output Format
+
+When processing images, MLX Knife automatically appends metadata in a **collapsible table** (collapsed by default):
+
+```
+A beach with palm trees and clear blue water.
+
+
+📸 Image Metadata (2 images)
+
+| Image | Filename | Original | Location | Date | Camera |
+|-------|----------|----------|----------|------|--------|
+| 1 | image_abc123.jpeg | beach.jpg | 📍 32.79°N, 16.92°W | 📅 2023-12-06 12:19 | 📷 Apple iPhone SE |
+| 2 | image_def456.jpeg | mountain.jpg | 📍 32.87°N, 17.17°W | 📅 2023-12-10 15:42 | 📷 Apple iPhone SE |
+
+
+```
+
+**Metadata includes:**
+- **Image ID** → **Filename mapping** (identify which description belongs to which file)
+- **GPS coordinates** (latitude/longitude, if available in EXIF)
+- **Capture date/time** (ISO 8601 format)
+- **Camera model** (device info)
+
+**Privacy control:**
+
+EXIF extraction is **enabled by default**. To disable (e.g., for privacy-sensitive images):
+
+```bash
+export MLXK2_EXIF_METADATA=0
+mlxk run vision-model --image photo.jpg "describe"
+```
+
+**Output is the same for CLI and server** - metadata tables work in terminals, web UIs (nChat), and can be parsed programmatically.
+
+#### Limitations
+
+- **Non-streaming:** Vision runs always use batch mode (no streaming output)
+- **Image limits:** 5 images max per request, 20 MB per image, 50 MB total
+
+#### Server API
+
+Vision models work with OpenAI-compatible `/v1/chat/completions` endpoint using base64-encoded images:
+
+```bash
+curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+ "model": "llama-vision",
+ "messages": [{
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What is in this image?"},
+ {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
+ ]
+ }]
+}'
+```
+
## JSON API
@@ -211,28 +310,6 @@ mlxk show "Phi-3-mini" --json | jq '.data.model'
### Examples
-#### Pipe mode (Alpha: set `MLXK2_ENABLE_PIPES=1`)
-
-```bash
-# Read prompt from stdin and append trailing text (auto batch in pipes)
-echo "from stdin" | MLXK2_ENABLE_PIPES=1 mlxk run "" - "append extra context"
-
-# JSON interactive guard (no prompt) emits JSON error on stdout, exit!=0
-MLXK2_ENABLE_PIPES=1 mlxk run "" --json
-
-# Pipe list JSON into run for summarization
-MLXK2_ENABLE_PIPES=1 mlxk list --json \
- | MLXK2_ENABLE_PIPES=1 mlxk run "" - "Summarize the model list as a concise table."
-
-# Shortcut wrapper (same semantics)
-MLXK2_ENABLE_PIPES=1 mlx-run "" - "translate into german" < README.md
-```
-
-Notes:
-- Stdin requires `MLXK2_ENABLE_PIPES=1` (alpha gate). Without it, `-` is rejected.
-- When stdout is a pipe (non-TTY), streaming is disabled automatically to keep clean output.
-- Use full model IDs in place of ``; HF_HOME should point to your cache for live runs.
-
#### List Models
```bash
mlxk list --json
@@ -656,7 +733,7 @@ mlxk health --json | jq '.data.summary'
```
-## Hidden Alpha Features: `clone`, `push`, and pipe mode
+## Feature Gates: `clone`, `push` (Alpha), `pipe mode` (Beta)
### `clone` - Model Workspace Creation
@@ -710,38 +787,31 @@ These features are not final and may change or be removed in future releases.
Pipe mode is beta (feature complete) and requires `MLXK2_ENABLE_PIPES=1`. It lets `mlxk run` (and `mlx-run`) read stdin when you pass `-` as the prompt.
-- Gate: `MLXK2_ENABLE_PIPES=1` (will become default in a future stable release).
-- Auto-batch: When stdout is a pipe (non-TTY), streaming is disabled automatically for clean output.
-- Robust: Handles SIGPIPE and BrokenPipeError gracefully (`| head`, `| grep -m1` work correctly).
-- Scope: Applies to `mlxk run` and `mlx-run`; other commands unchanged.
+- **Status:** Beta (feature complete), API stable (syntax will not change)
+- **Gate:** `MLXK2_ENABLE_PIPES=1` (will become default in a future stable release)
+- **Auto-batch:** When stdout is a pipe (non-TTY), streaming is disabled automatically for clean output
+- **Robust:** Handles SIGPIPE and BrokenPipeError gracefully (`| head`, `| grep -m1` work correctly)
+- **Scope:** Applies to `mlxk run` and `mlx-run`; other commands unchanged
- Usage examples (replace `` with a cached MLX chat model):
```bash
# stdin + trailing text (batch when piped)
echo "from stdin" | MLXK2_ENABLE_PIPES=1 mlxk run "" - "append extra context"
-# JSON interactive guard (no prompt) → JSON error on stdout, exit 1
-MLXK2_ENABLE_PIPES=1 mlxk run "" --json
-
# list → run summarization
MLXK2_ENABLE_PIPES=1 mlxk list --json \
- | MLXK2_ENABLE_PIPES=1 mlxk run "" - "Summarize the model list as a concise table."
+ | MLXK2_ENABLE_PIPES=1 mlxk run "" - "Summarize the model list as a concise table." >my-hf-table.md
# Wrapper shorthand
MLXK2_ENABLE_PIPES=1 mlx-run "" - "translate into german" < README.md
+
+# Vision → Text chain: Photo tour review
+MLXK2_ENABLE_PIPES=1 mlxk run pixtral --image photos/*.jpg "Describe each picture" \
+ | MLXK2_ENABLE_PIPES=1 mlxk run qwen3 - \
+ "Write a tour review. Create a table with picture names, metadata, and descriptions." \
+ > tour-review.md
```
-Pipe mode API is stable.
-
-### `vision` - mlx-vlm (Python 3.10+, non-streaming)
-
-- Install extras: `pip install -e .[vision]` (requires `mlx-vlm>=0.3.9` from PyPI, Python 3.10+).
-- Backend: Uses `mlx-vlm` (vision); streaming is disabled for vision runs.
-- Usage:
- - Text-only on a vision model: `mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" "what is 2+2"`
- - Image + text: `mlxk run "" --image cat.jpg "describe the cat"`
- - Image-only (auto prompt): `mlxk run "" --image cat.jpg`
-
## Testing
@@ -817,7 +887,7 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.
Made with ❤️ by The BROKE team 
- Version 2.0.4-beta.2 | December 2025
+ Version 2.0.4-beta.3 | December 2025
💬 Web UI: nChat - lightweight chat interface •
🔮 Multi-node: BROKE Cluster
diff --git a/TESTING-DETAILS.md b/TESTING-DETAILS.md
index d07fb2c..a2f5d7d 100644
--- a/TESTING-DETAILS.md
+++ b/TESTING-DETAILS.md
@@ -847,7 +847,7 @@ MLXK2_LIVE_PUSH=1 \
---
-### Complete Test File Structure (2.0.4-beta.1)
+### Complete Test File Structure (2.0.4-beta.3)
```
tests_2.0/
@@ -885,7 +885,8 @@ tests_2.0/
│ ├── test_server_e2e.py # Server E2E tests with TEXT models (ADR-011 + Portfolio Separation, parametrized: text_XX)
│ ├── test_streaming_parity.py # Streaming vs batch parity tests (Issue #20, ADR-011, parametrized)
│ ├── test_vision_e2e_live.py # Vision CLI E2E tests with real models (ADR-012, 5 deterministic vision queries)
-│ └── test_vision_server_e2e.py # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX)
+│ ├── test_vision_server_e2e.py # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX)
+│ └── test_vm_stat_parsing.py # vm_stat output parsing validation (macOS memory metrics)
├── test_adr004_error_logging.py # ADR-004 error logging and redaction (tokens, paths)
├── test_capabilities.py # Probe/Policy architecture (ADR-012, ADR-016, Session 18-19, 45 tests)
├── test_cli_log_json_flag.py # CLI --log-json flag behavior and JSON log format
diff --git a/TESTING.md b/TESTING.md
index 5d5903e..cb027c5 100644
--- a/TESTING.md
+++ b/TESTING.md
@@ -19,6 +19,17 @@ For current test counts, version-specific details, and complete file listings, s
- Delete operations fail if not in test cache (`MLXK2_STRICT_TEST_DELETE=1`)
- Live tests never modify user cache without explicit environment variables
+**Unit Test Limitations:**
+
+MLX Knife has two test categories:
+1. **Unit tests** (~500 tests, fast, mocked) - verify code structure
+2. **Live E2E tests** (real models, slow) - verify actual functionality
+
+**Why both are needed:**
+When dependencies like `transformers` or `mlx-lm` update their APIs, unit tests (which mock these libraries) continue to pass, but real model loading breaks. Only live E2E tests catch these issues.
+
+**Example:** transformers 5.0 changed tokenizer initialization - unit tests passed (mocked API), but vision models failed to load in production. Live E2E tests caught the issue immediately.
+
## Quick Start
```bash
diff --git a/benchmarks/README.md b/benchmarks/README.md
index e44417c..437ee63 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,68 +1,288 @@
# MLX Knife Benchmarks
-**Status:** Phase 0 - Organic Data Collection
+**Status:** Phase 0 - Organic Data Collection (WIP)
-## Architecture
+## What's Here?
-This directory tracks empirical performance and compatibility data from mlx-knife's test suite.
+This directory contains benchmark infrastructure for mlx-knife:
+- Empirical performance and compatibility data from E2E tests
+- Tools for analysis and visualization
+- Schema definitions for structured reports
-### Phase 0 Goals (2.0.3+)
+## Directory Structure
+
+```
+benchmarks/
+├── reports/ # JSONL test reports + Markdown analyses
+│ ├── 2025-12-20-v2.0.4b3.jsonl # Raw data (one file per test run)
+│ └── BENCHMARK-v1.0-*.md # Generated analysis reports
+├── schemas/ # JSON Schema definitions
+│ ├── report-v0.1.0.schema.json # legacy schema
+│ ├── report-v0.2.0.schema.json # Current schema
+│ └── report-current.schema.json # Symlink → current schema
+├── tools/ # Standalone tools
+│ ├── memmon.py # Memory monitor (background sampling)
+│ └── memplot.py # Memory timeline visualizer
+├── generate_benchmark_report.py # Report generator (Template v1.0)
+├── validate_reports.py # Schema validation
+├── README.md # ← You are here
+└── TESTING.md # Benchmark handbook (How-To)
+```
+
+## Tools
+
+| Tool | Purpose |
+|------|---------|
+| `generate_benchmark_report.py` | JSONL → Markdown report (Template v1.0) |
+| `validate_reports.py` | Schema validation of JSONL files |
+| `tools/memmon.py` | Memory monitoring during test runs |
+| `tools/memplot.py` | Interactive memory timeline visualization (HTML) |
+
+## Schema
+
+**Current:** v0.2.0 (Phase 0 - Test Infrastructure)
+
+| Version | Release | Content |
+|---------|---------|---------|
+| v0.1.0 | 2.0.3 | Minimal: test, outcome, duration, model |
+| v0.2.0 | 2.0.4 | + hardware_profile, system_health, quality_flags |
+| v1.0.0 | Future | Model benchmarks (mlxk-benchmark package) |
+
+**Schema Strategy:** No v0.3.x planned. v0.2.0 → v1.0.0 directly.
+- v0.x = Test infrastructure ("Was the test run clean?")
+- v1.x = Model benchmarks ("How good is the model?")
+
+See `schemas/LEARNINGS-FOR-v1.0.md` for details.
+
+## Current Baseline
+
+**Report:** `reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md`
+
+- Version: 2.0.4-beta.3
+- Hardware: Mac14,13 (M2 Max, 64 GB)
+- Tests: 141/162 passed, 19.5 min
+- Quality: 100% clean (0 MB swap, 0 zombies)
+
+## Phase 0 Goals
1. **Collect data organically** from E2E tests
2. **No perfect schema** - schema evolves with data
3. **Git-tracked reports** - historical trends
-4. **Foundation for future** - community contributions, public database
+4. **Foundation for Phase 1** - mlxk-benchmark package
-### Directory Structure
+## Memory Timeline Visualization
-- `reports/` - JSONL test reports (one file per release)
-- `schemas/` - JSON Schema definitions (versioned)
+**Tool:** `tools/memplot.py` | **Created:** Session 45 (2025-12-21)
-### Current Schema
-
-**Version:** 0.2.0 (Phase 0 - Scheduling-Enhanced)
-
-- **v0.1.0** (2.0.3+): Minimal schema - basic performance metrics
-- **v0.2.0** (2.0.4+): Hardware profiling + detailed metrics for cluster scheduling
- - `system.hardware_profile`: Mac model, cores, Metal version
- - `performance.*_time_s`: model_load, time_to_first_token, cleanup
- - `system_health`: swap, zombies, quality_flags
- - Backward compatible: v0.1.0 reports still valid
-
-**Schema Files:**
-- `schemas/report-current.schema.json` → always points to latest version
-- `schemas/report-v0.2.schema.json` → current schema (2.0.4+)
-- `schemas/report-v0.1.schema.json` → legacy schema (2.0.3)
-
-**Required fields:**
-- `schema_version`, `timestamp`, `mlx_knife_version`, `test`, `outcome`
-
-**Optional sections:**
-- `model` - Model metadata
-- `performance` - tokens/sec, RAM usage
-- `stop_tokens` - ADR-009 validation data
-- `system` - Platform info
-- `metadata` - Extensible (anything)
-
-### Generating Reports
+### Quick Start
```bash
-# During E2E tests
-pytest -m live_e2e tests_2.0/live/ \
- --report-output benchmarks/reports/$(date +%Y-%m-%d)-v$(mlxk --version | cut -d' ' -f2).jsonl
+# Collect data (memmon runs in background)
+python benchmarks/tools/memmon.py --output memory.jsonl -- \
+ pytest -m live_e2e tests_2.0/live/ --report-output benchmark.jsonl
+
+# Generate interactive HTML
+python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl -o timeline.html
```
-### Schema Evolution
+### Visual Legend
-As we collect more data, the schema will evolve:
-- New fields added (backward compatible)
-- Optional → Required (when stable)
-- Breaking changes documented in `schemas/MIGRATIONS.md`
+#### Main Graph: RAM Free (GB)
-### Future Phases
+**Blue line with colored markers:**
+- 🟢 **Green markers:** Healthy (≥32 GB free, ≥50% of 64 GB)
+- 🟠 **Orange markers:** Warning (16-32 GB free, 25-50%)
+- 🔴 **Red markers:** Critical (<16 GB free, <25%)
-- **Phase 1 (2.1+):** Schema formalization, validation tooling
-- **Phase 2 (2.2+):** `mlxk report` CLI for manual submissions
-- **Phase 3 (2.3+):** Public database, community contributions
+**Dashed threshold lines:**
+- **Green line (32 GB):** 50% threshold - system healthy
+- **Orange line (16 GB):** 25% threshold - warning level
-See `docs/ADR/ADR-013-Community-Model-Quality-Database.md` for full roadmap.
+#### Background Rectangles: Test Regions
+
+**Gray (rgba(200, 200, 200, 0.3)):**
+- Model tests that load an LLM model
+- Example: `test_run_command[text_00]`, `test_chat_completion[vision_01]`
+- **Meaning:** Model is loaded in RAM during this time
+
+**Light Blue (rgba(173, 216, 230, 0.2)):**
+- Infrastructure tests without model
+- Example: `test_portfolio_discovery`, `test_health_check`
+- **Meaning:** No model loaded, only test infrastructure active
+
+⚠️ **Known limitation (v0.2.0):** Server tests appear as "light blue" even when loading models (LocalServer fixture doesn't record model metadata). Recognizable by: high RAM usage + long duration in blue region. Example: `test_text_request_still_works_on_vision_model` (57 GB used, 16s duration).
+
+#### Memory Pressure Overlay
+
+**Yellow (rgba(255, 204, 0, 0.15)):**
+- macOS Memory Pressure: WARN
+- Source: `sysctl kern.memorystatus_vm_pressure_level = 2`
+
+**Red (rgba(255, 59, 48, 0.15)):**
+- macOS Memory Pressure: CRITICAL
+- Source: `sysctl kern.memorystatus_vm_pressure_level = 4`
+- **Meaning:** System begins swapping, performance degradation
+
+**White/Transparent:**
+- macOS Memory Pressure: NORMAL (level = 1)
+
+#### Labels
+
+**Top (90° rotated, black):**
+- Model names at each model switch
+- Example: `DeepHermes-3-Mistral`, `pixtral-12b-8bit`
+- Position: Left-aligned with test start
+
+**Bottom (90° rotated, gray):**
+- Test names for each test (model + infrastructure)
+- Example: `test_run_command`, `test_chat_completion`
+- Position: Left-aligned with test start
+
+**Vertical helper lines:**
+- Thin gray lines at each test start
+- Help correlate labels with timeline
+
+#### Secondary Y-Axis: Swap Used (MB)
+
+**Red line (right axis):**
+- Only visible when swap > 0 MB
+- **Meaning:** System paging RAM to SSD → performance loss
+- **Normal:** 0 MB
+- **Problematic:** >100 MB
+
+### Interpretation Patterns
+
+**Typical model load:**
+```
+Pattern: RAM Free drops suddenly (e.g., 52 GB → 28 GB)
+Duration: 2-5 seconds
+Color: Gray rectangle begins
+Label: Model name appears at top
+→ Model loaded into RAM (24 GB)
+```
+
+**Typical model unload:**
+```
+Pattern: RAM Free rises suddenly (e.g., 28 GB → 52 GB)
+Duration: <1 second
+Color: Gray rectangle ends (or switches to next)
+Label: New model name (or none)
+→ Model removed from RAM
+```
+
+**Memory pressure without swap:**
+```
+Pattern: Yellow/Red background WITHOUT swap line
+RAM Free: Still >10 GB
+→ macOS preparing to swap, not yet active
+→ Often during large model loads (temporary)
+```
+
+**Memory pressure with swap:**
+```
+Pattern: Red background + Red swap line rises
+RAM Free: <10 GB
+Swap: >100 MB
+→ System actually at limit
+→ Performance significantly worse
+→ Typical: Multiple large models in short time
+```
+
+**Infrastructure test with high RAM usage:**
+```
+Pattern: Light blue rectangle + RAM drops significantly (>20 GB)
+Duration: >10 seconds
+Example: 57 GB used in test_text_request_still_works_on_vision_model
+→ ⚠️ Schema bug: Server test loads model but "model": null
+→ Should be gray, not light blue
+→ Fix: v1.0 schema with log parsing
+```
+
+### Data Sources
+
+**RAM Free:**
+- Source: `vm_stat` (macOS native)
+- Calculation: `(free + inactive + purgeable + speculative) * page_size / 1e9`
+- Sample rate: 500ms (2 samples/second)
+
+**Memory Pressure:**
+- Source: `sysctl kern.memorystatus_vm_pressure_level`
+- Values: 1=NORMAL, 2=WARN, 4=CRITICAL
+- Sample rate: 500ms (synchronized with RAM)
+
+**Swap Used:**
+- Source: `sysctl vm.swapusage`
+- Unit: MB
+- Sample rate: 500ms
+
+**Test Metadata:**
+- Source: Benchmark JSONL (pytest-json-report format)
+- Fields: `timestamp`, `duration`, `test`, `model` (optional), `outcome`
+- Correlation: ISO timestamp → Unix timestamp → elapsed seconds
+
+### Known Limitations (v0.2.0)
+
+1. **Model load/unload events missing**
+ - Gray regions show "test with model", not "model is loaded"
+ - Pytest runs through ALL models 4x → each model loaded/unloaded 4x
+ - Regions overlap visually even though the tests run sequentially
+ - **Fix planned:** v1.0 schema with explicit events
+
+2. **Server tests without model attribution**
+ - Server tests (LocalServer fixture) load models internally
+ - Appear as "infrastructure" (light blue) instead of "model" (gray)
+ - Recognizable: High RAM + long duration in blue region
+ - **Fix planned:** Log parsing in the v1.0 schema (no v0.3.x is planned)
+
+3. **Dense test sequences**
+ - Tests shorter than 500ms sample rate → no coloring
+ - Typical: Fast infrastructure tests (<100ms)
+ - **Workaround:** Test labels show all tests
+
+4. **Label overlap**
+ - Many tests in short time (>10 tests/min)
+ - Labels may overlap (90° rotated)
+ - **Mitigation:** Zoom for detailed view
+ - **Future:** Adaptive label density or collapsing
+
+### Interactive Features
+
+- **Zoom & Pan:** Mouse wheel (vertical), Shift+wheel (horizontal), click+drag
+- **Range Slider:** Quick navigation in long (>20 min) timelines
+- **Hover:** X-axis unified mode shows all values at same time
+
+### Future Extensions (Ideas)
+
+**For plot:**
+- [ ] Embedded legend in plot (not external file)
+- [ ] Toggle show/hide infrastructure tests
+- [ ] Hover shows full test names (not truncated)
+- [ ] Color-blind mode (alternative palette)
+
+**For schema v1.0:**
+- [ ] Model load/unload events → precise "in RAM" regions
+- [ ] Log parsing for server tests → correct attribution
+- [ ] GPU activity (Metal performance)
+- [ ] Net T/S (tokens/second, pure inference)
+
+**For analysis:**
+- [ ] Automatic anomaly detection (memory leaks, zombies)
+- [ ] Per-model memory profiling (min/max/avg RAM)
+- [ ] Scheduling optimization (avoid model-switch overlap)
+
+---
+
+## Roadmap
+
+| Phase | Release | Description |
+|-------|---------|-------------|
+| **Phase 0** | 2.0.3-2.0.4 | Organic Data Collection ✅ |
+| Phase 1 | 2.1+ | `mlxk-benchmark` package (separate tool) |
+| Phase 2 | 2.2+ | Report aggregation, hardware correlation |
+| Phase 3 | 2.3+ | Public database, community contributions |
+
+## Further Documentation
+
+- **[TESTING.md](TESTING.md)** - Benchmark handbook (How-To)
+- **[schemas/LEARNINGS-FOR-v1.0.md](schemas/LEARNINGS-FOR-v1.0.md)** - Learnings for Phase 1
+- **[docs/ADR/ADR-013-Community-Model-Quality-Database.md](../docs/ADR/ADR-013-Community-Model-Quality-Database.md)** - Architecture vision
diff --git a/benchmarks/TESTING.md b/benchmarks/TESTING.md
index 59e44f2..6d8d063 100644
--- a/benchmarks/TESTING.md
+++ b/benchmarks/TESTING.md
@@ -1,155 +1,263 @@
-# Testing with Benchmark Reports (ADR-013 Phase 0)
+# Benchmark Handbook
-This document explains how to generate benchmark reports during E2E tests.
+Step-by-step guide for running benchmarks and generating reports.
+
+## Quick Start
+
+```bash
+# 1. Run E2E tests with report output
+pytest -m live_e2e tests_2.0/live/ \
+ --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.4b3.jsonl
+
+# 2. Generate analysis report
+python benchmarks/generate_benchmark_report.py
+
+# 3. View results
+cat benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-*.md
+```
+
+---
+
+## Running Benchmarks
+
+### Basic Test Run
+
+```bash
+# Run all E2E tests, output to JSONL
+pytest -m live_e2e tests_2.0/live/ \
+ --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.4b3.jsonl
+```
+
+### With Custom HuggingFace Cache
+
+```bash
+HF_HOME=/path/to/huggingface/cache \
+ pytest -m live_e2e tests_2.0/live/ -v \
+ --report-output benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+### With Memory Monitoring
+
+```bash
+# Run memmon in parallel to capture memory profile
+python benchmarks/tools/memmon.py \
+ --output benchmarks/reports/2025-12-20-memory.jsonl \
+ -- pytest -m live_e2e tests_2.0/live/ \
+ --report-output benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+---
## Generating Reports
-### Basic Usage
+### Auto-Detect Latest JSONL
```bash
-# Run E2E tests with reporting
-pytest -m live_e2e tests_2.0/live/ \
- --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.3.jsonl
+python benchmarks/generate_benchmark_report.py
+# → Finds most recent .jsonl in benchmarks/reports/
+# → Outputs: BENCHMARK-v1.0-{version}-{date}.md
```
-### With Full Environment
+### Explicit Input File
```bash
-# Use specific HF cache + generate reports
-HF_HOME=/Volumes/mz-SSD/huggingface/cache \
- pytest -m live_e2e tests_2.0/live/ -v \
- --report-output benchmarks/reports/2025-11-16-v2.0.3.jsonl
+python benchmarks/generate_benchmark_report.py \
+ benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
```
-## Adding Report Data to Tests
+### With Comparison (Regression Detection)
-Tests can add structured data to reports using `request.node.user_properties`:
+```bash
+python benchmarks/generate_benchmark_report.py \
+ benchmarks/reports/2025-12-20-new.jsonl \
+ --compare benchmarks/reports/2025-12-19-old.jsonl
+```
+
+Output includes:
+- Duration change (e.g., 20.5 min → 19.7 min, -3.8%)
+- Per-model changes with Old/Δ/Change columns
+- Per-test median time changes
+- Status indicators: ⚠️ (>5% slower), ✅ (>1% faster)
+
+### Custom Output Location
+
+```bash
+python benchmarks/generate_benchmark_report.py \
+ --output /tmp/my-report.md \
+ benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+---
+
+## Memory Monitoring
+
+### Standalone Monitor (Fixed Duration)
+
+```bash
+python benchmarks/tools/memmon.py \
+ --duration 60 \
+ --interval 200 \
+ --output memory.jsonl
+```
+
+### Wrap Any Command
+
+```bash
+python benchmarks/tools/memmon.py \
+ --output memory.jsonl \
+ -- ./my-benchmark-script.sh
+```
+
+### Output Format
+
+```jsonl
+{"ts": 1734567890.1, "ram_free_gb": 45.2, "swap_used_mb": 0, "elapsed_s": 0.2}
+{"ts": 1734567890.3, "ram_free_gb": 42.1, "swap_used_mb": 0, "elapsed_s": 0.4}
+...
+{"summary": {"ram_free_min_gb": 21.3, "ram_free_max_gb": 45.2, "swap_max_mb": 0}}
+```
+
+### Correlating with Test Results
+
+Memory samples can be correlated with test results via timestamps:
```python
-def test_example(model_info, request):
- # ... test logic ...
+# Test entry has: timestamp (end time), duration
+# Calculate: started_at = timestamp - duration
- # Add model info
- request.node.user_properties.append(("model", {
- "id": model_info["id"],
- "size_gb": model_info["ram_needed_gb"],
- "family": extract_family(model_info["id"]),
- "variant": extract_variant(model_info["id"])
- }))
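+# Memory samples carry epoch seconds ("ts"), so convert the ISO timestamp first
+from datetime import datetime
+test_end = datetime.fromisoformat(entry["timestamp"]).timestamp()
+test_start = test_end - entry["duration"]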
- # Add performance metrics
- request.node.user_properties.append(("performance", {
- "tokens_per_sec": measure_tokens_per_sec(response),
- "ram_peak_mb": get_peak_ram_usage(),
- "duration_s": response.elapsed
- }))
-
- # Add stop token data (ADR-009)
- request.node.user_properties.append(("stop_tokens", {
- "configured": model_stop_tokens,
- "detected": find_stop_tokens_in_response(response),
- "workaround": get_workaround_name(model_info["id"]),
- "leaked": check_for_leaked_tokens(response)
- }))
-
- # Add system info (optional)
- request.node.user_properties.append(("system", {
- "platform": platform.system().lower(),
- "platform_version": get_os_version(),
- "python_version": platform.python_version(),
- "mlx_version": get_mlx_version(),
- "hardware": get_hardware_model(),
- "ram_total_gb": get_total_ram_gb()
- }))
-
- # Anything else goes to metadata
- request.node.user_properties.append(("custom_metric", "value"))
+# Find matching memory samples
+matching = [s for s in samples if test_start <= s["ts"] <= test_end]
```
-## Structured Sections
+---
-Reports have predefined structured sections that map to schema fields:
+## Validating Reports
-| user_properties key | Maps to report field | Description |
-|---------------------|----------------------|-------------|
-| `model` | `model` object | Model metadata (id, size, family, variant) |
-| `performance` | `performance` object | Performance metrics (tokens/sec, RAM, duration) |
-| `stop_tokens` | `stop_tokens` object | Stop token behavior (ADR-009 validation) |
-| `system` | `system` object | Platform information (OS, Python, MLX, hardware) |
-| _anything else_ | `metadata` object | Extensible catch-all for experiments |
-
-## Schema Validation
+### Validate Against Current Schema
```bash
-# Validate reports against schema (requires jsonschema)
-pip install jsonschema
-
-# Validate all reports
-for report in benchmarks/reports/*.jsonl; do
- echo "Validating $report..."
- cat "$report" | while read line; do
- echo "$line" | python3 -c "
-import sys, json
-from jsonschema import validate
-
-with open('benchmarks/schemas/report-v0.1.schema.json') as f:
- schema = json.load(f)
-
-report = json.load(sys.stdin)
-validate(instance=report, schema=schema)
-print('✓ Valid')
-"
- done
-done
+python benchmarks/validate_reports.py benchmarks/reports/*.jsonl
```
-## Example Report
+### Validate Specific File
+```bash
+python benchmarks/validate_reports.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+---
+
+## Schema Reference
+
+### Current Schema: v0.2.0
+
+Required fields:
```json
{
- "schema_version": "0.1.0",
- "timestamp": "2025-11-16T10:30:00Z",
- "mlx_knife_version": "2.0.3",
- "test": "tests_2.0/live/test_stop_tokens_live.py::test_stop_tokens[phi-3-mini]",
+ "schema_version": "0.2.0",
+ "timestamp": "2025-12-20T02:26:10.722510+00:00",
+ "mlx_knife_version": "2.0.4-beta.3",
+ "test": "tests_2.0/live/test_cli_e2e.py::test_run_command[discovered_00]",
"outcome": "passed",
- "duration": 12.3,
+ "duration": 12.3
+}
+```
+
+Optional sections:
+```json
+{
"model": {
- "id": "mlx-community/phi-3-mini-4k-instruct",
- "size_gb": 2.8,
- "family": "phi-3",
- "variant": "mini-4k-instruct"
+ "id": "mlx-community/Qwen3-32B-4bit",
+ "size_gb": 17.2,
+ "family": "qwen3"
},
- "performance": {
- "tokens_per_sec": 45.2,
- "ram_peak_mb": 3200,
- "prompt_tokens": 15,
- "completion_tokens": 42
+ "system": {
+ "hardware_profile": {
+ "model": "Mac14,13",
+ "cores_physical": 12
+ }
},
- "stop_tokens": {
- "configured": ["<|end|>", "<|endoftext|>"],
- "detected": ["<|end|>"],
- "workaround": "phi-3-dual-eos",
- "leaked": false
+ "system_health": {
+ "swap_used_mb": 0,
+ "ram_free_gb": 45.2,
+ "zombie_processes": 0,
+ "quality_flags": ["clean"]
}
}
```
-## Analyzing Reports
+### Quality Flags
-See `reports/README.md` for analysis examples (jq queries, statistics, trends).
+| Flag | Meaning | Threshold |
+|------|---------|-----------|
+| `clean` | Test ran without issues | swap=0, zombies=0 |
+| `degraded_swap` | Memory pressure detected | swap > 100 MB |
+| `degraded_zombies` | Zombie processes present | zombies > 0 |
+
+---
## Best Practices
1. **File Naming:** Use `YYYY-MM-DD-vX.Y.Z.jsonl` format
2. **Append Only:** Never edit existing reports (historical data)
3. **Commit Reports:** Reports are git-tracked for trend analysis
-4. **Schema Version:** Always include `schema_version` for evolution tracking
-5. **Optional Data:** Only add what you can measure reliably
-6. **No PII:** Never include personal information in reports
+4. **Clean State:** Reboot before important benchmark runs
+5. **Close Apps:** Minimize background processes during tests
+6. **Multiple Runs:** Run 2-3 times, compare for consistency
-## Future Enhancements (Phase 1+)
+---
-- Automatic validation during `pytest --report-output`
-- Performance regression detection
-- Report comparison tools (`mlxk report diff`)
-- Schema migration utilities
+## Troubleshooting
+
+### "No JSONL files found"
+
+```bash
+# Check if reports exist
+ls -la benchmarks/reports/*.jsonl
+
+# Run tests with output
+pytest -m live_e2e tests_2.0/live/ --report-output benchmarks/reports/test.jsonl
+```
+
+### Schema Validation Fails
+
+```bash
+# Check schema version in file
+head -1 benchmarks/reports/file.jsonl | jq .schema_version
+
+# Validate manually
+python -c "
+import json
+from jsonschema import validate
+with open('benchmarks/schemas/report-current.schema.json') as f:
+ schema = json.load(f)
+with open('benchmarks/reports/file.jsonl') as f:
+ for line in f:
+ validate(json.loads(line), schema)
+print('OK')
+"
+```
+
+### Comparison Shows "N/A"
+
+Model not found in comparison file. Check:
+- Same models tested in both runs?
+- Model ID spelling matches exactly?
+
+---
+
+## Future: Phase 1 (mlxk-benchmark)
+
+Phase 1 will introduce a standalone benchmark package:
+
+```bash
+pip install mlxk-benchmark
+mlx-benchmark --model llama-3.2-3b --contribute
+```
+
+No pytest, no fixtures, no conftest.py - just simple CLI for community contributions.
+
+See `schemas/LEARNINGS-FOR-v1.0.md` for design notes.
diff --git a/benchmarks/generate_benchmark_report.py b/benchmarks/generate_benchmark_report.py
new file mode 100644
index 0000000..e49d79f
--- /dev/null
+++ b/benchmarks/generate_benchmark_report.py
@@ -0,0 +1,575 @@
+#!/usr/bin/env python3
+"""Generate benchmark analysis report from JSONL test data.
+
+Reads JSONL benchmark reports and generates structured Markdown analysis.
+
+Usage:
+ # Auto-detect latest JSONL
+ python benchmarks/generate_benchmark_report.py
+
+ # Explicit file
+ python benchmarks/generate_benchmark_report.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+
+ # With comparison
+ python benchmarks/generate_benchmark_report.py new.jsonl --compare old.jsonl
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+try:
+ import jsonschema
+except ImportError:
+ print("Error: jsonschema not installed. Install with: pip install jsonschema")
+ sys.exit(1)
+
+
+# Template version: embedded in the report header and in the auto-generated
+# output filename.
+TEMPLATE_VERSION = "1.0"
+# Both paths are relative, so they resolve against the current working directory.
+REPORTS_DIR = Path("benchmarks/reports")
+SCHEMA_PATH = Path("benchmarks/schemas/report-current.schema.json")
+
+
+def load_schema() -> dict:
+    """Load the current benchmark report JSON schema.
+
+    Returns:
+        The parsed schema read from ``SCHEMA_PATH``.
+
+    Prints an error and exits the process with status 1 when the schema file
+    does not exist.
+    """
+    if not SCHEMA_PATH.exists():
+        print(f"❌ Schema not found: {SCHEMA_PATH}")
+        sys.exit(1)
+
+    # JSON is UTF-8 by specification; do not depend on the locale's default.
+    with open(SCHEMA_PATH, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def validate_jsonl(data: List[dict], schema: dict, filepath: Path) -> bool:
+    """Check every loaded report entry against the schema.
+
+    Args:
+        data: Parsed entries, in file order.
+        schema: JSON schema each entry must satisfy.
+        filepath: Origin of ``data``; only used in the failure message.
+
+    Returns:
+        True when all entries validate. Otherwise False, after printing the
+        first five collected validation messages and a count of the rest.
+    """
+    max_shown = 5
+    problems = []
+    for position, record in enumerate(data, start=1):
+        try:
+            jsonschema.validate(instance=record, schema=schema)
+        except jsonschema.ValidationError as exc:
+            problems.append(f"Line {position}: {exc.message}")
+
+    if not problems:
+        return True
+
+    print(f"❌ Validation failed for {filepath}")
+    for problem in problems[:max_shown]:
+        print(f" {problem}")
+    hidden = len(problems) - max_shown
+    if hidden > 0:
+        print(f" ... and {hidden} more errors")
+    return False
+
+
+def load_jsonl(filepath: Path) -> List[dict]:
+    """Load a JSONL file into a list of parsed entries.
+
+    Blank (whitespace-only) lines are skipped.
+
+    Args:
+        filepath: File holding one JSON document per line.
+
+    Returns:
+        The parsed documents, in file order.
+
+    Raises:
+        json.JSONDecodeError: A non-blank line is not valid JSON. The message
+            names the file and the 1-based line number of the offending line.
+    """
+    entries = []
+    # Explicit UTF-8 so non-ASCII content loads the same under any locale.
+    with open(filepath, encoding="utf-8") as f:
+        for lineno, raw in enumerate(f, start=1):
+            text = raw.strip()
+            if not text:
+                continue
+            try:
+                entries.append(json.loads(text))
+            except json.JSONDecodeError as exc:
+                # Same exception type as before, but pointing at the bad line.
+                raise json.JSONDecodeError(
+                    f"{filepath}:{lineno}: {exc.msg}", exc.doc, exc.pos
+                ) from exc
+    return entries
+
+
+def find_latest_jsonl() -> Optional[Path]:
+    """Return the most recently modified ``*.jsonl`` file in ``REPORTS_DIR``.
+
+    Returns:
+        The newest file by modification time, or None when the directory is
+        missing or contains no JSONL files.
+    """
+    if REPORTS_DIR.exists():
+        candidates = list(REPORTS_DIR.glob("*.jsonl"))
+        if candidates:
+            return max(candidates, key=lambda path: path.stat().st_mtime)
+    return None
+
+
+def extract_version_from_filename(filepath: Path) -> Optional[str]:
+    """Extract the version from a filename like '2025-12-20-v2.0.4b3.jsonl'.
+
+    The version is the text after the first "-v" that is directly followed by
+    a digit, up to the next "-" (or the end of the stem). Requiring a digit
+    keeps names such as "nightly-verbose-v2.1.0" from yielding "erbose".
+
+    Args:
+        filepath: Path whose stem is inspected.
+
+    Returns:
+        The version string (e.g. "2.0.4b3"), or None when the stem contains
+        no version marker.
+    """
+    import re  # local import: keeps the file's top-level import block untouched
+
+    match = re.search(r"-v(\d[^-]*)", filepath.stem)
+    return match.group(1) if match else None
+
+
+def calculate_statistics(data: List[dict]) -> Dict:
+    """Calculate all benchmark statistics from JSONL data.
+
+    Args:
+        data: Report entries as loaded from a JSONL file. Entries are read via
+            the keys "outcome", "duration", "test", "model", "system_health",
+            "system", "schema_version" and "mlx_knife_version".
+
+    Returns:
+        A dict with overall counts and duration, swap/RAM/zombie aggregates,
+        quality-flag counts, the hardware profile of the first entry, per-model
+        statistics ("models", keyed by model id) and per-test statistics
+        ("tests", keyed by test function name without parametrization).
+
+    Health metrics that are absent from an entry are ignored rather than
+    counted as 0, so a missing "ram_free_gb" no longer drags the minimum to 0.
+    """
+    import statistics
+
+    # Separate by outcome
+    passed_tests = [e for e in data if e.get("outcome") == "passed"]
+    skipped_tests = [e for e in data if e.get("outcome") == "skipped"]
+    passed_with_model = [e for e in passed_tests if "model" in e]
+    passed_without_model = [e for e in passed_tests if "model" not in e]
+
+    # System health metrics (optional for backward compatibility with older schemas)
+    swap_values = []
+    ram_values = []
+    zombie_values = []
+    quality_flags = []
+
+    for e in data:
+        health = e.get("system_health")
+        if health is None:
+            continue
+        # Only record metrics that were actually measured.
+        if health.get("swap_used_mb") is not None:
+            swap_values.append(health["swap_used_mb"])
+        if health.get("ram_free_gb") is not None:
+            ram_values.append(health["ram_free_gb"])
+        if health.get("zombie_processes") is not None:
+            zombie_values.append(health["zombie_processes"])
+        quality_flags.append(health.get("quality_flags", ["unknown"]))
+
+    clean_count = sum(1 for flags in quality_flags if flags == ["clean"])
+    degraded_swap = sum(1 for flags in quality_flags if "degraded_swap" in flags)
+    degraded_zombies = sum(1 for flags in quality_flags if "degraded_zombies" in flags)
+
+    # Per-model statistics
+    model_stats = {}
+    for entry in passed_with_model:
+        model = entry["model"]
+        model_id = model["id"]
+        if model_id not in model_stats:
+            model_stats[model_id] = {
+                "id": model_id,
+                # Treat a missing size as 0.0 so report formatting still works.
+                "size_gb": model.get("size_gb", 0.0),
+                "count": 0,
+                "total_time": 0,
+                "ram_min": float("inf"),
+                "ram_max": 0,
+                "swap_max": 0,
+                "zombies_max": 0,
+            }
+
+        stats = model_stats[model_id]
+        stats["count"] += 1
+        stats["total_time"] += entry["duration"]
+
+        # Handle optional system_health (backward compatibility)
+        health = entry.get("system_health") or {}
+        ram = health.get("ram_free_gb")
+        if ram is not None:
+            stats["ram_min"] = min(stats["ram_min"], ram)
+            stats["ram_max"] = max(stats["ram_max"], ram)
+        swap = health.get("swap_used_mb")
+        if swap is not None:
+            stats["swap_max"] = max(stats["swap_max"], swap)
+        zombies = health.get("zombie_processes")
+        if zombies is not None:
+            stats["zombies_max"] = max(stats["zombies_max"], zombies)
+
+    # A model without any RAM sample would otherwise report "inf" as its minimum.
+    for stats in model_stats.values():
+        if stats["ram_min"] == float("inf"):
+            stats["ram_min"] = 0
+
+    # Per-test statistics
+    test_stats = {}
+    for entry in passed_with_model:
+        # Extract test function name and normalize (remove parametrization)
+        test_full = entry["test"].split("::")[-1]
+        test_name = test_full.split("[")[0]  # Remove [discovered_XX] part
+
+        model_id = entry["model"]["id"]
+        model_short = model_id.replace("mlx-community/", "").split("-")[0]  # Short name
+
+        if test_name not in test_stats:
+            test_stats[test_name] = {
+                "name": test_name,
+                "models": set(),
+                "runs": [],
+            }
+
+        test_stats[test_name]["models"].add(model_id)
+        test_stats[test_name]["runs"].append({
+            "model": model_id,
+            "model_short": model_short,
+            "duration": entry["duration"],
+        })
+
+    # Calculate aggregates per test
+    for per_test in test_stats.values():
+        durations = [r["duration"] for r in per_test["runs"]]
+        per_test["model_count"] = len(per_test["models"])
+        per_test["median_time"] = statistics.median(durations) if durations else 0
+
+        # Find fastest and slowest
+        sorted_runs = sorted(per_test["runs"], key=lambda r: r["duration"])
+        per_test["fastest"] = sorted_runs[0] if sorted_runs else None
+        per_test["slowest"] = sorted_runs[-1] if sorted_runs else None
+
+        # Convert set to list for JSON serialization
+        per_test["models"] = list(per_test["models"])
+
+    # Hardware profile (from first entry, optional for backward compatibility)
+    hw_profile = {}
+    if data and "system" in data[0] and "hardware_profile" in data[0]["system"]:
+        hw_profile = data[0]["system"]["hardware_profile"]
+
+    return {
+        "total_tests": len(data),
+        "passed": len(passed_tests),
+        "passed_with_model": len(passed_with_model),
+        "passed_infrastructure": len(passed_without_model),
+        "skipped": len(skipped_tests),
+        "total_duration": sum(e["duration"] for e in passed_tests),
+        "schema_version": data[0]["schema_version"] if data else "unknown",
+        "mlx_knife_version": data[0]["mlx_knife_version"] if data else "unknown",
+        "swap": {
+            "min": min(swap_values) if swap_values else 0,
+            "max": max(swap_values) if swap_values else 0,
+            "avg": sum(swap_values) / len(swap_values) if swap_values else 0,
+        },
+        "ram": {
+            "min": min(ram_values) if ram_values else 0,
+            "max": max(ram_values) if ram_values else 0,
+            "avg": sum(ram_values) / len(ram_values) if ram_values else 0,
+        },
+        "zombies": {
+            "min": min(zombie_values) if zombie_values else 0,
+            "max": max(zombie_values) if zombie_values else 0,
+        },
+        "quality": {
+            "clean": clean_count,
+            "degraded_swap": degraded_swap,
+            "degraded_zombies": degraded_zombies,
+            "clean_percent": 100 * clean_count / len(data) if data else 0,
+        },
+        "hardware": hw_profile,
+        "models": model_stats,
+        "tests": test_stats,
+    }
+
+
+def _format_model_category(title: str, models: List[dict]) -> str:
+    """Render one size-category summary; averages are shown only when it has models."""
+    lines = [f"{title}: {len(models)} models"]
+    if models:
+        n = len(models)
+        lines.append(f" Avg size: {sum(m['size_gb'] for m in models) / n:.1f} GB")
+        lines.append(f" Avg test time: {sum(m['total_time']/m['count'] for m in models) / n:.1f}s")
+        lines.append(f" Avg min RAM: {sum(m['ram_min'] for m in models) / n:.1f} GB")
+    return "\n".join(lines) + "\n"
+
+
+def generate_markdown(stats: Dict, input_file: Path, compare_file: Optional[Path] = None, compare_stats: Optional[Dict] = None) -> str:
+    """Generate the Markdown report from statistics.
+
+    Args:
+        stats: Result of ``calculate_statistics()`` for the primary run.
+        input_file: Primary JSONL file; the part of its stem before "-v" is
+            used as the report date.
+        compare_file: Optional baseline file; only used for display.
+        compare_stats: Optional statistics of the baseline run. When given,
+            a comparison summary and Old/Δ/Change columns are added and models
+            are ordered by relative change (largest regression first).
+
+    Returns:
+        The complete report as a Markdown string.
+    """
+    version = stats["mlx_knife_version"]
+    date = input_file.stem.split("-v")[0]  # Extract date from filename
+    now = datetime.now(timezone.utc).isoformat()
+
+    # Baseline lookup by model id, shared by the summary and the per-model table
+    compare_models = {}
+    if compare_stats:
+        compare_models = {m['id']: m for m in compare_stats['models'].values()}
+
+    # Header
+    md = f"""# Benchmark Report v{TEMPLATE_VERSION}: {version}
+
+**Date:** {date}
+**Generated:** {now}
+**Generator:** generate_benchmark_report.py v{TEMPLATE_VERSION}
+**Hardware:** {stats['hardware'].get('model', 'unknown')}, {stats['hardware'].get('cores_physical', '?')} cores
+
+---
+
+## Input Files
+
+- **Primary:** `{input_file}`
+- **Schema:** v{stats['schema_version']}
+"""
+
+    if compare_file:
+        md += f"- **Comparison:** `{compare_file}`\n"
+
+    md += "\n---\n\n"
+
+    # Executive Summary
+    md += "## Executive Summary\n\n"
+    md += f"**Tests:** {stats['total_tests']} total ({stats['passed']} passed, {stats['skipped']} skipped)\n"
+    md += f"**Duration:** {stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f} min)\n"
+    md += f"**Quality:** {stats['quality']['clean_percent']:.1f}% clean ({stats['quality']['clean']}/{stats['total_tests']})\n"
+    md += f"**Models:** {len(stats['models'])} tested\n\n"
+
+    # Comparison Summary
+    if compare_stats:
+        old_duration = compare_stats['total_duration']
+        new_duration = stats['total_duration']
+        duration_delta = new_duration - old_duration
+        duration_pct = (duration_delta / old_duration * 100) if old_duration > 0 else 0
+
+        # Count models by change direction
+        slower_count = 0
+        faster_count = 0
+        for model in stats['models'].values():
+            old_model = compare_models.get(model['id'])
+            if old_model:
+                if model['total_time'] > old_model['total_time']:
+                    slower_count += 1
+                elif model['total_time'] < old_model['total_time']:
+                    faster_count += 1
+
+        total_compared = slower_count + faster_count
+        change_icon = "⚠️" if duration_pct > 3 else "✅" if duration_pct < -1 else "➡️"
+
+        md += "### Comparison\n\n"
+        # compare_stats may be supplied without a file; skip the name line then
+        if compare_file:
+            md += f"**vs:** `{compare_file.name}`\n"
+        md += f"**Duration:** {old_duration/60:.1f} min → {new_duration/60:.1f} min ({duration_pct:+.1f}%) {change_icon}\n"
+        if total_compared > 0:
+            md += f"**Models:** {slower_count}/{total_compared} slower ({100*slower_count/total_compared:.0f}%), {faster_count}/{total_compared} faster ({100*faster_count/total_compared:.0f}%)\n"
+        md += "\n"
+
+    # Validation Status
+    quality_icon = "✅" if stats['quality']['clean_percent'] == 100 else "⚠️"
+    md += f"{quality_icon} **System Health:** "
+    if stats['quality']['clean_percent'] == 100:
+        md += "All tests clean (0 MB swap, 0 zombies)\n"
+    else:
+        md += f"{stats['quality']['degraded_swap']} degraded (swap), {stats['quality']['degraded_zombies']} degraded (zombies)\n"
+
+    md += "\n---\n\n"
+
+    # Test Summary
+    md += "## Test Summary\n\n"
+    md += f"""```
+Total tests: {stats['total_tests']}
+Passed: {stats['passed']}
+ With model: {stats['passed_with_model']}
+ Infrastructure: {stats['passed_infrastructure']}
+Skipped: {stats['skipped']}
+Duration: {stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f} min)
+```
+
+---
+
+## System Health
+
+"""
+    md += f"""```
+Swap (MB): min={stats['swap']['min']}, max={stats['swap']['max']}, avg={stats['swap']['avg']:.1f}
+RAM free (GB): min={stats['ram']['min']:.1f}, max={stats['ram']['max']:.1f}, avg={stats['ram']['avg']:.1f}
+Zombies: min={stats['zombies']['min']}, max={stats['zombies']['max']}
+
+Quality Flags:
+ Clean: {stats['quality']['clean']}/{stats['total_tests']} ({stats['quality']['clean_percent']:.1f}%)
+ Degraded (swap): {stats['quality']['degraded_swap']}
+ Degraded (zombies): {stats['quality']['degraded_zombies']}
+```
+
+---
+
+## Per-Model Statistics
+
+"""
+
+    # Sort models by total time (descending), or by change if comparing
+    sorted_models = sorted(stats['models'].values(), key=lambda m: m['total_time'], reverse=True)
+
+    if compare_stats:
+        # Re-sort by change percentage (biggest regression first)
+        def get_change_pct(model):
+            old = compare_models.get(model['id'])
+            if old and old['total_time'] > 0:
+                return (model['total_time'] - old['total_time']) / old['total_time'] * 100
+            return 0
+        sorted_models = sorted(stats['models'].values(), key=get_change_pct, reverse=True)
+
+    if compare_stats:
+        md += f"""```
+{'Model':<42} {'Size':<7} {'Tests':<5} {'Time':<8} {'Old':<8} {'Δ':<8} {'Change':<10} {'RAM (GB)':<12}
+{'='*42} {'='*7} {'='*5} {'='*8} {'='*8} {'='*8} {'='*10} {'='*12}
+"""
+    else:
+        md += f"""```
+{'Model':<50} {'Size':<8} {'Tests':<6} {'Time':<10} {'RAM (GB)':<20}
+{'='*50} {'='*8} {'='*6} {'='*10} {'='*20}
+"""
+
+    for model in sorted_models:
+        # Shorten model ID (remove mlx-community/ prefix)
+        model_short = model['id'].replace('mlx-community/', '')
+        max_len = 40 if compare_stats else 48
+        if len(model_short) > max_len:
+            model_short = model_short[:max_len-3] + "..."
+
+        ram_range = f"{model['ram_min']:.1f}-{model['ram_max']:.1f}"
+
+        if compare_stats:
+            old_model = compare_models.get(model['id'])
+            if old_model:
+                old_time = old_model['total_time']
+                delta = model['total_time'] - old_time
+                change_pct = (delta / old_time * 100) if old_time > 0 else 0
+                # Status indicator
+                if change_pct > 5:
+                    status = "⚠️"
+                elif change_pct < -1:
+                    status = "✅"
+                else:
+                    status = ""
+                change_str = f"{change_pct:+.1f}% {status}"
+                md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {ram_range:<12}\n"
+            else:
+                md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {'N/A':<8} {'N/A':<8} {'NEW':<10} {ram_range:<12}\n"
+        else:
+            md += f"{model_short:<50} {model['size_gb']:>6.1f}GB {model['count']:<6} {model['total_time']:>8.1f}s {ram_range:<20}\n"
+
+    md += "```\n\n"
+
+    # Model Categories
+    large_models = [m for m in sorted_models if m['size_gb'] >= 20]
+    medium_models = [m for m in sorted_models if 10 <= m['size_gb'] < 20]
+    small_models = [m for m in sorted_models if m['size_gb'] < 10]
+
+    md += "### Model Categories\n\n"
+    # Render each category on its own, so a run with no models in one size
+    # class still reports the others instead of dropping the whole section.
+    if sorted_models:
+        categories = [
+            ("LARGE MODELS (≥20 GB)", large_models),
+            ("MEDIUM MODELS (10-20 GB)", medium_models),
+            ("SMALL MODELS (<10 GB)", small_models),
+        ]
+        md += "```\n"
+        md += "\n".join(_format_model_category(title, models) for title, models in categories)
+        md += "```\n"
+
+    md += "\n---\n\n"
+
+    # Per-Test Statistics
+    md += "## Per-Test Statistics\n\n"
+    md += "Shows performance range across models for each test.\n\n"
+
+    # Sort tests by model count (descending) - most representative tests first
+    sorted_tests = sorted(stats['tests'].values(), key=lambda t: t['model_count'], reverse=True)
+
+    # Build comparison lookup for tests
+    compare_tests = {}
+    if compare_stats:
+        compare_tests = {t['name']: t for t in compare_stats['tests'].values()}
+
+    if compare_stats:
+        md += f"""```
+{'Test Name':<40} {'Models':<7} {'Fastest':<20} {'Slowest':<20} {'Med':<6} {'Old':<6} {'Δ Med':<8}
+{'='*40} {'='*7} {'='*20} {'='*20} {'='*6} {'='*6} {'='*8}
+"""
+    else:
+        md += f"""```
+{'Test Name':<50} {'Models':<7} {'Fastest':<25} {'Slowest':<25} {'Med Time'}
+{'='*50} {'='*7} {'='*25} {'='*25} {'='*8}
+"""
+
+    for test in sorted_tests:
+        # Shorten test name if needed
+        max_test_len = 38 if compare_stats else 48
+        test_short = test['name']
+        if len(test_short) > max_test_len:
+            test_short = test_short[:max_test_len-3] + "..."
+
+        # Format fastest/slowest
+        fastest = test['fastest']
+        slowest = test['slowest']
+
+        # Placeholders keep the row printable if a test has no recorded runs
+        fastest_str = "N/A"
+        slowest_str = "N/A"
+        if fastest and slowest:
+            max_model_len = 18 if compare_stats else 23
+            fastest_str = f"{fastest['model_short']} ({fastest['duration']:.1f}s)"
+            slowest_str = f"{slowest['model_short']} ({slowest['duration']:.1f}s)"
+            if len(fastest_str) > max_model_len:
+                fastest_str = fastest_str[:max_model_len-3] + "..."
+            if len(slowest_str) > max_model_len:
+                slowest_str = slowest_str[:max_model_len-3] + "..."
+
+        med_time = test['median_time']
+
+        if compare_stats:
+            old_test = compare_tests.get(test['name'])
+            if old_test:
+                old_med = old_test['median_time']
+                delta_pct = ((med_time - old_med) / old_med * 100) if old_med > 0 else 0
+                delta_str = f"{delta_pct:+.1f}%"
+                md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} {slowest_str:<20} {med_time:<5.1f}s {old_med:<5.1f}s {delta_str:<8}\n"
+            else:
+                md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} {slowest_str:<20} {med_time:<5.1f}s {'N/A':<6} {'NEW':<8}\n"
+        else:
+            md += f"{test_short:<50} {test['model_count']:<7} {fastest_str:<25} {slowest_str:<25} {med_time:.1f}s\n"
+
+    md += "```\n\n"
+
+    md += "\n---\n\n"
+    md += "## Files\n\n"
+    md += f"- **Benchmark report:** `{input_file}`\n"
+    md += f"- **Schema:** `benchmarks/schemas/report-v{stats['schema_version']}.schema.json`\n"
+
+    return md
+
+
+def main():
+    """Command-line entry point: load, validate, analyse and write a report.
+
+    Uses the positional JSONL file, or the newest one in ``REPORTS_DIR`` when
+    none is given. Exits with status 1 when an input file is missing or
+    fails schema validation.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate benchmark analysis report from JSONL data",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument(
+        'input',
+        nargs='?',
+        type=Path,
+        help='JSONL benchmark file (default: latest in benchmarks/reports/)'
+    )
+    parser.add_argument(
+        '--compare',
+        type=Path,
+        help='Compare with this JSONL file (adds Old/Δ/Change columns)'
+    )
+    parser.add_argument(
+        '--output',
+        type=Path,
+        help='Output markdown file (default: auto-generated in benchmarks/reports/)'
+    )
+
+    args = parser.parse_args()
+
+    # Determine input file
+    if args.input:
+        input_file = args.input
+    else:
+        input_file = find_latest_jsonl()
+        if not input_file:
+            print("❌ No JSONL files found in benchmarks/reports/")
+            sys.exit(1)
+        print(f"📊 Auto-detected: {input_file}")
+
+    if not input_file.exists():
+        print(f"❌ File not found: {input_file}")
+        sys.exit(1)
+
+    # Load and validate
+    print(f"📋 Loading: {input_file}")
+    schema = load_schema()
+    data = load_jsonl(input_file)
+
+    print(f"✓ Loaded {len(data)} entries")
+
+    # Validate against schema
+    if not validate_jsonl(data, schema, input_file):
+        sys.exit(1)
+
+    print("✓ Schema validation passed")
+
+    # Calculate statistics
+    stats = calculate_statistics(data)
+
+    # Load and calculate comparison statistics if requested
+    compare_stats = None
+    if args.compare:
+        if not args.compare.exists():
+            print(f"❌ Comparison file not found: {args.compare}")
+            sys.exit(1)
+        print(f"📊 Comparing with: {args.compare}")
+        compare_data = load_jsonl(args.compare)
+        if not validate_jsonl(compare_data, schema, args.compare):
+            sys.exit(1)
+        compare_stats = calculate_statistics(compare_data)
+        print(f"✓ Loaded {len(compare_data)} comparison entries")
+
+    # Generate report
+    markdown = generate_markdown(stats, input_file, args.compare, compare_stats)
+
+    # Determine output file
+    if args.output:
+        output_file = args.output
+    else:
+        # Auto-generate: BENCHMARK-v{TEMPLATE_VERSION}-{version}-{date}.md
+        version = extract_version_from_filename(input_file) or stats["mlx_knife_version"]
+        date = input_file.stem.split("-v")[0]  # Extract date portion
+        output_file = REPORTS_DIR / f"BENCHMARK-v{TEMPLATE_VERSION}-{version}-{date}.md"
+
+    # Write output. The report contains emoji and other non-ASCII characters,
+    # so fix the encoding instead of relying on the locale default.
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_file, 'w', encoding="utf-8") as f:
+        f.write(markdown)
+
+    print(f"✅ Generated: {output_file}")
+    print()
+    print("Summary:")
+    print(f" Tests: {stats['passed']}/{stats['total_tests']} passed")
+    print(f" Duration: {stats['total_duration']/60:.1f} min")
+    print(f" Quality: {stats['quality']['clean_percent']:.1f}% clean")
+    print(f" Models: {len(stats['models'])}")
+
+
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md b/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md
new file mode 100644
index 0000000..87fb85b
--- /dev/null
+++ b/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md
@@ -0,0 +1,125 @@
+# Benchmark Report v1.0: 2.0.4b3
+
+**Date:** 2025-12-20
+**Generated:** 2025-12-20T14:43:01.786689+00:00
+**Generator:** generate_benchmark_report.py v1.0
+**Hardware:** Mac14,13, 12 cores
+
+---
+
+## Input Files
+
+- **Primary:** `benchmarks/reports/2025-12-20-v2.0.4b3-2nd_0.2.0_schema.jsonl`
+- **Schema:** v0.2.0
+
+---
+
+## Executive Summary
+
+**Tests:** 162 total (141 passed, 21 skipped)
+**Duration:** 1169.3s (19.5 min)
+**Quality:** 100.0% clean (162/162)
+**Models:** 22 tested
+
+✅ **System Health:** All tests clean (0 MB swap, 0 zombies)
+
+---
+
+## Test Summary
+
+```
+Total tests: 162
+Passed: 141
+ With model: 84
+ Infrastructure: 57
+Skipped: 21
+Duration: 1169.3s (19.5 min)
+```
+
+---
+
+## System Health
+
+```
+Swap (MB): min=0, max=0, avg=0.0
+RAM free (GB): min=0.0, max=46.7, avg=19.0
+Zombies: min=0, max=0
+
+Quality Flags:
+ Clean: 162/162 (100.0%)
+ Degraded (swap): 0
+ Degraded (zombies): 0
+```
+
+---
+
+## Per-Model Statistics
+
+```
+Model Size Tests Time RAM (GB)
+================================================== ======== ====== ========== ====================
+Mistral-Small-3.2-24B-Instruct-2506-8bit 23.3GB 4 102.2s 21.4-25.9
+Qwen3-Coder-30B-A3B-Instruct-6bit-DWQ-lr9e-8 24.9GB 4 97.5s 21.6-26.8
+Mixtral-8x7B-Instruct-v0.1-4bit 24.5GB 4 96.9s 2.7-26.4
+DeepHermes-3-Mistral-24B-Preview-8bit 23.3GB 4 63.0s 0.0-24.6
+OpenCodeInterpreter-DS-33B-hf-4bit-mlx 17.8GB 4 62.9s 17.9-33.0
+Qwen3-32B-4bit 17.2GB 4 48.7s 17.1-20.3
+Klear-46B-A2.5B-Instruct-3bit 18.9GB 4 40.7s 18.9-19.9
+MiMo-VL-7B-RL-bf16 15.5GB 4 38.9s 14.6-19.7
+gpt-oss-20b-MXFP4-Q8 11.3GB 4 36.6s 14.2-36.4
+Qwen3-30B-A3B-Instruct-2507-4bit 16.0GB 4 34.2s 16.2-23.7
+Qwen3-Coder-30B-A3B-Instruct-4bit 16.0GB 4 33.1s 16.3-17.1
+Mistral-Small-3.2-24B-Instruct-2506-4bit 12.4GB 4 32.9s 13.0-16.9
+Mistral-Small-Instruct-2409-4bit 11.7GB 4 27.6s 12.9-26.2
+Qwen2.5-Coder-7B-Instruct-8bit 7.5GB 4 19.9s 8.5-31.2
+DeepSeek-R1-Distill-Llama-8B-4bit 4.2GB 4 19.7s 20.2-37.6
+pixtral-12b-8bit 12.6GB 2 15.5s 14.3-14.4
+Mistral-7B-Instruct-v0.2-4bit 4.0GB 4 14.1s 8.9-26.2
+Gabliterated-Qwen3-0.6B-float32 2.2GB 4 12.7s 16.1-37.3
+Phi-3-mini-4k-instruct-4bit 2.0GB 4 11.5s 14.6-46.7
+Phi-3.5-mini-instruct-4bit 2.0GB 4 10.2s 12.6-44.6
+Qwen2.5-0.5B-Instruct-4bit 0.3GB 4 9.2s 13.8-46.0
+Llama-3.2-11B-Vision-Instruct-4bit 5.6GB 2 8.9s 10.3-12.1
+```
+
+### Model Categories
+
+```
+LARGE MODELS (≥20 GB): 4 models
+ Avg size: 24.0 GB
+ Avg test time: 22.5s
+ Avg min RAM: 11.5 GB
+
+MEDIUM MODELS (10-20 GB): 10 models
+ Avg size: 14.9 GB
+ Avg test time: 9.7s
+ Avg min RAM: 15.6 GB
+
+SMALL MODELS (<10 GB): 8 models
+ Avg size: 3.5 GB
+ Avg test time: 3.6s
+ Avg min RAM: 13.1 GB
+```
+
+---
+
+## Per-Test Statistics
+
+Shows performance range across models for each test.
+
+```
+Test Name Models Fastest Slowest Med Time
+================================================== ======= ========================= ========================= ========
+test_run_command 22 Qwen2.5 (1.2s) DeepHermes (22.1s) 7.1s
+test_run_json_output 22 Qwen2.5 (1.2s) Mistral (13.3s) 7.1s
+test_chat_completions_batch 20 Phi (3.3s) Mixtral (30.9s) 8.7s
+test_chat_completions_streaming 20 Qwen2.5 (3.4s) Qwen3 (51.3s) 10.6s
+```
+
+
+---
+
+## Files
+
+- **Benchmark report:** `benchmarks/reports/2025-12-20-v2.0.4b3-2nd_0.2.0_schema.jsonl`
+- **Schema:** `benchmarks/schemas/report-v0.2.0.schema.json`
diff --git a/benchmarks/tools/memmon.py b/benchmarks/tools/memmon.py
new file mode 100644
index 0000000..bf57c3e
--- /dev/null
+++ b/benchmarks/tools/memmon.py
@@ -0,0 +1,371 @@
+#!/usr/bin/env python3
+"""Memory Monitor - Standalone tool for tracking memory during subprocess execution.
+
+Samples RAM, swap, and memory pressure while running any command.
+Outputs JSONL with per-sample data and final summary.
+
+Usage:
+ # Basic usage
+ python benchmarks/tools/memmon.py -- pytest -m live_e2e tests_2.0/live/
+
+ # With options
+ python benchmarks/tools/memmon.py --interval 200 --output memory.jsonl -- pytest -v
+
+ # Just monitor (no subprocess)
+ python benchmarks/tools/memmon.py --duration 60 --output memory.jsonl
+
+Future: Will be part of mlxk-benchmark kit.
+"""
+
+import argparse
+import json
+import subprocess
+import sys
+import threading
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+
def _read_memory_pressure() -> int:
    """Return the macOS memory-pressure level reported by ``sysctl``.

    The value is read from ``kern.memorystatus_vm_pressure_level``:
    1 = NORMAL (green), 2 = WARN (yellow), 4 = CRITICAL (red).

    Returns:
        The reported level, or 1 (NORMAL) when the value cannot be read
        (``sysctl`` missing, key unknown, timeout, non-numeric output).
    """
    try:
        result = subprocess.run(
            ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"],
            capture_output=True, text=True, timeout=1,
        )
        return int(result.stdout.strip())
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best effort: the monitor must keep sampling even without this value.
        return 1


def _parse_swap_used_mb(text: str) -> float:
    """Extract the ``used`` figure (in MB) from ``sysctl -n vm.swapusage`` output.

    Expected shape: ``total = 0.00M  used = 0.00M  free = 0.00M  (encrypted)``.
    Returns 0.0 when no ``used = <n>M`` triple is present.
    """
    tokens = text.split()
    for index, token in enumerate(tokens):
        # The value sits two tokens after the label: "used", "=", "<n>M".
        if token == "used" and index + 2 < len(tokens):
            return float(tokens[index + 2].rstrip("M"))
    return 0.0


def _parse_vm_stat_available_bytes(text: str) -> int:
    """Compute available bytes from ``vm_stat`` output.

    Available = (free + inactive + purgeable + speculative) pages * page size.
    The page size is taken from the header line when present and otherwise
    defaults to 16384 bytes (the Apple Silicon page size).
    """
    page_size = 16384
    page_counts = {
        "Pages free:": 0,
        "Pages inactive:": 0,
        "Pages purgeable:": 0,
        "Pages speculative:": 0,
    }
    for line in text.splitlines():
        if "page size of" in line:
            # "... (page size of 16384 bytes)" -> second-to-last token.
            page_size = int(line.split()[-2])
            continue
        for label in page_counts:
            if label in line:
                # Counters end with a trailing period, e.g. "3561.".
                page_counts[label] = int(line.split()[-1].rstrip("."))
                break
    return sum(page_counts.values()) * page_size


def get_memory_sample() -> dict:
    """Get the current memory state, preferring psutil.

    Returns:
        Dict with ``ram_free_gb``, ``ram_used_gb``, ``ram_percent``,
        ``swap_used_mb``, ``swap_percent`` and ``memory_pressure``.
        Falls back to :func:`get_memory_sample_native` when psutil is not
        installed.
    """
    try:
        import psutil
    except ImportError:
        # Fallback without psutil
        return get_memory_sample_native()

    vm = psutil.virtual_memory()
    swap = psutil.swap_memory()
    return {
        "ram_free_gb": round(vm.available / 1e9, 2),
        "ram_used_gb": round(vm.used / 1e9, 2),
        "ram_percent": vm.percent,
        "swap_used_mb": round(swap.used / 1e6, 1),
        "swap_percent": swap.percent,
        "memory_pressure": _read_memory_pressure(),
    }


def get_memory_sample_native() -> dict:
    """Get memory state using native macOS commands (no psutil).

    Every probe is best effort: a probe that fails leaves its field at the
    neutral default (0 for RAM/swap, 1 for memory pressure).

    Returns:
        Dict with the same keys as :func:`get_memory_sample`.
        ``ram_used_gb``, ``ram_percent`` and ``swap_percent`` are always 0
        because they are not derived by this code path.
    """
    # Swap usage via sysctl.
    swap_mb = 0.0
    try:
        result = subprocess.run(
            ["sysctl", "-n", "vm.swapusage"],
            capture_output=True, text=True, timeout=1,
        )
        swap_mb = _parse_swap_used_mb(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError):
        pass  # keep default

    # Available RAM via vm_stat.
    ram_free_gb = 0
    try:
        result = subprocess.run(
            ["vm_stat"],
            capture_output=True, text=True, timeout=1,
        )
        available_bytes = _parse_vm_stat_available_bytes(result.stdout)
        ram_free_gb = round(available_bytes / 1e9, 2)
    except (OSError, subprocess.SubprocessError, ValueError):
        pass  # keep default

    return {
        "ram_free_gb": ram_free_gb,
        "ram_used_gb": 0,  # Not available without psutil
        "ram_percent": 0,
        "swap_used_mb": swap_mb,
        "swap_percent": 0,
        "memory_pressure": _read_memory_pressure(),
    }
+
+
class MemoryMonitor:
    """Background memory sampler.

    A daemon thread calls the sampler once per interval and stores each
    returned dict (augmented with ``ts`` and ``elapsed_s``) in ``samples``.

    Usage:
        monitor = MemoryMonitor(interval_ms=200)
        monitor.start()
        # ... do work ...
        summary = monitor.stop()
    """

    def __init__(self, interval_ms: int = 200, sampler=None):
        """Create a monitor.

        Args:
            interval_ms: Sampling interval in milliseconds; must be > 0.
            sampler: Optional zero-argument callable returning a dict with at
                least ``ram_free_gb`` and ``swap_used_mb``. Defaults to the
                module-level ``get_memory_sample``.

        Raises:
            ValueError: If ``interval_ms`` is not positive.
        """
        if interval_ms <= 0:
            raise ValueError(f"interval_ms must be positive, got {interval_ms}")
        self.interval = interval_ms / 1000
        self.samples: list[dict] = []
        self.running = False
        self.thread: Optional[threading.Thread] = None
        self.start_time: float = 0
        self._sampler = sampler
        self._stop_event = threading.Event()

    def start(self):
        """Start background sampling (no-op if already running)."""
        if self.running:
            return
        self.running = True
        self.samples = []
        # A fresh event per run: a previous thread that is still winding down
        # keeps its own (already set) event and cannot be revived by this run.
        self._stop_event = threading.Event()
        self.start_time = time.time()
        self.thread = threading.Thread(
            target=self._sample_loop,
            args=(self._stop_event,),
            daemon=True,
        )
        self.thread.start()

    def stop(self) -> dict:
        """Stop sampling and return summary.

        Returns:
            ``{"error": "No samples collected"}`` when nothing was sampled,
            otherwise a dict with ``duration_s``, ``samples``, ``interval_ms``,
            ``ram_free_min_gb``, ``ram_free_max_gb``, ``ram_free_avg_gb``,
            ``swap_max_mb`` and ``swap_avg_mb``.
        """
        self.running = False
        self._stop_event.set()  # wakes the sampler immediately
        if self.thread:
            self.thread.join(timeout=1.0)

        # Work on a snapshot: the thread may still append if the join timed out.
        collected = list(self.samples)
        if not collected:
            return {"error": "No samples collected"}

        ram_values = [s["ram_free_gb"] for s in collected]
        swap_values = [s["swap_used_mb"] for s in collected]

        return {
            "duration_s": round(time.time() - self.start_time, 2),
            "samples": len(collected),
            # round() rather than int(): 0.057 * 1000 is 56.999... in floats.
            "interval_ms": round(self.interval * 1000),
            "ram_free_min_gb": min(ram_values),
            "ram_free_max_gb": max(ram_values),
            "ram_free_avg_gb": round(sum(ram_values) / len(ram_values), 2),
            "swap_max_mb": max(swap_values),
            "swap_avg_mb": round(sum(swap_values) / len(swap_values), 1),
        }

    def get_samples(self) -> list[dict]:
        """Get all collected samples (shallow copy of the list)."""
        return self.samples.copy()

    def _sample_loop(self, stop_event=None):
        """Background sampling loop.

        Takes one sample immediately, then one per interval until the stop
        event is set, so a started monitor always records at least one sample.
        """
        if stop_event is None:
            stop_event = self._stop_event
        # Resolve the default lazily so a custom sampler never needs it.
        sampler = self._sampler or get_memory_sample
        while True:
            sample = sampler()
            now = time.time()
            sample["ts"] = round(now, 3)
            sample["elapsed_s"] = round(now - self.start_time, 2)
            self.samples.append(sample)
            # Event.wait doubles as an interruptible sleep.
            if stop_event.wait(self.interval):
                break
+
+
def _print_new_samples(monitor, already_printed: int) -> int:
    """Print samples collected since the last call and return the new total."""
    collected = monitor.get_samples()
    for sample in collected[already_printed:]:
        print(
            f"  [{sample.get('elapsed_s', 0):8.2f}s] "
            f"RAM free: {sample.get('ram_free_gb', 0):.2f} GB | "
            f"swap: {sample.get('swap_used_mb', 0):.1f} MB | "
            f"pressure: {sample.get('memory_pressure', 1)}"
        )
    return len(collected)


def run_with_monitoring(
    command: list[str],
    interval_ms: int = 200,
    output_file: Optional[Path] = None,
    verbose: bool = False
) -> dict:
    """Run a command while monitoring memory.

    Args:
        command: Command and arguments to run
        interval_ms: Sampling interval in milliseconds
        output_file: Optional JSONL output file (one sample per line, then a
            final ``{"summary": ...}`` line)
        verbose: Print samples as they're collected

    Returns:
        Summary dict with memory statistics, plus ``exit_code`` (130 when
        interrupted, 1 when the command could not be run), ``command`` and a
        UTC ``timestamp``. When no sample was collected, the statistics are
        replaced by an ``error`` key.
    """
    import shlex  # local import: only needed to render the command line

    monitor = MemoryMonitor(interval_ms=interval_ms)
    command_str = shlex.join(command)  # quotes arguments containing spaces

    print(f"Starting memory monitor (interval: {interval_ms}ms)")
    print(f"Running: {command_str}")
    print("-" * 60)

    monitor.start()

    # Run subprocess. In verbose mode, wake up once per interval to print the
    # samples gathered so far; otherwise block until the child exits.
    child = None
    printed = 0
    poll_timeout = max(interval_ms, 1) / 1000 if verbose else None
    try:
        child = subprocess.Popen(command)
        while True:
            try:
                exit_code = child.wait(timeout=poll_timeout)
                break
            except subprocess.TimeoutExpired:
                printed = _print_new_samples(monitor, printed)
    except KeyboardInterrupt:
        exit_code = 130
        print("\nInterrupted")
    except Exception as e:
        exit_code = 1
        print(f"\nError: {e}")
    finally:
        # Never leave the child running after we stop monitoring it.
        if child is not None and child.poll() is None:
            child.kill()
            child.wait()

    summary = monitor.stop()
    if verbose:
        _print_new_samples(monitor, printed)
    summary["exit_code"] = exit_code
    summary["command"] = command_str
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    print("-" * 60)
    print("Memory Monitor Summary:")
    if "error" in summary:
        # stop() returned no statistics; the stat keys below would KeyError.
        print(f"  {summary['error']}")
    else:
        print(f"  Duration: {summary['duration_s']:.1f}s ({summary['samples']} samples)")
        print(f"  RAM free: {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
        print(f"  Swap peak: {summary['swap_max_mb']:.1f} MB")
    print(f"  Exit code: {exit_code}")

    # Write output
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            # Write samples
            for sample in monitor.get_samples():
                f.write(json.dumps(sample) + "\n")
            # Write summary as last line
            f.write(json.dumps({"summary": summary}) + "\n")
        print(f"  Output: {output_file}")

    return summary
+
+
def monitor_only(
    duration_s: float,
    interval_ms: int = 200,
    output_file: Optional[Path] = None
) -> dict:
    """Monitor memory for a fixed duration (no subprocess).

    Args:
        duration_s: How long to monitor, in seconds (must not be negative)
        interval_ms: Sampling interval in milliseconds
        output_file: Optional JSONL output file (one sample per line, then a
            final ``{"summary": ...}`` line)

    Returns:
        Summary dict with memory statistics and a UTC ``timestamp``. When no
        sample was collected, the statistics are replaced by an ``error`` key.

    Raises:
        ValueError: If ``duration_s`` is negative.
    """
    if duration_s < 0:
        # Fail before starting the sampler instead of inside time.sleep().
        raise ValueError(f"duration_s must not be negative, got {duration_s}")

    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Monitoring memory for {duration_s}s (interval: {interval_ms}ms)")
    print("-" * 60)

    monitor.start()

    try:
        time.sleep(duration_s)
    except KeyboardInterrupt:
        print("\nInterrupted")

    summary = monitor.stop()
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    print("-" * 60)
    print("Memory Monitor Summary:")
    if "error" in summary:
        # stop() returned no statistics; the stat keys below would KeyError.
        print(f"  {summary['error']}")
    else:
        print(f"  Duration: {summary['duration_s']:.1f}s ({summary['samples']} samples)")
        print(f"  RAM free: {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
        print(f"  Swap peak: {summary['swap_max_mb']:.1f} MB")

    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            for sample in monitor.get_samples():
                f.write(json.dumps(sample) + "\n")
            f.write(json.dumps({"summary": summary}) + "\n")
        print(f"  Output: {output_file}")

    return summary
+
+
def main():
    """Command-line entry point for the memory monitor.

    Modes:
        * ``--duration N``: sample for N seconds without running a command.
        * ``-- <command ...>``: run the command while sampling, then exit with
          the command's exit code.
        * neither: print help plus examples and exit with status 1.

    ``--duration`` takes precedence when a command is also given.
    """
    parser = argparse.ArgumentParser(
        description="Monitor memory while running a command",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "--interval", "-i",
        type=int,
        default=200,
        help="Sampling interval in milliseconds (default: 200)"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output JSONL file for samples and summary"
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        help="Monitor for fixed duration (seconds), no subprocess"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print samples as they're collected"
    )
    parser.add_argument(
        "command",
        nargs="*",
        help="Command to run (after --)"
    )

    args = parser.parse_args()

    # Reject values that would make the sampler spin or crash in its thread.
    if args.interval <= 0:
        parser.error("--interval must be a positive number of milliseconds")
    if args.duration is not None and args.duration <= 0:
        parser.error("--duration must be a positive number of seconds")

    if args.duration is not None:
        # Monitor-only mode
        monitor_only(
            duration_s=args.duration,
            interval_ms=args.interval,
            output_file=args.output
        )
    elif args.command:
        # Run command with monitoring
        summary = run_with_monitoring(
            command=args.command,
            interval_ms=args.interval,
            output_file=args.output,
            verbose=args.verbose
        )
        # Propagate the monitored command's exit status to our caller.
        sys.exit(summary.get("exit_code", 0))
    else:
        parser.print_help()
        print("\nExamples:")
        print("  python benchmarks/tools/memmon.py -- pytest -m live_e2e")
        print("  python benchmarks/tools/memmon.py --duration 10 --output mem.jsonl")
        sys.exit(1)


if __name__ == "__main__":
    main()
diff --git a/benchmarks/tools/memplot.py b/benchmarks/tools/memplot.py
new file mode 100644
index 0000000..3da6883
--- /dev/null
+++ b/benchmarks/tools/memplot.py
@@ -0,0 +1,522 @@
+#!/usr/bin/env python3
+"""Memory Timeline Visualization - Generate interactive HTML charts from benchmark data.
+
+Correlates memory samples (memmon.py) with test results to show RAM/swap usage
+over time with model markers.
+
+Usage:
+ # Basic usage
+ python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl
+
+ # Custom output
+ python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl -o report.html
+
+ # PNG export (requires kaleido)
+ python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl --format png
+
+Requires: plotly (pip install plotly)
+Optional: kaleido (pip install kaleido) for PNG export
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+
def parse_memory_samples(path: Path) -> tuple[list[dict], dict]:
    """Parse memmon JSONL output.

    Each non-blank line must be a JSON value. Objects containing a
    ``summary`` key provide the summary (the last one wins); every other
    object is a sample. Lines that are valid JSON but not objects are skipped.

    Args:
        path: Path to the JSONL file.

    Returns:
        Tuple of (samples list, summary dict). The summary is ``{}`` when the
        file has no summary line.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON; the message is
            prefixed with ``<path>:<line number>``.
    """
    samples: list[dict] = []
    summary: dict = {}

    with open(path, encoding="utf-8") as f:
        for lineno, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as exc:
                # Same exception type, but tell the user where the bad line is.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            if not isinstance(entry, dict):
                continue
            if "summary" in entry:
                summary = entry["summary"]
            else:
                samples.append(entry)

    return samples, summary
+
+
def parse_benchmark_results(path: Path) -> tuple[list[dict], list[dict]]:
    """Parse benchmark JSONL output.

    Only entries that have both ``timestamp`` and ``duration`` and whose
    ``outcome`` is ``"passed"`` are kept. A kept entry counts as a model test
    when its ``model`` value is present and non-empty; entries with a missing,
    ``null`` or empty ``model`` are treated as infrastructure tests.

    Args:
        path: Path to the JSONL file.

    Returns:
        Tuple of (tests with models, tests without models)

    Raises:
        json.JSONDecodeError: If a line is not valid JSON; the message is
            prefixed with ``<path>:<line number>``.
    """
    tests_with_model: list[dict] = []
    tests_without_model: list[dict] = []

    with open(path, encoding="utf-8") as f:
        for lineno, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as exc:
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            if not isinstance(entry, dict):
                continue
            if "timestamp" not in entry or "duration" not in entry:
                continue
            if entry.get("outcome") != "passed":
                continue

            # A null/empty model carries no id to label, so it is not a
            # model test for plotting purposes.
            if entry.get("model"):
                tests_with_model.append(entry)
            else:
                tests_without_model.append(entry)

    return tests_with_model, tests_without_model
+
+
def parse_iso_timestamp(ts_str: str) -> float:
    """Convert an ISO-8601 timestamp string to a Unix timestamp (seconds).

    A trailing ``Z``/``z`` (UTC designator) is rewritten to ``+00:00`` because
    ``datetime.fromisoformat`` does not accept it on older Python versions.
    Surrounding whitespace is ignored.

    Note: a timestamp without any UTC offset is interpreted in the local
    timezone, which is how ``datetime.timestamp()`` treats naive values.

    Raises:
        ValueError: If the string is not a valid ISO-8601 timestamp.
    """
    text = ts_str.strip()
    # Handle timezone suffix
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    return datetime.fromisoformat(text).timestamp()
+
+
def correlate_tests_with_timeline(
    samples: list[dict],
    tests: list[dict],
    memory_start_ts: float
) -> list[dict]:
    """Calculate test time ranges relative to memory timeline.

    Each test's ``timestamp`` is treated as the moment the test ended, so the
    start is ``timestamp - duration``. Both are converted to seconds elapsed
    since ``memory_start_ts``.

    Args:
        samples: Memory samples; only checked for being non-empty.
        tests: Benchmark entries. Entries lacking ``timestamp`` or
            ``duration`` are skipped.
        memory_start_ts: Unix timestamp of the first memory sample.

    Returns:
        List of dicts with ``model_id``, ``model_short``, ``start_elapsed``,
        ``end_elapsed``, ``duration`` and ``test``. ``model_id`` and
        ``model_short`` are ``None`` for tests without a usable model id.
    """
    if not samples or not tests:
        return []

    markers = []

    for test in tests:
        if "timestamp" not in test or "duration" not in test:
            continue

        test_end_ts = parse_iso_timestamp(test["timestamp"])
        duration = test["duration"]
        test_start_ts = test_end_ts - duration

        # The model may be an object with an "id", a bare id string, or
        # absent/null; only the first two yield a label.
        model = test.get("model")
        if isinstance(model, dict):
            model_id = model.get("id")
        elif isinstance(model, str):
            model_id = model or None
        else:
            model_id = None

        # Short label: last path component, capped at 20 characters.
        model_short = model_id.split("/")[-1][:20] if isinstance(model_id, str) else None

        markers.append({
            "model_id": model_id,
            "model_short": model_short,
            # Convert to elapsed time relative to memory monitoring start
            "start_elapsed": test_start_ts - memory_start_ts,
            "end_elapsed": test_end_ts - memory_start_ts,
            "duration": duration,
            "test": test.get("test", ""),
        })

    return markers
+
+
def get_ram_color(
    ram_free_gb: float,
    healthy_gb: float = 32,
    warning_gb: float = 16,
) -> str:
    """Get the marker color for a RAM-availability reading.

    Args:
        ram_free_gb: Free RAM in GB.
        healthy_gb: Readings at or above this value are green. The default of
            32 matches the chart's 50%-of-64-GB threshold line.
        warning_gb: Readings at or above this value (but below ``healthy_gb``)
            are orange; anything lower is red. The default of 16 matches the
            chart's 25%-of-64-GB threshold line.

    Returns:
        An ``rgb(...)`` color string.
    """
    if ram_free_gb >= healthy_gb:
        return "rgb(52, 199, 89)"  # Green - healthy
    if ram_free_gb >= warning_gb:
        return "rgb(255, 149, 0)"  # Orange - warning
    return "rgb(255, 59, 48)"  # Red - critical
+
+
def create_timeline_chart(
    samples: list[dict],
    summary: dict,
    model_markers: list[dict],
    infra_markers: list[dict],
    title: str = "Memory Timeline"
) -> "Figure":
    """Create interactive plotly timeline chart.

    The figure shows free RAM over time (left axis) and, when any swap was
    used, swap usage (right axis). Behind the traces it draws shaded regions:
    gray for model tests, light blue for infrastructure tests, yellow/red for
    elevated memory pressure. Model names are annotated at the top whenever
    the model changes; test names are annotated at the bottom.

    Args:
        samples: Memory samples with ``elapsed_s``, ``ram_free_gb``,
            ``swap_used_mb`` and optionally ``memory_pressure``.
        summary: Summary dict from the memory file; may be empty.
        model_markers: Markers for tests that ran against a model.
        infra_markers: Markers for tests without a model.
        title: Chart title.

    Returns:
        The plotly figure. Exits the process with status 1 if plotly is not
        installed.
    """
    try:
        import plotly.graph_objects as go
        from plotly.subplots import make_subplots
    except ImportError:
        print("Error: plotly not installed. Run: pip install plotly")
        sys.exit(1)

    # Reference machine for the threshold lines - could be made configurable later
    total_ram = 64  # GB
    healthy_gb = 32
    warning_gb = 16

    # Extract data series (time axis in minutes for readability)
    elapsed_min = [s["elapsed_s"] / 60 for s in samples]
    ram_free = [s["ram_free_gb"] for s in samples]
    swap_used = [s["swap_used_mb"] for s in samples]
    memory_pressure = [s.get("memory_pressure", 1) for s in samples]  # Default: 1=NORMAL

    max_elapsed_min = max(elapsed_min) if elapsed_min else 20
    has_swap = any(value > 0 for value in swap_used)
    # 70 suits a 64 GB system; grow the axis (and the shaded regions) when the
    # machine reports more free RAM than that, so the trace is never clipped.
    y_max = max(70, max(ram_free, default=0) * 1.05)

    def shaded_rect(x0: float, x1: float, fillcolor: str) -> dict:
        """Full-height background rectangle in data coordinates."""
        # yref="y" (not "paper") for rangeslider compatibility.
        return dict(
            type="rect",
            xref="x", yref="y",
            x0=x0, x1=x1,
            y0=0, y1=y_max,
            fillcolor=fillcolor,
            layer="below",
            line=dict(width=0),
        )

    def starts_in_view(marker: dict) -> bool:
        """True when the marker starts inside the sampled time window."""
        return 0 <= marker["start_elapsed"] / 60 <= max_elapsed_min

    # Create figure with secondary y-axis for swap
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # RAM trace: blue line, each point colored by its RAM level
    fig.add_trace(
        go.Scatter(
            x=elapsed_min,
            y=ram_free,
            mode="lines+markers",
            name="RAM Free (GB)",
            line=dict(color="rgb(52, 150, 235)", width=1.5),
            marker=dict(
                color=[get_ram_color(value) for value in ram_free],
                size=3,
                line=dict(width=0),
            ),
            hovertemplate="Time: %{x:.1f} min<br>RAM Free: %{y:.1f} GB",
        ),
        secondary_y=False,
    )

    # Threshold lines (assuming 64 GB total RAM)
    threshold_lines = (
        (healthy_gb, "green", f"32 GB (50% of {total_ram} GB - healthy)"),
        (warning_gb, "orange", f"16 GB (25% of {total_ram} GB - warning)"),
    )
    for level_gb, line_color, label in threshold_lines:
        fig.add_trace(
            go.Scatter(
                x=[0, max_elapsed_min],
                y=[level_gb, level_gb],
                mode="lines",
                name=label,
                line=dict(color=line_color, width=1, dash="dash"),
                hoverinfo="skip",
            ),
            secondary_y=False,
        )

    # Swap trace (secondary y-axis), only when swap was actually used
    if has_swap:
        fig.add_trace(
            go.Scatter(
                x=elapsed_min,
                y=swap_used,
                mode="lines",
                name="Swap Used (MB)",
                line=dict(color="red", width=2),
                hovertemplate="Time: %{x:.1f} min<br>Swap: %{y:.0f} MB",
            ),
            secondary_y=True,
        )

    # Model test regions (gray background for each test with model)
    model_markers_sorted = sorted(model_markers, key=lambda m: m["start_elapsed"])
    region_shapes = []
    prev_model_id = None  # Track previous model for switch detection

    for marker in model_markers_sorted:
        if not starts_in_view(marker):
            continue
        start_min = marker["start_elapsed"] / 60
        end_min = marker["end_elapsed"] / 60

        region_shapes.append(shaded_rect(start_min, end_min, "rgba(200, 200, 200, 0.3)"))

        # Add model label when model CHANGES (not just first occurrence)
        model_id = marker["model_id"]
        if model_id != prev_model_id:
            fig.add_annotation(
                x=start_min,
                y=1.0,
                xref="x", yref="paper",
                text=marker["model_short"],
                textangle=-90,
                font=dict(size=9, color="rgba(0, 0, 0, 0.7)"),
                showarrow=False,
                xanchor="left",
                yanchor="top",
                xshift=2,
            )
            prev_model_id = model_id

    # Infrastructure test regions (very light blue background)
    infra_markers_sorted = sorted(infra_markers, key=lambda m: m["start_elapsed"])

    for marker in infra_markers_sorted:
        if not starts_in_view(marker):
            continue
        region_shapes.append(shaded_rect(
            marker["start_elapsed"] / 60,
            marker["end_elapsed"] / 60,
            "rgba(173, 216, 230, 0.2)",
        ))

    # Test markers (thin vertical lines) and labels at bottom for both marker types
    all_markers_sorted = sorted(
        model_markers_sorted + infra_markers_sorted,
        key=lambda m: m["start_elapsed"],
    )

    for marker in all_markers_sorted:
        if not starts_in_view(marker):
            continue
        start_min = marker["start_elapsed"] / 60

        # Extract test name: drop the module path and any [param] suffix
        test_name = marker["test"].split("::")[-1].split("[")[0]
        if len(test_name) > 25:
            test_name = test_name[:22] + "..."

        fig.add_vline(
            x=start_min,
            line=dict(color="rgba(128, 128, 128, 0.2)", width=0.5),
        )

        # Test label at bottom, aligned with the start time like model labels
        fig.add_annotation(
            x=start_min,
            y=0.0,
            xref="x", yref="paper",
            text=test_name,
            textangle=-90,
            font=dict(size=9, color="rgba(0, 0, 0, 0.6)"),
            showarrow=False,
            xanchor="left",
            yanchor="bottom",
            xshift=2,
        )

    # Memory pressure backgrounds (1=normal/none, 2=warn/yellow, 4=critical/red)
    pressure_shapes = []
    sample_count = len(memory_pressure)
    run_start = 0
    while run_start < sample_count:
        level = memory_pressure[run_start]
        # Find the end of this run of identical pressure levels
        run_end = run_start + 1
        while run_end < sample_count and memory_pressure[run_end] == level:
            run_end += 1

        if level > 1:  # 2=WARN or 4=CRITICAL
            # The level holds until the next sample reports something else, so
            # extend the region to that sample. Otherwise a one-sample spike
            # would be a zero-width (invisible) rectangle.
            end_index = run_end if run_end < sample_count else run_end - 1
            if level == 2:
                fill = "rgba(255, 204, 0, 0.15)"  # Yellow (WARN)
            else:
                fill = "rgba(255, 59, 48, 0.15)"  # Red (CRITICAL)
            pressure_shapes.append(
                shaded_rect(elapsed_min[run_start], elapsed_min[end_index], fill)
            )
        run_start = run_end

    # Combine all shapes (test regions first, then pressure regions)
    shapes = region_shapes + pressure_shapes

    print(f"  Test shapes (gray): {len(region_shapes)}")
    print(f"  Pressure shapes (yellow/red): {len(pressure_shapes)}")
    print(f"  Total shapes: {len(shapes)}")

    # Layout (without shapes - they are added individually below)
    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=16),
        ),
        xaxis=dict(
            title="Time (minutes)",
            showgrid=True,
            gridcolor="rgba(128,128,128,0.2)",
            rangeslider=dict(visible=True, yaxis=dict(rangemode="match")),
        ),
        yaxis=dict(
            title="RAM Free (GB)",
            showgrid=True,
            gridcolor="rgba(128,128,128,0.2)",
            range=[0, y_max],
        ),
        yaxis2=dict(
            title="Swap Used (MB)",
            showgrid=False,
            range=[0, max(swap_used) * 1.2] if has_swap else [0, 100],
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
        hovermode="x unified",
        template="plotly_white",
        plot_bgcolor="rgba(0,0,0,0)",  # Transparent plot background so shapes show through
        height=500,
        margin=dict(t=80, b=60, l=60, r=60),
    )

    # Add shapes individually using fig.add_shape()
    for shape in shapes:
        fig.add_shape(**shape)

    # Add summary annotation below the plot
    if summary:
        summary_text = (
            f"Duration: {summary.get('duration_s', 0)/60:.1f} min | "
            f"Samples: {summary.get('samples', 0)} | "
            f"RAM: {summary.get('ram_free_min_gb', 0):.1f}-{summary.get('ram_free_max_gb', 0):.1f} GB | "
            f"Swap peak: {summary.get('swap_max_mb', 0):.0f} MB"
        )
        fig.add_annotation(
            text=summary_text,
            xref="paper", yref="paper",
            x=0, y=-0.12,
            showarrow=False,
            font=dict(size=10, color="gray"),
            align="left",
        )

    return fig
+
+
def main():
    """Command-line entry point: read the inputs, build the chart, export it.

    Exits with an argparse usage error when an input file does not exist, and
    with status 1 when the memory file contains no samples or the export
    fails.
    """
    parser = argparse.ArgumentParser(
        description="Generate memory timeline visualization from benchmark data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "memory_file",
        type=Path,
        help="Memory samples JSONL from memmon.py",
    )
    parser.add_argument(
        "benchmark_file",
        type=Path,
        nargs="?",
        help="Benchmark results JSONL (optional, for model markers)",
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        help="Output file (default: memory_timeline.html)",
    )
    parser.add_argument(
        "--format",
        choices=["html", "png", "svg"],
        default="html",
        help="Output format (default: html)",
    )
    parser.add_argument(
        "--title",
        default="Memory Timeline",
        help="Chart title",
    )

    args = parser.parse_args()

    # Fail with a usage message rather than a traceback on a bad path.
    if not args.memory_file.is_file():
        parser.error(f"memory file not found: {args.memory_file}")
    if args.benchmark_file and not args.benchmark_file.is_file():
        parser.error(f"benchmark file not found: {args.benchmark_file}")

    # Default output filename
    if not args.output:
        args.output = Path(f"memory_timeline.{args.format}")

    # Parse inputs
    print(f"Reading memory samples: {args.memory_file}")
    samples, summary = parse_memory_samples(args.memory_file)
    print(f"  Found {len(samples)} samples")
    if not samples:
        # Without samples there is no timeline to draw.
        print(f"Error: no memory samples found in {args.memory_file}", file=sys.stderr)
        sys.exit(1)

    model_markers = []
    infra_markers = []
    if args.benchmark_file:
        print(f"Reading benchmark results: {args.benchmark_file}")
        tests_with_model, tests_without_model = parse_benchmark_results(args.benchmark_file)
        print(f"  Found {len(tests_with_model)} test entries with models")
        print(f"  Found {len(tests_without_model)} infrastructure test entries")

        # Get memory start timestamp from first sample
        memory_start_ts = samples[0]["ts"]
        model_markers = correlate_tests_with_timeline(samples, tests_with_model, memory_start_ts)
        infra_markers = correlate_tests_with_timeline(samples, tests_without_model, memory_start_ts)
        print(f"  Correlated {len(model_markers)} model test markers")
        print(f"  Correlated {len(infra_markers)} infrastructure test markers")

    # Create chart
    print(f"Generating {args.format.upper()} chart...")
    fig = create_timeline_chart(samples, summary, model_markers, infra_markers, title=args.title)

    # Export
    if args.format == "html":
        fig.write_html(
            args.output,
            include_plotlyjs="cdn",  # load plotly.js from the CDN instead of embedding it
            full_html=True,
        )
    else:
        try:
            fig.write_image(args.output, scale=2)
        except Exception as e:
            print(
                f"Error: {args.format.upper()} export failed "
                "(static image export requires kaleido: pip install kaleido)"
            )
            print(f"Details: {e}")
            sys.exit(1)

    print(f"Output: {args.output}")


if __name__ == "__main__":
    main()
diff --git a/benchmarks/validate_reports.py b/benchmarks/validate_reports.py
index 8f13c75..e91e4b0 100644
--- a/benchmarks/validate_reports.py
+++ b/benchmarks/validate_reports.py
@@ -77,14 +77,16 @@ def main():
print("Usage: python benchmarks/validate_reports.py [ ...]")
sys.exit(1)
- # Load schema
- schema_path = Path("benchmarks/schemas/report-v0.1.schema.json")
+ # Load schema (always use current version)
+ schema_path = Path("benchmarks/schemas/report-current.schema.json")
if not schema_path.exists():
print(f"Error: Schema not found at {schema_path}")
sys.exit(1)
schema = load_schema(schema_path)
- print(f"📋 Loaded schema: {schema_path}")
+ # Resolve symlink for display
+ resolved = schema_path.resolve()
+ print(f"📋 Loaded schema: {schema_path} → {resolved.name}")
print()
# Validate each file
diff --git a/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md b/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md
index bf4c4c1..601ae26 100644
--- a/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md
+++ b/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md
@@ -89,27 +89,11 @@ This is a **hardware fact** (from `sysctl -n hw.memsize`), not a heuristic.
- Vision >70%: HTTP 507 Insufficient Storage + JSON error response
- Text >70%: `logger.warning("Model size XX.X GB exceeds 70% of YY.Y GB system memory. Expect extreme slowness due to swapping.")` → visible via `--log-level warning` (default) and `--log-json` if enabled
-## TODO
+## Status
-### Phase 1 (2.0.4-beta.1) ✅ COMPLETE
-- [x] Add `system.memory_total_bytes` to JSON-API
-- [x] Schema bump to 0.1.6
-- [x] Document in json-api-specification.md
+**Phase 1+2:** ✅ Complete (2.0.4-beta.1) - See CHANGELOG.md
-### Phase 2 (2.0.4-beta.1) ✅ COMPLETE
-- [x] Implement pre-load memory check in `run.py`
- - `_get_system_memory_bytes()` via `sysctl -n hw.memsize`
- - `check_memory_before_load()` for CLI path
- - `check_memory_for_server()` for server path
-- [x] Vision: ERROR + abort if size > 70% total (empirically confirmed: crash at 73%)
- - CLI: stderr error + exit 1
- - Server: HTTP 507 + JSON error (via `ErrorType.INSUFFICIENT_MEMORY`)
-- [x] Text: Internal log only if size > 70% total (empirically confirmed: no crash at 95-97%)
- - CLI: No user-facing action (backwards compatible)
- - Server: `logger.warning()` only (uses existing `--log-level`/`--log-json` infrastructure)
-- [x] Unit tests: 18 tests in `tests_2.0/test_memory_checks.py`
-
-### Phase 3 (Future)
+**Phase 3 (Future):** Issue #46
- [ ] Configurable threshold (env var or CLI flag)
- [ ] Vision overhead estimation based on model architecture
- [ ] KV-Cache size estimation based on context length
diff --git a/docs/ADR/README.md b/docs/ADR/README.md
index 4c5497b..30ef4a5 100644
--- a/docs/ADR/README.md
+++ b/docs/ADR/README.md
@@ -15,16 +15,16 @@ This directory contains Architecture Decision Records (ADRs) that document signi
| [ADR-005](ADR-005-Clone-Implementation-Beta3.md) | Clone Implementation Beta3 | Superseded by ADR-007 | 2025-09-18 |
| [ADR-006](ADR-006-Clone-Implementation-Revised.md) | Clone Implementation Revised | Superseded by ADR-007 | 2025-09-18 |
| [ADR-007](ADR-007-Clone-Implementation-Fixed.md) | Clone Implementation Fixed Strategy | Accepted | 2025-09-18 |
-| [ADR-008](ADR-008-MLXModel-Package-Format.md) | MLXModel Package Format | Proposed | 2025-10-17 |
+| ADR-008 | MLXModel Package Format | Proposed | (not committed) |
| [ADR-009](ADR-009-Stop-Token-Detection-Fix.md) | Stop Token Detection Fix | Implemented | 2025-10-21 |
-| [ADR-010](ADR-010-Reasoning-Content-API.md) | Reasoning Content API | Draft | 2025-10-21 |
+| ADR-010 | Reasoning Content API | Draft | (not committed) |
| [ADR-011](ADR-011-E2E-Live-Test-Architecture.md) | E2E Live Test Architecture | Implemented | 2025-10-21 |
| [ADR-012](ADR-012-Vision-Support-Roadmap.md) | Vision Support Roadmap | Implemented (Phase 1-3) | 2025-11-12 |
-| [ADR-013](ADR-013-Community-Model-Quality-Database.md) | Community Model Quality Database | Planned | 2025-11-13 |
+| ADR-013 | Community Model Quality Database | Planned | (not committed) |
| [ADR-014](ADR-014-Unix-Pipe-Integration.md) | Unix Pipe Integration | Implemented (Phase 1) | 2025-11-16 |
-| [ADR-016](ADR-016-Memory-Aware-Model-Loading.md) | Memory-Aware Model Loading | Implemented | 2025-11-20 |
-| ADR-015 | Embeddings API | Planned | (future) |
-| ADR-017 | Image Metadata & RAG | Proposed | (future) |
+| ADR-015 | Embeddings API | Planned | (not committed) |
+| [ADR-016](ADR-016-Memory-Aware-Model-Loading.md) | Memory-Aware Model Loading | Implemented (Phase 1-2) | 2025-12-05 |
+| ADR-017 | Image Metadata Extraction (EXIF) | Implemented | (not committed) |
## ADR Format
diff --git a/docs/SERVER-HANDBOOK.md b/docs/SERVER-HANDBOOK.md
index 78986f8..54c1559 100644
--- a/docs/SERVER-HANDBOOK.md
+++ b/docs/SERVER-HANDBOOK.md
@@ -26,7 +26,7 @@ mlxk serve --host 0.0.0.0 --port 8000
- Python 3.9+ (Text models)
- Python 3.10+ (Vision models)
- mlx-lm 0.28.4+
-- mlx-vlm 0.3.9+ (optional, for vision)
+- mlx-vlm 0.3.9+ (optional, for vision; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
---
@@ -380,6 +380,9 @@ python -m mlxk2.core.server_base
pyenv install 3.10
pyenv local 3.10
pip install mlx-lm mlx-vlm
+
+# Beta.3 (pre-0.3.10 fix)
+pip install mlx-lm "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37"
```
### Memory Constraint Errors (HTTP 507)
diff --git a/docs/json-api-specification.md b/docs/json-api-specification.md
index f69ed01..8bc0592 100644
--- a/docs/json-api-specification.md
+++ b/docs/json-api-specification.md
@@ -1,8 +1,8 @@
# MLX-Knife 2.0 JSON API Specification
**Specification Version:** 0.1.6
-**Status:** Alpha - Subject to change
-**Target:** MLX-Knife 2.0.4-beta.1
+**Status:** Stable (backward-compatible)
+**Released:** MLX-Knife 2.0.4-beta.1
> Based on [GitHub Issue #8](https://github.com/mzau/mlx-knife/issues/8) - Comprehensive JSON output support for all commands
@@ -551,29 +551,31 @@ mlxk-json show "Phi-3-mini" --config --json # Include config.json content
}
```
-## Changes in 0.1.6 (Alpha)
+## Changes in 0.1.6 (Stable, 2.0.4-beta.1)
-**ADR-016 Preparation: System Memory Information**
+**System Memory Information**
- Added `system` object to `version` command response
- `system.memory_total_bytes`: Total physical RAM in bytes (from `sysctl hw.memsize`)
- `system` is `null` on non-macOS platforms where sysctl is unavailable
-- Enables ADR-016 Memory-Aware Model Loading (pre-load memory checks)
+- Enables memory-aware model loading (ADR-016)
-**ADR-012: Vision Support - Model Discovery**
+**Model Discovery: Vision capability flag**
- Vision models detected via `preprocessor_config.json` presence
-- `vision` capability added to model discovery (backward-compatible enum extension)
+- `vision` added to `capabilities` enum (backward-compatible extension)
- Visible in `mlxk list --json`, `mlxk show --json`, `mlxk health --json`
- Example: `"capabilities": ["text-generation", "chat", "vision"]`
-**Note on `mlxk run --image` (CLI):**
-- `mlxk run --image` command exists for vision models (ADR-012 Phase 1b)
-- Current output: Text mode only (Markdown table with filename mapping)
-- JSON output: Deferred to ADR-017 Phase 2 (requires formal schema extension)
-- Server OpenAI Vision API documented in `docs/SERVER-HANDBOOK.md`
+**Note:** Vision runtime support (`mlxk run --image`, Server API) is documented in README.md "Multi-Modal Support" and `docs/SERVER-HANDBOOK.md`.
-## Changes in 0.1.5 (Alpha)
+## Changes in 0.1.5 (Stable, 2.0.0)
+
+**Foundation: Model Object Schema**
+
+- Standardized `modelObject` across all commands
+- Machine-readable fields: `size_bytes`, `last_modified` (ISO-8601 UTC with `Z`)
+- No human-readable `size` or `modified` fields (JSON consumers parse structured data)
**Issue #36: Separate Integrity and Runtime Compatibility Checks**
@@ -587,12 +589,6 @@ mlxk-json show "Phi-3-mini" --config --json # Include config.json content
- Gate logic: Runtime check requires passing integrity check first
- `reason` field describes first problem found (integrity > runtime priority)
-## Changes in 0.1.2 (Alpha)
-
-- Introduced a common minimal Model Object for consistency across commands.
-- Replaced human-readable `size` with machine-friendly `size_bytes`.
-- Removed human-readable `modified`; `last_modified` (ISO-8601 UTC) is authoritative.
-
## Operations
### `mlxk-json pull --json`
diff --git a/mlxk2/__init__.py b/mlxk2/__init__.py
index d68e38b..8b9039b 100644
--- a/mlxk2/__init__.py
+++ b/mlxk2/__init__.py
@@ -7,4 +7,4 @@ import warnings
# Issue parity with 1.1.0 (Issue #22)
warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+')
-__version__ = "2.0.4b2"
+__version__ = "2.0.4b3"
diff --git a/mlxk2/core/capabilities.py b/mlxk2/core/capabilities.py
index 09427a2..e8516fd 100644
--- a/mlxk2/core/capabilities.py
+++ b/mlxk2/core/capabilities.py
@@ -167,7 +167,11 @@ def _has_any(path: Path, patterns: Tuple[str, ...]) -> bool:
def _detect_vision_from_config(config: Optional[Dict[str, Any]]) -> bool:
- """Detect vision capability from config.json content."""
+ """Detect vision capability from config.json content.
+
+ Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision.
+ mlx-vlm only supports image vision models (AutoImageProcessor).
+ """
if not isinstance(config, dict):
return False
@@ -181,15 +185,29 @@ def _detect_vision_from_config(config: Optional[Dict[str, Any]]) -> bool:
return True
# Check for embedded preprocessor_config
- if isinstance(config.get("preprocessor_config"), dict):
+ preprocessor_cfg = config.get("preprocessor_config")
+ if isinstance(preprocessor_cfg, dict):
+ # Exclude video processors (requires PyTorch/Torchvision)
+ if preprocessor_cfg.get("processor_class") == "AutoVideoProcessor":
+ return False
+ if "temporal_patch_size" in preprocessor_cfg:
+ return False
return True
return False
def _detect_vision_from_files(model_path: Path) -> bool:
- """Detect vision capability from file presence."""
- return _has_any(
+ """Detect vision capability from file presence.
+
+ Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision.
+ mlx-vlm only supports image vision models (AutoImageProcessor).
+ """
+ # Check if it's a video model (requires PyTorch/Torchvision)
+ if (model_path / "video_preprocessor_config.json").exists():
+ return False
+
+ if _has_any(
model_path,
(
"preprocessor_config.json",
@@ -199,7 +217,25 @@ def _detect_vision_from_files(model_path: Path) -> bool:
"**/processor_config.json",
"**/image_processor_config.json",
),
- )
+ ):
+ # Found vision-related files, but check if it's a video processor
+ preprocessor_path = model_path / "preprocessor_config.json"
+ if preprocessor_path.exists():
+ try:
+ import json
+ with open(preprocessor_path) as f:
+ preprocessor_data = json.load(f)
+ if isinstance(preprocessor_data, dict):
+ # Video model indicators
+ if preprocessor_data.get("processor_class") == "AutoVideoProcessor":
+ return False
+ if "temporal_patch_size" in preprocessor_data:
+ return False
+ except Exception:
+ pass
+ return True
+
+ return False
def _check_mlx_vlm_available() -> bool:
diff --git a/mlxk2/core/server_base.py b/mlxk2/core/server_base.py
index f9d6cfd..be7db39 100644
--- a/mlxk2/core/server_base.py
+++ b/mlxk2/core/server_base.py
@@ -118,8 +118,6 @@ def get_or_load_model(model_spec: str, verbose: bool = False) -> Any:
raise HTTPException(status_code=503, detail="Server is shutting down")
# Simple approach like run command - let MLXRunner handle everything
if _current_model_path != model_spec:
- logger.info(f"Switching to model: {model_spec}", model=model_spec)
-
# Clean up previous model
if _model_cache:
try:
@@ -229,8 +227,7 @@ def get_or_load_model(model_spec: str, verbose: bool = False) -> Any:
_model_cache[model_spec] = runner
_current_model_path = model_spec
- backend_name = "vision" if policy.backend == Backend.MLX_VLM else "text"
- logger.info(f"Model loaded successfully ({backend_name}): {model_spec}", model=model_spec)
+ logger.info(f"Switched to model: {model_spec}", model=model_spec)
except HTTPException:
# Re-raise HTTP exceptions (501, 507, etc.) from vision/memory checks
@@ -767,11 +764,10 @@ async def list_models():
"""List available MLX models in the cache.
Returns models sorted with preloaded model first (if set), then alphabetically.
- Filters to healthy MLX models (runtime compatibility deferred to P2 refactoring).
+ Filters to healthy + runtime_compatible models.
"""
from .cache import cache_dir_to_hf
- from ..operations.common import detect_framework, read_front_matter
- from ..operations.health import is_model_healthy
+ from ..operations.common import build_model_object
model_list = []
model_cache = get_current_model_cache()
@@ -783,8 +779,7 @@ async def list_models():
model_name = cache_dir_to_hf(model_dir.name)
try:
- # Check if it's a healthy MLX model
- # Get the latest snapshot for detection
+ # Get snapshot path
snapshots_dir = model_dir / "snapshots"
selected_path = None
if snapshots_dir.exists():
@@ -792,27 +787,21 @@ async def list_models():
if snapshots:
selected_path = snapshots[0]
- # Read front-matter for framework detection (align with CLI behavior)
- probe = selected_path if selected_path is not None else model_dir
- fm = read_front_matter(probe)
+ # Use shared build_model_object (single source of truth)
+ model_obj = build_model_object(model_name, model_dir, selected_path)
- framework = detect_framework(model_name, model_dir, selected_path, fm)
- healthy, _ = is_model_healthy(model_name)
-
- # Filter: Only MLX + healthy models
- # TODO P2: Add runtime_compatible check (needs refactoring to avoid duplication)
- if framework != "MLX" or not healthy:
+ # Filter: healthy AND runtime_compatible
+ if model_obj.get("health") != "healthy":
+ continue
+ if not model_obj.get("runtime_compatible"):
continue
# Get model context length (best effort)
context_length = None
try:
- snapshots_dir = model_dir / "snapshots"
- if snapshots_dir.exists():
- snapshots = [d for d in snapshots_dir.iterdir() if d.is_dir()]
- if snapshots:
- from .runner import get_model_context_length
- context_length = get_model_context_length(str(snapshots[0]))
+ if selected_path:
+ from .runner import get_model_context_length
+ context_length = get_model_context_length(str(selected_path))
except Exception:
pass
diff --git a/mlxk2/core/vision_runner.py b/mlxk2/core/vision_runner.py
index bce9071..e41409d 100644
--- a/mlxk2/core/vision_runner.py
+++ b/mlxk2/core/vision_runner.py
@@ -97,9 +97,8 @@ class VisionRunner:
raise RuntimeError("mlx-vlm is missing load()/generate() API")
# mlx-vlm expects HF repo_id, not local path
- # fix_mistral_regex=True: Suppress tokenizer regex warning for Mistral-based models
# local_files_only=True: Use mlx-knife's cache only, never download (pull's responsibility)
- loaded = self._load(self.model_name, fix_mistral_regex=True, local_files_only=True)
+ loaded = self._load(self.model_name, local_files_only=True)
if isinstance(loaded, tuple):
# Common pattern: (model, processor)
self.model = loaded[0] if len(loaded) > 0 else None
@@ -256,9 +255,9 @@ class VisionRunner:
lat = convert_to_degrees(gps_dict.get("GPSLatitude"))
lon = convert_to_degrees(gps_dict.get("GPSLongitude"))
- if lat and gps_dict.get("GPSLatitudeRef") == "S":
+ if lat is not None and gps_dict.get("GPSLatitudeRef") == "S":
lat = -lat
- if lon and gps_dict.get("GPSLongitudeRef") == "W":
+ if lon is not None and gps_dict.get("GPSLongitudeRef") == "W":
lon = -lon
exif.gps_lat = lat
@@ -280,7 +279,7 @@ class VisionRunner:
exif.camera = str(camera).strip()
# Return None if no useful EXIF found
- if not any([exif.gps_lat, exif.gps_lon, exif.datetime, exif.camera]):
+ if all(x is None for x in [exif.gps_lat, exif.gps_lon, exif.datetime, exif.camera]):
return None
return exif
diff --git a/mlxk2/operations/common.py b/mlxk2/operations/common.py
index 3e96519..1b16d5a 100644
--- a/mlxk2/operations/common.py
+++ b/mlxk2/operations/common.py
@@ -144,7 +144,8 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
MLX if:
- org is mlx-community/*, or
- README front-matter tags include 'mlx', or
- - README front-matter library_name == 'mlx'.
+ - README front-matter library_name == 'mlx', or
+ - config.json contains 'quantization' key (MLX-specific).
Else GGUF if any *.gguf present under selected_path or snapshots.
Else PyTorch if any *.safetensors or pytorch_model.bin present under snapshots.
@@ -154,6 +155,13 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
if "mlx-community/" in hf_name:
return "MLX"
+ # Search location preference: selected snapshot, else model root
+ root = selected_path if selected_path is not None else model_root
+
+ # Read front-matter if not provided (Issue #48: self-contained detection)
+ if fm is None:
+ fm = read_front_matter(root)
+
# Front-matter signals
if fm is not None:
tags = [t.lower() for t in (fm.tags or [])]
@@ -161,8 +169,10 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
if "mlx" in tags or lib == "mlx":
return "MLX"
- # Search location preference: selected snapshot, else model root
- root = selected_path if selected_path is not None else model_root
+ # Config-based detection: 'quantization' key is MLX-specific (Issue #48)
+ config = _load_config_json(root)
+ if config and "quantization" in config:
+ return "MLX"
if _has_any(root, ("**/*.gguf",)):
return "GGUF"
@@ -176,7 +186,7 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
return "Unknown"
-def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints: Dict[str, Any]) -> str:
+def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints: Dict[str, Any], probe: Optional[Path] = None) -> str:
name = hf_name.lower()
if "embed" in name:
return "embedding"
@@ -190,13 +200,20 @@ def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints:
ct = tok_hints.get("chat_template")
if isinstance(ct, str) and ct.strip():
return "chat"
+ # Check for chat_template.json file (Issue #48: reliable indicator)
+ if probe is not None and (probe / "chat_template.json").exists():
+ return "chat"
if "instruct" in name or "chat" in name:
return "chat"
return "base"
def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> bool:
- """Detect whether the model snapshot supports vision inputs."""
+ """Detect whether the model snapshot supports vision inputs.
+
+ Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision.
+ mlx-vlm only supports image vision models (AutoImageProcessor).
+ """
try:
if isinstance(config, dict):
mt = config.get("model_type")
@@ -208,6 +225,9 @@ def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> b
preprocessor_cfg = config.get("preprocessor_config")
if isinstance(preprocessor_cfg, dict):
+ # Exclude video processors (requires PyTorch/Torchvision)
+ if preprocessor_cfg.get("processor_class") == "AutoVideoProcessor":
+ return False
return True
if _has_any(
@@ -221,6 +241,25 @@ def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> b
"**/image_processor_config.json",
),
):
+ # Check if it's a video processor (requires PyTorch/Torchvision)
+ # Video models have video_preprocessor_config.json or temporal_patch_size
+ if (probe / "video_preprocessor_config.json").exists():
+ return False
+
+ preprocessor_path = probe / "preprocessor_config.json"
+ if preprocessor_path.exists():
+ try:
+ import json
+ with open(preprocessor_path) as f:
+ preprocessor_data = json.load(f)
+ if isinstance(preprocessor_data, dict):
+ # Video model indicators
+ if preprocessor_data.get("processor_class") == "AutoVideoProcessor":
+ return False
+ if "temporal_patch_size" in preprocessor_data:
+ return False
+ except Exception:
+ pass
return True
except Exception:
return False
@@ -308,7 +347,7 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P
config = _load_config_json(probe)
framework = detect_framework(hf_name, model_root, selected_path=selected_path, fm=fm)
- model_type = detect_model_type(hf_name, config, tok)
+ model_type = detect_model_type(hf_name, config, tok, probe)
capabilities = detect_capabilities(model_type, hf_name, tok, config, probe)
has_vision = "vision" in capabilities
@@ -316,17 +355,21 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P
healthy, health_reason = is_model_healthy(hf_name)
# Runtime compatibility: ALWAYS computed (gate logic applies)
- # Gate: Only check runtime if file integrity is healthy
+ # Gate 1: File integrity must be healthy
+ # Gate 2: Framework must be MLX (only backend supported)
runtime_reason: Optional[str] = None
- if healthy:
- if has_vision:
- runtime_compatible, runtime_reason = vision_runtime_compatibility()
- else:
- runtime_compatible, runtime_reason = check_runtime_compatibility(probe, framework)
- else:
+ if not healthy:
# File integrity failed → skip runtime check
runtime_compatible = False
runtime_reason = None # health_reason takes precedence
+ elif framework != "MLX":
+ # Non-MLX frameworks not supported (PyTorch, GGUF, etc.)
+ runtime_compatible = False
+ runtime_reason = f"Incompatible framework: {framework}"
+ elif has_vision:
+ runtime_compatible, runtime_reason = vision_runtime_compatibility()
+ else:
+ runtime_compatible, runtime_reason = check_runtime_compatibility(probe, framework)
# Reason field: First problem encountered (health → runtime)
reason = health_reason if not healthy else runtime_reason
diff --git a/mlxk2/output/human.py b/mlxk2/output/human.py
index 7c56dc3..ef59c9b 100644
--- a/mlxk2/output/human.py
+++ b/mlxk2/output/human.py
@@ -134,25 +134,20 @@ def render_list(data: Dict[str, Any], show_health: bool, show_all: bool, verbose
headers.append("Health")
# Human filter:
- # - --all: show everything
- # - default: show only MLX chat models (safer for run/server selection)
- # - --verbose (without --all): show all MLX models (chat + base)
+ # - --all: show everything (no filter)
+ # - default/verbose: only healthy + runtime_compatible (runnable models)
+ # Same filter as Server /v1/models - single source of truth via build_model_object
filtered: List[Dict[str, Any]] = []
for m in models:
- fw = str(m.get("framework", "")).upper()
- typ = str(m.get("model_type", "")).lower()
if show_all:
filtered.append(m)
else:
- if fw != "MLX":
+ # Filter: healthy AND runtime_compatible
+ if m.get("health") != "healthy":
continue
- if verbose:
- # In verbose mode, show all MLX models
- filtered.append(m)
- else:
- # Default compact mode: only MLX chat
- if typ == "chat":
- filtered.append(m)
+ if not m.get("runtime_compatible"):
+ continue
+ filtered.append(m)
rows: List[List[str]] = []
for m in filtered:
diff --git a/pyproject.toml b/pyproject.toml
index f397ce9..7dea8f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ classifiers = [
"License :: OSI Approved :: Apache Software License",
]
dependencies = [
- "huggingface-hub>=0.34.0,<1.0",
+ "huggingface-hub>=0.34.0",
"requests>=2.32.0",
"mlx-lm>=0.28.4",
"mlx>=0.29.0",
@@ -66,7 +66,7 @@ dev = [
"mypy>=1.5.0",
]
vision = [
- "mlx-vlm>=0.3.9", # Vision Language Models support (ADR-012, requires Python 3.10+)
+ "mlx-vlm>=0.3.9", # Vision Language Models support (ADR-012, requires Python 3.10+; beta.3 recommends mlx-vlm commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
]
[tool.setuptools]
diff --git a/tests_2.0/live/conftest.py b/tests_2.0/live/conftest.py
index dda3ac2..b7ff59e 100644
--- a/tests_2.0/live/conftest.py
+++ b/tests_2.0/live/conftest.py
@@ -21,6 +21,7 @@ from .test_utils import (
discover_mlx_models_in_user_cache,
discover_text_models,
discover_vision_models,
+ parse_vm_stat_page_size,
TEST_MODELS,
)
@@ -499,9 +500,176 @@ def report_benchmark(request):
# ============================================================================
-# Benchmark Reporting (ADR-013 Phase 0)
+# Benchmark Reporting (ADR-013 Phase 0 + 0.5)
# ============================================================================
+def _get_macos_system_health() -> Dict[str, Any]:
+ """Collect macOS system health metrics (ADR-013 Phase 0.5 - v0.2.0).
+
+ Uses macOS-native tools (sysctl, vm_stat, ps) - ZERO new dependencies.
+ Enables automatic regression quality assessment via quality_flags.
+
+ Returns:
+ dict: System health metrics with keys:
+ - swap_used_mb: Current swap usage in MB
+ - ram_free_gb: Available RAM in GB
+ - zombie_processes: Count of zombie processes
+ - quality_flags: List of quality indicators
+ ["clean"] = healthy system
+ ["degraded_swap"] = swap usage detected (memory pressure)
+ ["degraded_zombies"] = zombie processes detected
+
+ Quality Thresholds (empirically derived from Session 43 analysis):
+ - Swap: >100 MB indicates memory pressure (beta2→beta3: 1.8 GB swap = +3.4% slowdown)
+ - Zombies: >0 indicates stuck processes (REGRESSION-2025-12-08: 14 zombies = +90% slowdown)
+ """
+ import subprocess
+
+ health = {
+ "swap_used_mb": 0,
+ "ram_free_gb": 0.0,
+ "zombie_processes": 0,
+ "quality_flags": []
+ }
+
+ try:
+ # Get swap usage via sysctl (macOS native)
+ # sysctl vm.swapusage returns: "vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)"
+ result = subprocess.run(
+ ["sysctl", "vm.swapusage"],
+ capture_output=True,
+ text=True,
+ timeout=2
+ )
+ if result.returncode == 0:
+ # Parse: "total = X.XXM used = Y.YYM free = Z.ZZM"
+ for part in result.stdout.split():
+ if part.endswith("M") and "used" in result.stdout:
+ # Extract used value (appears after "used = ")
+ parts = result.stdout.split("used = ")
+ if len(parts) > 1:
+ used_str = parts[1].split()[0]
+ # Parse size (can be M or G suffix)
+ if used_str.endswith("G"):
+ health["swap_used_mb"] = int(float(used_str[:-1]) * 1024)
+ elif used_str.endswith("M"):
+ health["swap_used_mb"] = int(float(used_str[:-1]))
+ break
+ except Exception:
+ pass # Swap metric is optional (not critical if it fails)
+
+ try:
+ # Get free RAM via vm_stat (macOS native)
+ # vm_stat reports page size in the header (Apple Silicon uses 16KB pages).
+ result = subprocess.run(
+ ["vm_stat"],
+ capture_output=True,
+ text=True,
+ timeout=2
+ )
+ if result.returncode == 0:
+ page_size = parse_vm_stat_page_size(result.stdout)
+ # Parse "Pages free: 12345."
+ for line in result.stdout.splitlines():
+ if "Pages free:" in line:
+ pages_free = int(line.split(":")[1].strip().rstrip("."))
+ health["ram_free_gb"] = round(pages_free * page_size / (1024**3), 2)
+ break
+ except Exception:
+ pass # RAM metric is optional
+
+ try:
+ # Get zombie process count via ps aux (macOS native)
+ # Zombies show as "<defunct>" in ps output
+ result = subprocess.run(
+ ["ps", "aux"],
+ capture_output=True,
+ text=True,
+ timeout=2
+ )
+ if result.returncode == 0:
+ # Count occurrences of the "<defunct>" marker that ps prints for zombies
+ health["zombie_processes"] = result.stdout.count("<defunct>")
+ except Exception:
+ pass # Zombie count is optional
+
+ # Determine quality flags (empirical thresholds from regression analysis)
+ flags = []
+ if health["swap_used_mb"] > 100:
+ flags.append("degraded_swap")
+ if health["zombie_processes"] > 0:
+ flags.append("degraded_zombies")
+
+ # If no degradation detected, mark as clean
+ if not flags:
+ flags.append("clean")
+
+ health["quality_flags"] = flags
+ return health
+
+
+def _get_macos_hardware_profile() -> Dict[str, Any]:
+ """Collect macOS hardware profile (ADR-013 Phase 0.5 - v0.2.0).
+
+ Uses macOS-native sysctl - ZERO new dependencies.
+ Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4).
+
+ Returns:
+ dict: Hardware profile with keys:
+ - model: Mac model identifier from `sysctl hw.model` (e.g., "Mac14,9" = 14-inch MacBook Pro, 2023)
+ - cores_physical: Physical CPU cores (`sysctl hw.physicalcpu`; counts all physical cores, not only P-cores)
+ - cores_logical: Logical CPU cores (`sysctl hw.logicalcpu`)
+ """
+ import subprocess
+
+ profile = {
+ "model": "unknown",
+ "cores_physical": 0,
+ "cores_logical": 0,
+ }
+
+ try:
+ # Get Mac model identifier
+ result = subprocess.run(
+ ["sysctl", "-n", "hw.model"],
+ capture_output=True,
+ text=True,
+ timeout=2
+ )
+ if result.returncode == 0:
+ profile["model"] = result.stdout.strip()
+ except Exception:
+ pass
+
+ try:
+ # Get physical core count (hw.physicalcpu)
+ result = subprocess.run(
+ ["sysctl", "-n", "hw.physicalcpu"],
+ capture_output=True,
+ text=True,
+ timeout=2
+ )
+ if result.returncode == 0:
+ profile["cores_physical"] = int(result.stdout.strip())
+ except Exception:
+ pass
+
+ try:
+ # Get logical core count (hw.logicalcpu)
+ result = subprocess.run(
+ ["sysctl", "-n", "hw.logicalcpu"],
+ capture_output=True,
+ text=True,
+ timeout=2
+ )
+ if result.returncode == 0:
+ profile["cores_logical"] = int(result.stdout.strip())
+ except Exception:
+ pass
+
+ return profile
+
+
def pytest_addoption(parser):
"""Add --report-output option for benchmark reporting."""
parser.addoption(
@@ -509,7 +677,7 @@ def pytest_addoption(parser):
action="store",
default=None,
metavar="PATH",
- help="Generate benchmark reports to JSONL file (ADR-013 Phase 0)"
+ help="Generate benchmark reports to JSONL file (ADR-013 Phase 0.5)"
)
@@ -534,11 +702,16 @@ def pytest_runtest_makereport(item, call):
Reports are written as JSONL (one JSON object per line) to allow
streaming and easy appending across test runs.
- Schema version: 0.1.0 (Phase 0 - Experimental)
- See: benchmarks/schemas/report-v0.1.schema.json
+ Schema version: 0.2.0 (Phase 0.5 - System Health + Hardware Profile)
+ See: ADR-013 Phase 0.5 implementation
+
+ Changelog from 0.1.0 → 0.2.0:
+ - Added: system.hardware_profile (Mac model, cores)
+ - Added: system_health (swap, RAM, zombies, quality_flags)
+ - Backward compatible: All 0.1.0 fields preserved
"""
import json
- from datetime import datetime
+ from datetime import datetime, timezone
outcome = yield
report = outcome.get_result()
@@ -553,8 +726,8 @@ def pytest_runtest_makereport(item, call):
# Build report data (required fields)
data = {
- "schema_version": "0.1.0",
- "timestamp": datetime.utcnow().isoformat() + "Z",
+ "schema_version": "0.2.0",
+ "timestamp": datetime.now(timezone.utc).isoformat(),
"mlx_knife_version": __version__,
"test": item.nodeid,
"outcome": report.outcome,
@@ -581,6 +754,20 @@ def pytest_runtest_makereport(item, call):
# Everything else goes to metadata
data.setdefault("metadata", {})[key] = value
+ # ADR-013 Phase 0.5: Collect system health metrics (v0.2.0)
+ # Enables automatic regression quality assessment
+ system_health = _get_macos_system_health()
+ data["system_health"] = system_health
+
+ # ADR-013 Phase 0.5: Collect hardware profile (v0.2.0)
+ # Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4)
+ hardware_profile = _get_macos_hardware_profile()
+
+ # Add hardware_profile to system section (create if not exists)
+ if "system" not in data:
+ data["system"] = {}
+ data["system"]["hardware_profile"] = hardware_profile
+
# Write JSONL (one line per report)
try:
item.config.report_file.write(json.dumps(data) + "\n")
@@ -588,4 +775,3 @@ def pytest_runtest_makereport(item, call):
except Exception as e:
# Don't fail tests if reporting fails
print(f"\n⚠️ Benchmark report write failed: {e}")
-
diff --git a/tests_2.0/live/test_server_e2e.py b/tests_2.0/live/test_server_e2e.py
index a877fa7..5803ee6 100644
--- a/tests_2.0/live/test_server_e2e.py
+++ b/tests_2.0/live/test_server_e2e.py
@@ -42,6 +42,8 @@ from .test_utils import (
# Server request timeout (increased from 30s to 45s in Session 22)
# Accounts for: baseline (15s) + probe/policy overhead (2.7s) + generation + safety margin
SERVER_REQUEST_TIMEOUT = 45.0
+# /v1/models can be slower due to cache scans + runtime checks
+MODEL_LIST_TIMEOUT = 20.0
# Opt-in markers
pytestmark = [
@@ -100,7 +102,7 @@ class TestServerHealthEndpoints:
pytest.skip("No text models available within RAM budget")
with LocalServer(test_model) as server_url:
- response = httpx.get(f"{server_url}/v1/models")
+ response = httpx.get(f"{server_url}/v1/models", timeout=MODEL_LIST_TIMEOUT)
assert response.status_code == 200
data = response.json()
diff --git a/tests_2.0/live/test_utils.py b/tests_2.0/live/test_utils.py
index 3022f23..c6f2141 100644
--- a/tests_2.0/live/test_utils.py
+++ b/tests_2.0/live/test_utils.py
@@ -8,6 +8,7 @@ Provides:
from __future__ import annotations
+import re
import sys
from pathlib import Path
from typing import Dict, Any, Tuple
@@ -108,6 +109,14 @@ def get_system_memory_bytes() -> int:
return 0
+def parse_vm_stat_page_size(output: str) -> int:
+ """Extract vm_stat page size in bytes, falling back to 4096."""
+ match = re.search(r"page size of (\d+) bytes", output)
+ if match:
+ return int(match.group(1))
+ return 4096
+
+
def discover_text_models() -> list[Dict[str, Any]]:
"""Discover text-only models (filter out Vision models).
diff --git a/tests_2.0/live/test_vm_stat_parsing.py b/tests_2.0/live/test_vm_stat_parsing.py
new file mode 100644
index 0000000..500a6ab
--- /dev/null
+++ b/tests_2.0/live/test_vm_stat_parsing.py
@@ -0,0 +1,13 @@
+"""Unit tests for vm_stat parsing helpers."""
+
+from .test_utils import parse_vm_stat_page_size
+
+
+def test_parse_vm_stat_page_size_apple_silicon():
+ output = "Mach Virtual Memory Statistics: (page size of 16384 bytes)\nPages free: 12345."
+ assert parse_vm_stat_page_size(output) == 16384
+
+
+def test_parse_vm_stat_page_size_fallback():
+ output = "Pages free: 12345."
+ assert parse_vm_stat_page_size(output) == 4096
diff --git a/tests_2.0/test_human_output.py b/tests_2.0/test_human_output.py
index bcd3d21..fec5e06 100644
--- a/tests_2.0/test_human_output.py
+++ b/tests_2.0/test_human_output.py
@@ -18,6 +18,7 @@ def sample_list_data():
"model_type": "chat",
"capabilities": ["text-generation", "chat"],
"health": "healthy",
+ "runtime_compatible": True,
"cached": True,
},
{
@@ -29,6 +30,7 @@ def sample_list_data():
"model_type": "base",
"capabilities": ["text-generation"],
"health": "unhealthy",
+ "runtime_compatible": False,
"cached": True,
},
],
@@ -98,6 +100,7 @@ def test_list_human_filters_mlx_base_default():
"model_type": "chat",
"capabilities": ["text-generation", "chat"],
"health": "healthy",
+ "runtime_compatible": True,
"cached": True,
},
{
@@ -109,26 +112,42 @@ def test_list_human_filters_mlx_base_default():
"model_type": "base",
"capabilities": ["text-generation"],
"health": "healthy",
+ "runtime_compatible": True,
+ "cached": True,
+ },
+ {
+ "name": "org/Unhealthy",
+ "hash": None,
+ "size_bytes": 500,
+ "last_modified": "2025-08-30T12:00:00Z",
+ "framework": "MLX",
+ "model_type": "chat",
+ "capabilities": ["text-generation"],
+ "health": "unhealthy",
+ "runtime_compatible": False,
"cached": True,
},
],
- "count": 2,
+ "count": 3,
},
"error": None,
}
- # Default (compact) should hide MLX base
+ # Default: shows healthy + runtime_compatible models (both MLXChat and MLXBase)
out_default = render_list(data, show_health=False, show_all=False, verbose=False)
assert "MLXChat" in out_default
- assert "MLXBase" not in out_default
+ assert "MLXBase" in out_default
+ assert "Unhealthy" not in out_default
- # Verbose (without --all) shows all MLX (chat + base)
+ # Verbose: same filter, more columns
out_verbose = render_list(data, show_health=False, show_all=False, verbose=True)
assert "MLXChat" in out_verbose
assert "MLXBase" in out_verbose
+ assert "Unhealthy" not in out_verbose
-def test_list_human_verbose_shows_all_mlx_only():
+def test_list_human_filters_by_healthy_and_runtime_compatible():
+ """Test that default/verbose filters by healthy + runtime_compatible."""
from mlxk2.output.human import render_list
data = {
@@ -136,9 +155,9 @@ def test_list_human_verbose_shows_all_mlx_only():
"command": "list",
"data": {
"models": [
- {"name": "org/MLXChat", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", "cached": True},
- {"name": "org/MLXBase", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "cached": True},
- {"name": "org/OtherPT", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "PyTorch", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "cached": True},
+ {"name": "org/Runnable", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", "runtime_compatible": True, "cached": True},
+ {"name": "org/Unhealthy", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "base", "capabilities": ["text-generation"], "health": "unhealthy", "runtime_compatible": True, "cached": True},
+ {"name": "org/NotCompatible", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "PyTorch", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "runtime_compatible": False, "cached": True},
],
"count": 3,
},
@@ -146,11 +165,12 @@ def test_list_human_verbose_shows_all_mlx_only():
}
out_verbose = render_list(data, show_health=False, show_all=False, verbose=True)
- # Shows both MLX models (chat+base)
- assert "MLXChat" in out_verbose
- assert "MLXBase" in out_verbose
- # Hides non-MLX
- assert "OtherPT" not in out_verbose
+ # Shows only healthy + runtime_compatible
+ assert "Runnable" in out_verbose
+ # Hides unhealthy
+ assert "Unhealthy" not in out_verbose
+ # Hides not runtime_compatible
+ assert "NotCompatible" not in out_verbose
def test_list_human_all_shows_all_frameworks():
diff --git a/tests_2.0/test_issue_30_preflight.py b/tests_2.0/test_issue_30_preflight.py
index 722a57e..5496bed 100644
--- a/tests_2.0/test_issue_30_preflight.py
+++ b/tests_2.0/test_issue_30_preflight.py
@@ -4,6 +4,21 @@ import pytest
from mlxk2.operations.pull import preflight_repo_access, pull_operation
+def _create_mock_response(status_code=403):
+ """Create a mock httpx.Response for huggingface-hub 1.x exceptions.
+
+ Hub 1.x requires response parameter to be a real httpx.Response object.
+ """
+ try:
+ import httpx
+ # Create minimal mock response
+ request = httpx.Request("GET", "https://huggingface.co/api/models/test")
+ return httpx.Response(status_code=status_code, request=request)
+ except ImportError:
+ # Fallback for older hub versions that don't need it
+ return None
+
+
def test_preflight_private_model_without_token(monkeypatch):
"""Test preflight check with a known private model without token.
@@ -29,7 +44,8 @@ def test_preflight_private_model_without_token(monkeypatch):
from huggingface_hub import errors as _hub_errors
GatedRepoError = _hub_errors.GatedRepoError
def _fake_model_info(self, repo_id, token=None):
- raise GatedRepoError("Gated/private repository")
+ response = _create_mock_response(status_code=403)
+ raise GatedRepoError("Gated/private repository", response=response)
monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
success, error = preflight_repo_access("org/private-model")
@@ -53,7 +69,8 @@ def test_preflight_nonexistent_model(monkeypatch):
from huggingface_hub import errors as _hub_errors
RepositoryNotFoundError = _hub_errors.RepositoryNotFoundError
def _fake_model_info(self, repo_id, token=None):
- raise RepositoryNotFoundError("Not found")
+ response = _create_mock_response(status_code=404)
+ raise RepositoryNotFoundError("Not found", response=response)
monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
success, error = preflight_repo_access("definitely-not-existing-model-12345-xyz")
@@ -78,7 +95,8 @@ def test_preflight_integration_in_pull(isolated_cache, monkeypatch):
from huggingface_hub import errors as _hub_errors
RepositoryNotFoundError = _hub_errors.RepositoryNotFoundError
def _fake_model_info(self, repo_id, token=None):
- raise RepositoryNotFoundError("Not found")
+ response = _create_mock_response(status_code=404)
+ raise RepositoryNotFoundError("Not found", response=response)
monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
# Test with a non-existent model - should fail at preflight stage
@@ -145,7 +163,8 @@ def test_preflight_prevents_cache_pollution(isolated_cache, monkeypatch):
from huggingface_hub import errors as _hub_errors
GatedRepoError = _hub_errors.GatedRepoError
def _fake_model_info(self, repo_id, token=None):
- raise GatedRepoError("Gated/private repository")
+ response = _create_mock_response(status_code=403)
+ raise GatedRepoError("Gated/private repository", response=response)
monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
# Attempt to pull a gated/private model
diff --git a/tests_2.0/test_server_models_and_errors.py b/tests_2.0/test_server_models_and_errors.py
index 5f96191..a5d3733 100644
--- a/tests_2.0/test_server_models_and_errors.py
+++ b/tests_2.0/test_server_models_and_errors.py
@@ -63,45 +63,57 @@ def test_unknown_model_maps_to_404():
assert resp.status_code == 404
-def test_models_endpoint_filters_non_mlx_and_unhealthy():
- """Ensure /v1/models excludes non-MLX and unhealthy entries."""
+def test_models_endpoint_filters_unhealthy_and_not_runtime_compatible():
+ """Ensure /v1/models excludes unhealthy and non-runtime-compatible entries.
+
+ Filter logic: healthy == True AND runtime_compatible == True
+ Patches the shared build_model_object from common.py (single source of truth) to set both flags.
+ """
client = TestClient(app)
with patch('mlxk2.core.server_base.get_current_model_cache') as mock_cache, \
patch('mlxk2.core.cache.cache_dir_to_hf') as mock_cache_to_hf, \
- patch('mlxk2.operations.common.detect_framework') as mock_framework, \
- patch('mlxk2.operations.health.is_model_healthy') as mock_healthy:
+ patch('mlxk2.operations.common.build_model_object') as mock_build:
- # Two cached dirs
- d1 = MagicMock(); d1.name = "models--org--mlx"
- d2 = MagicMock(); d2.name = "models--org--pt"
- mock_cache.return_value.iterdir.return_value = [d1, d2]
+ # Three cached dirs: one that should be listed, one unhealthy, one not runtime-compatible
+ d1 = MagicMock(); d1.name = "models--org--healthy-compatible"
+ d2 = MagicMock(); d2.name = "models--org--unhealthy"
+ d3 = MagicMock(); d3.name = "models--org--not-compatible"
+
+ # Give each dir a snapshots/ mock holding one snapshot; lambda defaults bind this iteration's mocks
+ for d in [d1, d2, d3]:
+ snapshot_dir = MagicMock()
+ snapshot_path = MagicMock()
+ snapshot_dir.exists.return_value = True
+ snapshot_dir.iterdir.return_value = [snapshot_path]
+ snapshot_path.is_dir.return_value = True
+ d.__truediv__ = lambda self, x, snap=snapshot_dir, spath=snapshot_path: snap if x == "snapshots" else spath
+
+ mock_cache.return_value.iterdir.return_value = [d1, d2, d3]
# Map names
def map_name(n):
- if n == "models--org--mlx":
- return "org/mlx"
- return "org/pt"
-
+ return n.replace("models--", "").replace("--", "/")
mock_cache_to_hf.side_effect = map_name
- # Framework detection: d1 is MLX, d2 is not
- def detect_fw(model_name, *_args, **_kwargs):
- return "MLX" if model_name.endswith("/mlx") else "PyTorch"
-
- mock_framework.side_effect = detect_fw
-
- # Health: return False for the MLX one to ensure it is filtered, too
- def health(model_name):
- return (False, None) if model_name.endswith("/mlx") else (True, None)
-
- mock_healthy.side_effect = health
+ # Fake build_model_object: health/runtime_compatible are chosen from the model name
+ def build(model_name, model_dir, selected_path):
+ if "unhealthy" in model_name:
+ return {"health": "unhealthy", "runtime_compatible": True}
+ elif "not-compatible" in model_name:
+ return {"health": "healthy", "runtime_compatible": False}
+ else:
+ return {"health": "healthy", "runtime_compatible": True}
+ mock_build.side_effect = build
resp = client.get("/v1/models")
assert resp.status_code == 200
data = resp.json()
- # Both should be filtered: one not MLX, one unhealthy
- assert data.get("data") == []
+ # Only d1 (org/healthy-compatible: healthy + runtime_compatible) should be listed
+ model_ids = [m["id"] for m in data.get("data", [])]
+ assert "org/healthy-compatible" in model_ids
+ assert "org/unhealthy" not in model_ids
+ assert "org/not-compatible" not in model_ids
def test_chat_unknown_model_maps_to_404():