diff --git a/.gitignore b/.gitignore
index 1487cdf..a33ade5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ openwebui311/bin/
 *_report.json
 test-img-collection/
 small-img-collection
+benchmarks/reports/*.html
 
 # Benchmark reports (ADR-013 Phase 0)
 # These reports ARE tracked in git for historical data
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49effbc..df79049 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,97 @@
 # Changelog
 
-## [2.0.4-beta.1] - WIP
+## [2.0.4-beta.3] - 2025-12-23
+
+### Added
+
+- **Benchmark Infrastructure v1.0 (ADR-013 Phase 0):**
+  - Template-based report generator: `benchmarks/generate_benchmark_report.py`
+  - Per-model statistics, per-test statistics, system health summary
+  - Schema validation: `benchmarks/validate_reports.py`
+  - Documentation: `benchmarks/README.md`, `benchmarks/TESTING.md`
+  - Quality tracking: Schema v0.2.0 with system_health (swap, RAM, zombies, quality_flags)
+  - Page-size fix: Corrected Apple Silicon 16KB page size (RAM values were 4x too low)
+  - Files: `tests_2.0/live/conftest.py`, `test_utils.py`, `test_vm_stat_parsing.py`
+
+- **Memory Timeline Visualization:**
+  - Interactive HTML visualizer: `benchmarks/tools/memplot.py` (500+ lines)
+  - Memory monitor enhanced: `benchmarks/tools/memmon.py` (memory pressure capture)
+  - Visual legend: Activity Monitor colors, memory pressure, test regions, model markers
+  - Documentation: Complete interpretation guide in `benchmarks/README.md`
+  - Schema learnings: Server test attribution problem + log-parsing solution documented
+  - File: `benchmarks/schemas/LEARNINGS-FOR-v1.0.md`
+
+### Fixed
+
+- **Server model switch log timing:** "Switched to model" now emitted only after successful load (past tense reflects completed action)
+  - File: `mlxk2/core/server_base.py:230`
+
+- **Unified model filter (Server + CLI):** Both `/v1/models` and `mlxk list` now use `build_model_object()` as single source of truth
+  - Filter: `healthy AND runtime_compatible` (no more code duplication)
+  - Framework gate: Non-MLX models (PyTorch, GGUF) now correctly marked `runtime_compatible=false`
+  - WebUI clients get consistent, runnable model lists
+  - Files: `mlxk2/core/server_base.py`, `mlxk2/output/human.py`, `mlxk2/operations/common.py`
+
+- **transformers 5.0 compatibility for vision models:** Removed `fix_mistral_regex` parameter from mlx-vlm load call
+  - transformers 5.0.0rc1 changed tokenizer initialization - `fix_mistral_regex` no longer accepted as kwarg
+  - Error was: `TypeError: _patch_mistral_regex() got multiple values for keyword argument 'fix_mistral_regex'`
+  - Removed deprecated parameter from vision model loading - all vision models now work with transformers 5.0
+  - File: `mlxk2/core/vision_runner.py:101`
+
+- **huggingface-hub 1.x compatibility:** Updated preflight test mocks for hub 1.x exception API changes
+  - Hub 1.x changed exception signatures: `GatedRepoError/RepositoryNotFoundError` now require `response` parameter
+  - Added `_create_mock_response()` helper to create proper httpx.Response objects for test mocks
+  - **Test-only changes** - preflight production code works unchanged with hub 0.x and 1.x
+  - **Result:** mlx-knife now fully compatible with mlx 0.30.x, mlx-lm 0.30.0, transformers 5.0, hub 1.x
+  - All 494 unit tests pass, vision models functional with newest dependencies
+  - Files: `tests_2.0/test_issue_30_preflight.py`, `mlxk2/core/vision_runner.py`
+
+- **EXIF GPS 0° coordinate handling:** Fixed truthiness checks in `VisionRunner._extract_exif` that incorrectly dropped valid GPS coordinates
+  - Equator (0° latitude) and Prime Meridian (0° longitude) now correctly preserved
+  - Changed latitude/longitude negation checks from `if lat` to `if lat is not None`
+  - Changed EXIF retention check from `not any([...])` to `all(x is None for x in [...])`
+  - Ensures 0.0 is treated as valid coordinate, not as missing data
+  - File: `mlxk2/core/vision_runner.py:259-262, 283`
+
+- **Framework/Type detection for non-mlx-community models (Issue #48):**
+  - `detect_framework()`: Now reads front-matter internally and checks config.json `quantization` key (MLX-specific)
+  - `detect_model_type()`: Added `probe` parameter and checks for `chat_template.json` file (reliable chat indicator)
+  - Removed redundant PR #42 code from server_base.py (cleaner architecture)
+  - Fixes: Locally converted quantized models (and similar non-mlx-community models) now correctly show "MLX" + "chat" instead of "PyTorch" + "base"
+  - Files: `mlxk2/operations/common.py:118-157, 180-208`, `mlxk2/core/server_base.py:114-120`
+
+- **Video model detection and exclusion:**
+  - Video models (require PyTorch/Torchvision) now excluded from vision capability detection
+  - mlx-vlm only supports image vision models, not video models
+  - Video indicators: `video_preprocessor_config.json`, `temporal_patch_size`, `AutoVideoProcessor`
+  - Video models fall back to mlx-lm for text-only (consistent with vision architecture)
+  - Example: `mlx-community/MiMo-VL-7B-RL-bf16` now classified as "chat" (not "chat+vision")
+  - Files: `mlxk2/operations/common.py:211-266`, `mlxk2/core/capabilities.py:169-238`
+
+### Documentation
+
+- **mlx-vlm beta.3 install guidance:** Recommend upstream commit `c4ea290e47e2155b67d94c708c662f8ab64e1b37` until mlx-vlm 0.3.10 is released
+  - Files: `README.md`, `docs/SERVER-HANDBOOK.md`
+
+## [2.0.4-beta.2] - 2025-12-16
+
+**PyPI-only release** - Fixes Git dependency issue for PyPI compatibility. Not tagged on GitHub.
+
+### Fixed
+
+- **PyPI compatibility:** Changed `mlx-vlm` dependency from Git URL to PyPI version `mlx-vlm>=0.3.9`
+  - PyPI does not allow Git dependencies
+  - mlx-vlm 0.3.9 is available on PyPI
+  - File: `pyproject.toml:69`
+
+### Documentation
+
+- **Installation instructions:** Added Vision-specific installation to README.md
+  - Clear separation: Text models (Python 3.9+) vs Vision models (Python 3.10+)
+  - Installation command: `pip install mlx-knife[vision]`
+  - Updated all version references from 2.0.4-beta.1 → 2.0.4-beta.2
+
+## [2.0.4-beta.1] - 2025-12-16
 
 **Focus:** Unix Pipe Integration + Vision Support + Memory-Aware Loading + Python 3.14
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 18e6e53..3d2a54f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -129,18 +129,15 @@ For detailed testing options, troubleshooting, and advanced workflows, see **[TE
 
 ### Before Submitting PRs
 
-Please ensure all tests pass locally:
-```bash
-# Complete test workflow
-ruff check mlxk2/ --fix         # Fix code style
-mypy mlxk2/                     # Check types
-pytest -v                       # Run all 2.0 tests
-```
+**All tests must pass:**
+- ✅ Code quality: `ruff check mlxk2/ --fix && mypy mlxk2/`
+- ✅ Unit tests: `pytest tests_2.0/ -v` (always required)
+- ✅ Live E2E tests: Required for model/inference changes
 
-Since we don't have CI/CD (MLX requires Apple Silicon), we rely on contributors to verify their changes locally. Please mention in your PR:
-- Which Python version you tested with
-- Which Mac model you tested on (M1/M2/M3)
-- Test results summary
+**PR requirements:**
+- State your Python version + Mac chip in PR description
+- For model/inference changes: Document which live tests you ran
+- **Important:** Unit tests alone are NOT sufficient - see **[TESTING.md](TESTING.md)** for why and how
 
 ## Python Version Requirements
 
diff --git a/README.md b/README.md
index 3c3862c..804ebb2 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@
   <img src="https://github.com/mzau/mlx-knife/raw/main/mlxk-demo.gif" alt="MLX Knife Demo" width="900">
 </p>
 
-**Current Version: 2.0.4-beta.2** (Stable: 2.0.3)
+**Current Version: 2.0.4-beta.3** (Stable: 2.0.3)
 
-[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.2-blue.svg)](https://github.com/mzau/mlx-knife/releases)
+[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.3-blue.svg)](https://github.com/mzau/mlx-knife/releases)
 [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0)
 [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
 [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-green.svg)](https://support.apple.com/en-us/HT211814)
@@ -20,7 +20,7 @@
 - **Model Information**: Detailed model metadata including quantization info
 - **Download Models**: Pull models from HuggingFace with progress tracking
 - **Run Models**: Native MLX execution with streaming and chat modes
-- **Vision Models**: Image analysis (Python 3.10+, alpha)
+- **Vision Models**: Image analysis (Python 3.10+, beta)
 - **Unix Pipes**: Chain models via stdin/stdout - no temp files (beta)
 - **Health Checks**: Verify model integrity and MLX runtime compatibility
 - **Cache Management**: Clean up and organize your model storage
@@ -67,7 +67,7 @@ This license applies **only** to the `mlx-knife` code and **does not extend** to
 MLX Knife has been comprehensively tested and verified on:
 
 ✅ **Python 3.9.6 - 3.14** - Text LLMs fully supported (mlx-lm 0.28.4+)
-✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+)
+✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
 
 **Note:** Vision features require Python 3.10+. Native macOS Python 3.9.6 users need to upgrade (e.g., via Homebrew).
 
@@ -85,12 +85,17 @@ pip install mlx-knife
 pip install mlx-knife[vision]
 
 # Verify installation
-mlxk --version  # → mlxk 2.0.3 (stable) or 2.0.4-beta.2 (dev)
+mlxk --version  # → mlxk 2.0.3 (stable) or 2.0.4-beta.3 (dev)
 ```
 
 **Python Requirements:**
 - **Text models:** Python 3.9-3.14
-- **Vision models:** Python 3.10-3.14 (requires `mlx-vlm>=0.3.9`)
+- **Vision models:** Python 3.10-3.14 (requires `mlx-vlm>=0.3.9`; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
+
+**Beta.3 note:** Until mlx-vlm 0.3.10 is released, install the recommended upstream commit before installing mlx-knife if you need the upstream bugfixes it contains:
+```bash
+pip install "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37"
+```
 
 ### Development Installation
 
@@ -106,7 +111,7 @@ pip install -e ".[dev,test]"
 pip install -e ".[dev,test,vision]"
 
 # Verify installation
-mlxk --version  # → mlxk 2.0.4-beta.2
+mlxk --version  # → mlxk 2.0.4-beta.3
 
 # Run tests and quality checks (before committing)
 pytest -v
@@ -182,6 +187,100 @@ open index.html
 | 🔒 `pipe mode` | **Beta feature** - Unix pipes with `mlxk run <model> - ...`; requires `MLXK2_ENABLE_PIPES=1` |
 
 
+## Multi-Modal Support
+
+MLX Knife supports multiple input modalities beyond text. All multi-modal features share a **common output pattern**: model responses are followed by collapsible metadata tables for transparency and traceability.
+
+### Vision (Beta)
+
+Image analysis via the `--image` flag (CLI and server). Requires Python 3.10+.
+
+#### Requirements
+
+- **Python 3.10+** (mlx-vlm dependency)
+- **Installation:** `pip install mlx-knife[vision]`
+- **Backend:** mlx-vlm 0.3.9+ from PyPI
+- **Beta.3 note:** For upstream bugfixes, install commit `c4ea290e47e2155b67d94c708c662f8ab64e1b37` before mlx-knife:
+  ```bash
+  pip install "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37"
+  pip install mlx-knife[vision]
+  ```
+
+#### Usage
+
+```bash
+# Image analysis with custom prompt
+mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" \
+  --image photo.jpg "Describe what you see in detail"
+
+# Multiple images (space-separated or glob)
+mlxk run vision-model --image img1.jpg img2.jpg img3.jpg "Compare these images"
+mlxk run vision-model --image photos/*.jpg "Which images show outdoor scenes?"
+
+# Auto-prompt (default: "Describe the image.")
+mlxk run vision-model --image cat.jpg
+
+# Text-only on vision model (no --image flag)
+mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" "What is 2+2?"
+```
+
+#### Metadata Output Format
+
+When processing images, MLX Knife automatically appends metadata in a **collapsible table** (collapsed by default):
+
+```
+A beach with palm trees and clear blue water.
+
+<details>
+<summary>📸 Image Metadata (2 images)</summary>
+
+| Image | Filename | Original | Location | Date | Camera |
+|-------|----------|----------|----------|------|--------|
+| 1 | image_abc123.jpeg | beach.jpg | 📍 32.79°N, 16.92°W | 📅 2023-12-06 12:19 | 📷 Apple iPhone SE |
+| 2 | image_def456.jpeg | mountain.jpg | 📍 32.87°N, 17.17°W | 📅 2023-12-10 15:42 | 📷 Apple iPhone SE |
+
+</details>
+```
+
+**Metadata includes:**
+- **Image ID** → **Filename mapping** (identify which description belongs to which file)
+- **GPS coordinates** (latitude/longitude, if available in EXIF)
+- **Capture date/time** (ISO 8601 format)
+- **Camera model** (device info)
+
+**Privacy control:**
+
+EXIF extraction is **enabled by default**. To disable (e.g., for privacy-sensitive images):
+
+```bash
+export MLXK2_EXIF_METADATA=0
+mlxk run vision-model --image photo.jpg "describe"
+```
+
+**Output is the same for CLI and server** - metadata tables work in terminals, web UIs (nChat), and can be parsed programmatically.
+
+#### Limitations
+
+- **Non-streaming:** Vision runs always use batch mode (no streaming output)
+- **Image limits:** 5 images max per request, 20 MB per image, 50 MB total
+
+#### Server API
+
+Vision models work with the OpenAI-compatible `/v1/chat/completions` endpoint using base64-encoded images:
+
+```bash
+curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "llama-vision",
+  "messages": [{
+    "role": "user",
+    "content": [
+      {"type": "text", "text": "What is in this image?"},
+      {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
+    ]
+  }]
+}'
+```
+
 
 ## JSON API
 
@@ -211,28 +310,6 @@ mlxk show "Phi-3-mini" --json | jq '.data.model'
 
 ### Examples
 
-#### Pipe mode (Alpha: set `MLXK2_ENABLE_PIPES=1`)
-
-```bash
-# Read prompt from stdin and append trailing text (auto batch in pipes)
-echo "from stdin" | MLXK2_ENABLE_PIPES=1 mlxk run "<model>" - "append extra context"
-
-# JSON interactive guard (no prompt) emits JSON error on stdout, exit!=0
-MLXK2_ENABLE_PIPES=1 mlxk run "<model>" --json
-
-# Pipe list JSON into run for summarization
-MLXK2_ENABLE_PIPES=1 mlxk list --json \
-  | MLXK2_ENABLE_PIPES=1 mlxk run "<model>" - "Summarize the model list as a concise table."
-
-# Shortcut wrapper (same semantics)
-MLXK2_ENABLE_PIPES=1 mlx-run "<model>" - "translate into german" < README.md
-```
-
-Notes:
-- Stdin requires `MLXK2_ENABLE_PIPES=1` (alpha gate). Without it, `-` is rejected.
-- When stdout is a pipe (non-TTY), streaming is disabled automatically to keep clean output.
-- Use full model IDs in place of `<model>`; HF_HOME should point to your cache for live runs.
-
 #### List Models
 ```bash
 mlxk list --json
@@ -656,7 +733,7 @@ mlxk health --json | jq '.data.summary'
 ```
 
 
-## Hidden Alpha Features: `clone`, `push`, and pipe mode
+## Feature Gates: `clone`, `push` (Alpha), `pipe mode` (Beta)
 
 ### `clone` - Model Workspace Creation
 
@@ -710,38 +787,31 @@ These features are not final and may change or be removed in future releases.
 
 Pipe mode is beta (feature complete) and requires `MLXK2_ENABLE_PIPES=1`. It lets `mlxk run` (and `mlx-run`) read stdin when you pass `-` as the prompt.
 
-- Gate: `MLXK2_ENABLE_PIPES=1` (will become default in a future stable release).
-- Auto-batch: When stdout is a pipe (non-TTY), streaming is disabled automatically for clean output.
-- Robust: Handles SIGPIPE and BrokenPipeError gracefully (`| head`, `| grep -m1` work correctly).
-- Scope: Applies to `mlxk run` and `mlx-run`; other commands unchanged.
+- **Status:** Beta (feature complete), API stable (syntax will not change)
+- **Gate:** `MLXK2_ENABLE_PIPES=1` (will become default in a future stable release)
+- **Auto-batch:** When stdout is a pipe (non-TTY), streaming is disabled automatically for clean output
+- **Robust:** Handles SIGPIPE and BrokenPipeError gracefully (`| head`, `| grep -m1` work correctly)
+- **Scope:** Applies to `mlxk run` and `mlx-run`; other commands unchanged
 - Usage examples (replace `<model>` with a cached MLX chat model):
 
 ```bash
 # stdin + trailing text (batch when piped)
 MLXK2_ENABLE_PIPES=1 echo "from stdin" | mlxk run "<model>" - "append extra context"
 
-# JSON interactive guard (no prompt) → JSON error on stdout, exit 1
-MLXK2_ENABLE_PIPES=1 mlxk run "<model>" --json
-
 # list → run summarization
 MLXK2_ENABLE_PIPES=1 mlxk list --json \
-  | MLXK2_ENABLE_PIPES=1 mlxk run "<model>" - "Summarize the model list as a concise table."
+  | MLXK2_ENABLE_PIPES=1 mlxk run "<model>" - "Summarize the model list as a concise table." >my-hf-table.md
 
 # Wrapper shorthand
 MLXK2_ENABLE_PIPES=1 mlx-run "<model>" - "translate into german" < README.md
+
+# Vision → Text chain: Photo tour review
+MLXK2_ENABLE_PIPES=1 mlxk run pixtral --image photos/*.jpg "Describe each picture" \
+  | MLXK2_ENABLE_PIPES=1 mlxk run qwen3 - \
+    "Write a tour review. Create a table with picture names, metadata, and descriptions." \
+  > tour-review.md
 ```
 
-Pipe mode API is stable.
-
-### `vision` - mlx-vlm (Python 3.10+, non-streaming)
-
-- Install extras: `pip install -e .[vision]` (requires `mlx-vlm>=0.3.9` from PyPI, Python 3.10+).
-- Backend: Uses `mlx-vlm` (vision); streaming is disabled for vision runs.
-- Usage:
-  - Text-only on a vision model: `mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" "what is 2+2"`
-  - Image + text: `mlxk run "<vision-model>" --image cat.jpg "describe the cat"`
-  - Image-only (auto prompt): `mlxk run "<vision-model>" --image cat.jpg`
-
 
 ## Testing
 
@@ -817,7 +887,7 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.
 
 <p align="center">
   <b>Made with ❤️ by The BROKE team <img src="broke-logo.png" alt="BROKE Logo" width="30" align="middle"></b><br>
-  <i>Version 2.0.4-beta.2 | December 2025</i><br>
+  <i>Version 2.0.4-beta.3 | December 2025</i><br>
   <a href="https://github.com/mzau/broke-nchat">💬 Web UI: nChat - lightweight chat interface</a> •
   <a href="https://github.com/mzau/broke-cluster">🔮 Multi-node: BROKE Cluster</a>
 </p>
diff --git a/TESTING-DETAILS.md b/TESTING-DETAILS.md
index d07fb2c..a2f5d7d 100644
--- a/TESTING-DETAILS.md
+++ b/TESTING-DETAILS.md
@@ -847,7 +847,7 @@ MLXK2_LIVE_PUSH=1 \
 
 ---
 
-### Complete Test File Structure (2.0.4-beta.1)
+### Complete Test File Structure (2.0.4-beta.3)
 
 ```
 tests_2.0/
@@ -885,7 +885,8 @@ tests_2.0/
 │   ├── test_server_e2e.py                      # Server E2E tests with TEXT models (ADR-011 + Portfolio Separation, parametrized: text_XX)
 │   ├── test_streaming_parity.py                # Streaming vs batch parity tests (Issue #20, ADR-011, parametrized)
 │   ├── test_vision_e2e_live.py                 # Vision CLI E2E tests with real models (ADR-012, 5 deterministic vision queries)
-│   └── test_vision_server_e2e.py               # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX)
+│   ├── test_vision_server_e2e.py               # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX)
+│   └── test_vm_stat_parsing.py                 # vm_stat output parsing validation (macOS memory metrics)
 ├── test_adr004_error_logging.py       # ADR-004 error logging and redaction (tokens, paths)
 ├── test_capabilities.py               # Probe/Policy architecture (ADR-012, ADR-016, Session 18-19, 45 tests)
 ├── test_cli_log_json_flag.py          # CLI --log-json flag behavior and JSON log format
diff --git a/TESTING.md b/TESTING.md
index 5d5903e..cb027c5 100644
--- a/TESTING.md
+++ b/TESTING.md
@@ -19,6 +19,17 @@ For current test counts, version-specific details, and complete file listings, s
 - Delete operations fail if not in test cache (`MLXK2_STRICT_TEST_DELETE=1`)
 - Live tests never modify user cache without explicit environment variables
 
+**Unit Test Limitations:**
+
+MLX Knife has two test categories:
+1. **Unit tests** (~500 tests, fast, mocked) - verify code structure
+2. **Live E2E tests** (real models, slow) - verify actual functionality
+
+**Why both are needed:**
+When dependencies like `transformers` or `mlx-lm` update their APIs, unit tests (which mock these libraries) continue to pass, but real model loading breaks. Only live E2E tests catch these issues.
+
+**Example:** transformers 5.0 changed tokenizer initialization - unit tests passed (mocked API), but vision models failed to load in production. Live E2E tests caught the issue immediately.
+
 ## Quick Start
 
 ```bash
diff --git a/benchmarks/README.md b/benchmarks/README.md
index e44417c..437ee63 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,68 +1,288 @@
 # MLX Knife Benchmarks
 
-**Status:** Phase 0 - Organic Data Collection
+**Status:** Phase 0 - Organic Data Collection (WIP)
 
-## Architecture
+## What's Here?
 
-This directory tracks empirical performance and compatibility data from mlx-knife's test suite.
+This directory contains benchmark infrastructure for mlx-knife:
+- Empirical performance and compatibility data from E2E tests
+- Tools for analysis and visualization
+- Schema definitions for structured reports
 
-### Phase 0 Goals (2.0.3+)
+## Directory Structure
+
+```
+benchmarks/
+├── reports/                    # JSONL test reports + Markdown analyses
+│   ├── 2025-12-20-v2.0.4b3.jsonl   # Raw data (one file per test run)
+│   └── BENCHMARK-v1.0-*.md         # Generated analysis reports
+├── schemas/                    # JSON Schema definitions
+│   ├── report-v0.1.0.schema.json   # Legacy schema
+│   ├── report-v0.2.0.schema.json   # Current schema
+│   └── report-current.schema.json  # Symlink → current schema
+├── tools/                      # Standalone tools
+│   ├── memmon.py                   # Memory monitor (background sampling)
+│   └── memplot.py                  # Memory timeline visualizer
+├── generate_benchmark_report.py    # Report generator (Template v1.0)
+├── validate_reports.py             # Schema validation
+├── README.md                       # ← You are here
+└── TESTING.md                      # Benchmark handbook (How-To)
+```
+
+## Tools
+
+| Tool | Purpose |
+|------|---------|
+| `generate_benchmark_report.py` | JSONL → Markdown report (Template v1.0) |
+| `validate_reports.py` | Schema validation of JSONL files |
+| `tools/memmon.py` | Memory monitoring during test runs |
+| `tools/memplot.py` | Interactive memory timeline visualization (HTML) |
+
+## Schema
+
+**Current:** v0.2.0 (Phase 0 - Test Infrastructure)
+
+| Version | Release | Content |
+|---------|---------|---------|
+| v0.1.0 | 2.0.3 | Minimal: test, outcome, duration, model |
+| v0.2.0 | 2.0.4 | + hardware_profile, system_health, quality_flags |
+| v1.0.0 | Future | Model benchmarks (mlxk-benchmark package) |
+
+**Schema Strategy:** No v0.3.x planned. v0.2.0 → v1.0.0 directly.
+- v0.x = Test infrastructure ("Was the test run clean?")
+- v1.x = Model benchmarks ("How good is the model?")
+
+See `schemas/LEARNINGS-FOR-v1.0.md` for details.
+
+## Current Baseline
+
+**Report:** `reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md`
+
+- Version: 2.0.4-beta.3
+- Hardware: Mac14,13 (M2 Max, 64 GB)
+- Tests: 141/162 passed, 19.5 min
+- Quality: 100% clean (0 MB swap, 0 zombies)
+
+## Phase 0 Goals
 
 1. **Collect data organically** from E2E tests
 2. **No perfect schema** - schema evolves with data
 3. **Git-tracked reports** - historical trends
-4. **Foundation for future** - community contributions, public database
+4. **Foundation for Phase 1** - mlxk-benchmark package
 
-### Directory Structure
+## Memory Timeline Visualization
 
-- `reports/` - JSONL test reports (one file per release)
-- `schemas/` - JSON Schema definitions (versioned)
+**Tool:** `tools/memplot.py` | **Created:** Session 45 (2025-12-21)
 
-### Current Schema
-
-**Version:** 0.2.0 (Phase 0 - Scheduling-Enhanced)
-
-- **v0.1.0** (2.0.3+): Minimal schema - basic performance metrics
-- **v0.2.0** (2.0.4+): Hardware profiling + detailed metrics for cluster scheduling
-  - `system.hardware_profile`: Mac model, cores, Metal version
-  - `performance.*_time_s`: model_load, time_to_first_token, cleanup
-  - `system_health`: swap, zombies, quality_flags
-  - Backward compatible: v0.1.0 reports still valid
-
-**Schema Files:**
-- `schemas/report-current.schema.json` → always points to latest version
-- `schemas/report-v0.2.schema.json` → current schema (2.0.4+)
-- `schemas/report-v0.1.schema.json` → legacy schema (2.0.3)
-
-**Required fields:**
-- `schema_version`, `timestamp`, `mlx_knife_version`, `test`, `outcome`
-
-**Optional sections:**
-- `model` - Model metadata
-- `performance` - tokens/sec, RAM usage
-- `stop_tokens` - ADR-009 validation data
-- `system` - Platform info
-- `metadata` - Extensible (anything)
-
-### Generating Reports
+### Quick Start
 
 ```bash
-# During E2E tests
-pytest -m live_e2e tests_2.0/live/ \
-  --report-output benchmarks/reports/$(date +%Y-%m-%d)-v$(mlxk --version | cut -d' ' -f2).jsonl
+# Collect data (memmon runs in background)
+python benchmarks/tools/memmon.py --output memory.jsonl -- \
+  pytest -m live_e2e tests_2.0/live/ --report-output benchmark.jsonl
+
+# Generate interactive HTML
+python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl -o timeline.html
 ```
 
-### Schema Evolution
+### Visual Legend
 
-As we collect more data, the schema will evolve:
-- New fields added (backward compatible)
-- Optional → Required (when stable)
-- Breaking changes documented in `schemas/MIGRATIONS.md`
+#### Main Graph: RAM Free (GB)
 
-### Future Phases
+**Blue line with colored markers:**
+- 🟢 **Green markers:** Healthy (≥32 GB free, ≥50% of 64 GB)
+- 🟠 **Orange markers:** Warning (16-32 GB free, 25-50%)
+- 🔴 **Red markers:** Critical (<16 GB free, <25%)
 
-- **Phase 1 (2.1+):** Schema formalization, validation tooling
-- **Phase 2 (2.2+):** `mlxk report` CLI for manual submissions
-- **Phase 3 (2.3+):** Public database, community contributions
+**Dashed threshold lines:**
+- **Green line (32 GB):** 50% threshold - system healthy
+- **Orange line (16 GB):** 25% threshold - warning level
 
-See `docs/ADR/ADR-013-Community-Model-Quality-Database.md` for full roadmap.
+#### Background Rectangles: Test Regions
+
+**Gray (rgba(200, 200, 200, 0.3)):**
+- Model tests that load an LLM model
+- Example: `test_run_command[text_00]`, `test_chat_completion[vision_01]`
+- **Meaning:** Model is loaded in RAM during this time
+
+**Light Blue (rgba(173, 216, 230, 0.2)):**
+- Infrastructure tests without model
+- Example: `test_portfolio_discovery`, `test_health_check`
+- **Meaning:** No model loaded, only test infrastructure active
+
+⚠️ **Known limitation (v0.2.0):** Server tests appear as "light blue" even when loading models (LocalServer fixture doesn't record model metadata). Recognizable by: high RAM usage + long duration in blue region. Example: `test_text_request_still_works_on_vision_model` (57 GB used, 16s duration).
+
+#### Memory Pressure Overlay
+
+**Yellow (rgba(255, 204, 0, 0.15)):**
+- macOS Memory Pressure: WARN
+- Source: `sysctl kern.memorystatus_vm_pressure_level = 2`
+
+**Red (rgba(255, 59, 48, 0.15)):**
+- macOS Memory Pressure: CRITICAL
+- Source: `sysctl kern.memorystatus_vm_pressure_level = 4`
+- **Meaning:** System begins swapping, performance degradation
+
+**White/Transparent:**
+- macOS Memory Pressure: NORMAL (level = 1)
+
+#### Labels
+
+**Top (90° rotated, black):**
+- Model names at each model switch
+- Example: `DeepHermes-3-Mistral`, `pixtral-12b-8bit`
+- Position: Left-aligned with test start
+
+**Bottom (90° rotated, gray):**
+- Test names for each test (model + infrastructure)
+- Example: `test_run_command`, `test_chat_completion`
+- Position: Left-aligned with test start
+
+**Vertical helper lines:**
+- Thin gray lines at each test start
+- Help correlate labels with timeline
+
+#### Secondary Y-Axis: Swap Used (MB)
+
+**Red line (right axis):**
+- Only visible when swap > 0 MB
+- **Meaning:** System paging RAM to SSD → performance loss
+- **Normal:** 0 MB
+- **Problematic:** >100 MB
+
+### Interpretation Patterns
+
+**Typical model load:**
+```
+Pattern: RAM Free drops suddenly (e.g., 52 GB → 28 GB)
+Duration: 2-5 seconds
+Color: Gray rectangle begins
+Label: Model name appears at top
+→ Model loaded into RAM (24 GB)
+```
+
+**Typical model unload:**
+```
+Pattern: RAM Free rises suddenly (e.g., 28 GB → 52 GB)
+Duration: <1 second
+Color: Gray rectangle ends (or switches to next)
+Label: New model name (or none)
+→ Model removed from RAM
+```
+
+**Memory pressure without swap:**
+```
+Pattern: Yellow/Red background WITHOUT swap line
+RAM Free: Still >10 GB
+→ macOS preparing to swap, not yet active
+→ Often during large model loads (temporary)
+```
+
+**Memory pressure with swap:**
+```
+Pattern: Red background + Red swap line rises
+RAM Free: <10 GB
+Swap: >100 MB
+→ System actually at limit
+→ Performance significantly worse
+→ Typical: Multiple large models in short time
+```
+
+**Infrastructure test with high RAM usage:**
+```
+Pattern: Light blue rectangle + RAM drops significantly (>20 GB)
+Duration: >10 seconds
+Example: 57 GB used in test_text_request_still_works_on_vision_model
+→ ⚠️ Schema bug: Server test loads model but "model": null
+→ Should be gray, not light blue
+→ Fix: v1.0 schema with log parsing
+```
+
+### Data Sources
+
+**RAM Free:**
+- Source: `vm_stat` (macOS native)
+- Calculation: `(free + inactive + purgeable + speculative) * page_size / 1e9`
+- Sample rate: 500ms (2 samples/second)
+
+**Memory Pressure:**
+- Source: `sysctl kern.memorystatus_vm_pressure_level`
+- Values: 1=NORMAL, 2=WARN, 4=CRITICAL
+- Sample rate: 500ms (synchronized with RAM)
+
+**Swap Used:**
+- Source: `sysctl vm.swapusage`
+- Unit: MB
+- Sample rate: 500ms
+
+**Test Metadata:**
+- Source: Benchmark JSONL (pytest-json-report format)
+- Fields: `timestamp`, `duration`, `test`, `model` (optional), `outcome`
+- Correlation: ISO timestamp → Unix timestamp → elapsed seconds
+
+### Known Limitations (v0.2.0)
+
+1. **Model load/unload events missing**
+   - Gray regions show "test with model", not "model is loaded"
+   - Pytest runs through ALL models 4x → each model loaded/unloaded 4x
+   - Regions overlap visually even though the tests run sequentially
+   - **Fix planned:** v1.0 schema with explicit events
+
+2. **Server tests without model attribution**
+   - Server tests (LocalServer fixture) load models internally
+   - Appear as "infrastructure" (light blue) instead of "model" (gray)
+   - Recognizable: High RAM + long duration in blue region
+   - **Fix planned:** Log parsing in v1.0 schema (no v0.3.x is planned)
+
+3. **Dense test sequences**
+   - Tests shorter than the 500 ms sampling interval → no coloring
+   - Typical: Fast infrastructure tests (<100ms)
+   - **Workaround:** Test labels show all tests
+
+4. **Label overlap**
+   - Many tests in short time (>10 tests/min)
+   - Labels may overlap (90° rotated)
+   - **Mitigation:** Zoom for detailed view
+   - **Future:** Adaptive label density or collapsing
+
+### Interactive Features
+
+- **Zoom & Pan:** Mouse wheel (vertical), Shift+wheel (horizontal), click+drag
+- **Range Slider:** Quick navigation in long (>20 min) timelines
+- **Hover:** X-axis unified mode shows all values at same time
+
+### Future Extensions (Ideas)
+
+**For plot:**
+- [ ] Embedded legend in plot (not external file)
+- [ ] Toggle show/hide infrastructure tests
+- [ ] Hover shows full test names (not truncated)
+- [ ] Color-blind mode (alternative palette)
+
+**For schema v1.0:**
+- [ ] Model load/unload events → precise "in RAM" regions
+- [ ] Log parsing for server tests → correct attribution
+- [ ] GPU activity (Metal performance)
+- [ ] Net T/S (tokens/second, pure inference)
+
+**For analysis:**
+- [ ] Automatic anomaly detection (memory leaks, zombies)
+- [ ] Per-model memory profiling (min/max/avg RAM)
+- [ ] Scheduling optimization (avoid model-switch overlap)
+
+---
+
+## Roadmap
+
+| Phase | Release | Description |
+|-------|---------|-------------|
+| **Phase 0** | 2.0.3-2.0.4 | Organic Data Collection ✅ |
+| Phase 1 | 2.1+ | `mlxk-benchmark` package (separate tool) |
+| Phase 2 | 2.2+ | Report aggregation, hardware correlation |
+| Phase 3 | 2.3+ | Public database, community contributions |
+
+## Further Documentation
+
+- **[TESTING.md](TESTING.md)** - Benchmark handbook (How-To)
+- **[schemas/LEARNINGS-FOR-v1.0.md](schemas/LEARNINGS-FOR-v1.0.md)** - Learnings for Phase 1
+- **[docs/ADR/ADR-013-Community-Model-Quality-Database.md](../docs/ADR/ADR-013-Community-Model-Quality-Database.md)** - Architecture vision
diff --git a/benchmarks/TESTING.md b/benchmarks/TESTING.md
index 59e44f2..6d8d063 100644
--- a/benchmarks/TESTING.md
+++ b/benchmarks/TESTING.md
@@ -1,155 +1,263 @@
-# Testing with Benchmark Reports (ADR-013 Phase 0)
+# Benchmark Handbook
 
-This document explains how to generate benchmark reports during E2E tests.
+Step-by-step guide for running benchmarks and generating reports.
+
+## Quick Start
+
+```bash
+# 1. Run E2E tests with report output
+pytest -m live_e2e tests_2.0/live/ \
+  --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.4b3.jsonl
+
+# 2. Generate analysis report
+python benchmarks/generate_benchmark_report.py
+
+# 3. View results
+cat benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-*.md
+```
+
+---
+
+## Running Benchmarks
+
+### Basic Test Run
+
+```bash
+# Run all E2E tests, output to JSONL
+pytest -m live_e2e tests_2.0/live/ \
+  --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.4b3.jsonl
+```
+
+### With Custom HuggingFace Cache
+
+```bash
+HF_HOME=/path/to/huggingface/cache \
+  pytest -m live_e2e tests_2.0/live/ -v \
+  --report-output benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+### With Memory Monitoring
+
+```bash
+# Run memmon in parallel to capture memory profile
+python benchmarks/tools/memmon.py \
+  --output benchmarks/reports/2025-12-20-memory.jsonl \
+  -- pytest -m live_e2e tests_2.0/live/ \
+     --report-output benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+---
 
 ## Generating Reports
 
-### Basic Usage
+### Auto-Detect Latest JSONL
 
 ```bash
-# Run E2E tests with reporting
-pytest -m live_e2e tests_2.0/live/ \
-  --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.3.jsonl
+python benchmarks/generate_benchmark_report.py
+# → Finds the most recently modified .jsonl in benchmarks/reports/
+# → Outputs: BENCHMARK-v1.0-<version>-<date>.md
 ```
 
-### With Full Environment
+### Explicit Input File
 
 ```bash
-# Use specific HF cache + generate reports
-HF_HOME=/Volumes/mz-SSD/huggingface/cache \
-  pytest -m live_e2e tests_2.0/live/ -v \
-  --report-output benchmarks/reports/2025-11-16-v2.0.3.jsonl
+python benchmarks/generate_benchmark_report.py \
+  benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
 ```
 
-## Adding Report Data to Tests
+### With Comparison (Regression Detection)
 
-Tests can add structured data to reports using `request.node.user_properties`:
+```bash
+python benchmarks/generate_benchmark_report.py \
+  benchmarks/reports/2025-12-20-new.jsonl \
+  --compare benchmarks/reports/2025-12-19-old.jsonl
+```
+
+Output includes:
+- Duration change (e.g., 20.5 min → 19.7 min, -3.8%)
+- Per-model changes with Old/Δ/Change columns
+- Per-test median time changes
+- Per-model status indicators: ⚠️ (>5% slower), ✅ (>1% faster); the overall duration line shows ⚠️ above +3%
+
+### Custom Output Location
+
+```bash
+python benchmarks/generate_benchmark_report.py \
+  --output /tmp/my-report.md \
+  benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+---
+
+## Memory Monitoring
+
+### Standalone Monitor (Fixed Duration)
+
+```bash
+python benchmarks/tools/memmon.py \
+  --duration 60 \
+  --interval 200 \
+  --output memory.jsonl
+```
+
+### Wrap Any Command
+
+```bash
+python benchmarks/tools/memmon.py \
+  --output memory.jsonl \
+  -- ./my-benchmark-script.sh
+```
+
+### Output Format
+
+```jsonl
+{"ts": 1734567890.1, "ram_free_gb": 45.2, "swap_used_mb": 0, "elapsed_s": 0.2}
+{"ts": 1734567890.3, "ram_free_gb": 42.1, "swap_used_mb": 0, "elapsed_s": 0.4}
+...
+{"summary": {"ram_free_min_gb": 21.3, "ram_free_max_gb": 45.2, "swap_max_mb": 0}}
+```
+
+### Correlating with Test Results
+
+Memory samples can be correlated with test results via timestamps:
 
 ```python
-def test_example(model_info, request):
-    # ... test logic ...
+# Test entry has: timestamp (ISO 8601 end time), duration (seconds)
+# Calculate: started_at = timestamp - duration (parse_iso must return epoch seconds to match sample "ts")
 
-    # Add model info
-    request.node.user_properties.append(("model", {
-        "id": model_info["id"],
-        "size_gb": model_info["ram_needed_gb"],
-        "family": extract_family(model_info["id"]),
-        "variant": extract_variant(model_info["id"])
-    }))
+test_start = parse_iso(entry["timestamp"]) - entry["duration"]
+test_end = parse_iso(entry["timestamp"])
 
-    # Add performance metrics
-    request.node.user_properties.append(("performance", {
-        "tokens_per_sec": measure_tokens_per_sec(response),
-        "ram_peak_mb": get_peak_ram_usage(),
-        "duration_s": response.elapsed
-    }))
-
-    # Add stop token data (ADR-009)
-    request.node.user_properties.append(("stop_tokens", {
-        "configured": model_stop_tokens,
-        "detected": find_stop_tokens_in_response(response),
-        "workaround": get_workaround_name(model_info["id"]),
-        "leaked": check_for_leaked_tokens(response)
-    }))
-
-    # Add system info (optional)
-    request.node.user_properties.append(("system", {
-        "platform": platform.system().lower(),
-        "platform_version": get_os_version(),
-        "python_version": platform.python_version(),
-        "mlx_version": get_mlx_version(),
-        "hardware": get_hardware_model(),
-        "ram_total_gb": get_total_ram_gb()
-    }))
-
-    # Anything else goes to metadata
-    request.node.user_properties.append(("custom_metric", "value"))
+# Find matching memory samples
+matching = [s for s in samples if test_start <= s["ts"] <= test_end]
 ```
 
-## Structured Sections
+---
 
-Reports have predefined structured sections that map to schema fields:
+## Validating Reports
 
-| user_properties key | Maps to report field | Description |
-|---------------------|----------------------|-------------|
-| `model` | `model` object | Model metadata (id, size, family, variant) |
-| `performance` | `performance` object | Performance metrics (tokens/sec, RAM, duration) |
-| `stop_tokens` | `stop_tokens` object | Stop token behavior (ADR-009 validation) |
-| `system` | `system` object | Platform information (OS, Python, MLX, hardware) |
-| _anything else_ | `metadata` object | Extensible catch-all for experiments |
-
-## Schema Validation
+### Validate Against Current Schema
 
 ```bash
-# Validate reports against schema (requires jsonschema)
-pip install jsonschema
-
-# Validate all reports
-for report in benchmarks/reports/*.jsonl; do
-  echo "Validating $report..."
-  cat "$report" | while read line; do
-    echo "$line" | python3 -c "
-import sys, json
-from jsonschema import validate
-
-with open('benchmarks/schemas/report-v0.1.schema.json') as f:
-    schema = json.load(f)
-
-report = json.load(sys.stdin)
-validate(instance=report, schema=schema)
-print('✓ Valid')
-"
-  done
-done
+python benchmarks/validate_reports.py benchmarks/reports/*.jsonl
 ```
 
-## Example Report
+### Validate Specific File
 
+```bash
+python benchmarks/validate_reports.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+```
+
+---
+
+## Schema Reference
+
+### Current Schema: v0.2.0
+
+Required fields:
 ```json
 {
-  "schema_version": "0.1.0",
-  "timestamp": "2025-11-16T10:30:00Z",
-  "mlx_knife_version": "2.0.3",
-  "test": "tests_2.0/live/test_stop_tokens_live.py::test_stop_tokens[phi-3-mini]",
+  "schema_version": "0.2.0",
+  "timestamp": "2025-12-20T02:26:10.722510+00:00",
+  "mlx_knife_version": "2.0.4-beta.3",
+  "test": "tests_2.0/live/test_cli_e2e.py::test_run_command[discovered_00]",
   "outcome": "passed",
-  "duration": 12.3,
+  "duration": 12.3
+}
+```
+
+Optional sections:
+```json
+{
   "model": {
-    "id": "mlx-community/phi-3-mini-4k-instruct",
-    "size_gb": 2.8,
-    "family": "phi-3",
-    "variant": "mini-4k-instruct"
+    "id": "mlx-community/Qwen3-32B-4bit",
+    "size_gb": 17.2,
+    "family": "qwen3"
   },
-  "performance": {
-    "tokens_per_sec": 45.2,
-    "ram_peak_mb": 3200,
-    "prompt_tokens": 15,
-    "completion_tokens": 42
+  "system": {
+    "hardware_profile": {
+      "model": "Mac14,13",
+      "cores_physical": 12
+    }
   },
-  "stop_tokens": {
-    "configured": ["<|end|>", "<|endoftext|>"],
-    "detected": ["<|end|>"],
-    "workaround": "phi-3-dual-eos",
-    "leaked": false
+  "system_health": {
+    "swap_used_mb": 0,
+    "ram_free_gb": 45.2,
+    "zombie_processes": 0,
+    "quality_flags": ["clean"]
   }
 }
 ```
 
-## Analyzing Reports
+### Quality Flags
 
-See `reports/README.md` for analysis examples (jq queries, statistics, trends).
+| Flag | Meaning | Threshold |
+|------|---------|-----------|
+| `clean` | Test ran without issues | swap=0, zombies=0 |
+| `degraded_swap` | Memory pressure detected | swap > 100 MB |
+| `degraded_zombies` | Zombie processes present | zombies > 0 |
+
+---
 
 ## Best Practices
 
 1. **File Naming:** Use `YYYY-MM-DD-vX.Y.Z.jsonl` format
 2. **Append Only:** Never edit existing reports (historical data)
 3. **Commit Reports:** Reports are git-tracked for trend analysis
-4. **Schema Version:** Always include `schema_version` for evolution tracking
-5. **Optional Data:** Only add what you can measure reliably
-6. **No PII:** Never include personal information in reports
+4. **Clean State:** Reboot before important benchmark runs
+5. **Close Apps:** Minimize background processes during tests
+6. **Multiple Runs:** Run 2-3 times, compare for consistency
 
-## Future Enhancements (Phase 1+)
+---
 
-- Automatic validation during `pytest --report-output`
-- Performance regression detection
-- Report comparison tools (`mlxk report diff`)
-- Schema migration utilities
+## Troubleshooting
+
+### "No JSONL files found"
+
+```bash
+# Check if reports exist
+ls -la benchmarks/reports/*.jsonl
+
+# Run tests with output
+pytest -m live_e2e tests_2.0/live/ --report-output benchmarks/reports/test.jsonl
+```
+
+### Schema Validation Fails
+
+```bash
+# Check schema version in file
+head -1 benchmarks/reports/file.jsonl | jq .schema_version
+
+# Validate manually
+python -c "
+import json
+from jsonschema import validate
+with open('benchmarks/schemas/report-current.schema.json') as f:
+    schema = json.load(f)
+with open('benchmarks/reports/file.jsonl') as f:
+    for line in f:
+        validate(json.loads(line), schema)
+print('OK')
+"
+```
+
+### Comparison Shows "N/A"
+
+Model not found in comparison file. Check:
+- Same models tested in both runs?
+- Model ID spelling matches exactly?
+
+---
+
+## Future: Phase 1 (mlxk-benchmark)
+
+Phase 1 will introduce a standalone benchmark package:
+
+```bash
+pip install mlxk-benchmark
+mlxk-benchmark --model llama-3.2-3b --contribute
+```
+
+No pytest, no fixtures, no conftest.py - just a simple CLI for community contributions.
+
+See `schemas/LEARNINGS-FOR-v1.0.md` for design notes.
diff --git a/benchmarks/generate_benchmark_report.py b/benchmarks/generate_benchmark_report.py
new file mode 100644
index 0000000..e49d79f
--- /dev/null
+++ b/benchmarks/generate_benchmark_report.py
@@ -0,0 +1,575 @@
+#!/usr/bin/env python3
+"""Generate benchmark analysis report from JSONL test data.
+
+Reads JSONL benchmark reports and generates structured Markdown analysis.
+
+Usage:
+    # Auto-detect latest JSONL
+    python benchmarks/generate_benchmark_report.py
+
+    # Explicit file
+    python benchmarks/generate_benchmark_report.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
+
+    # With comparison
+    python benchmarks/generate_benchmark_report.py new.jsonl --compare old.jsonl
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+try:
+    import jsonschema
+except ImportError:
+    print("Error: jsonschema not installed. Install with: pip install jsonschema")
+    sys.exit(1)
+
+
+# Template version
+TEMPLATE_VERSION = "1.0"
+REPORTS_DIR = Path("benchmarks/reports")
+SCHEMA_PATH = Path("benchmarks/schemas/report-current.schema.json")
+
+
+def load_schema() -> dict:
+    """Load the current report JSON schema from SCHEMA_PATH; print an error and exit with status 1 if it is missing."""
+    if not SCHEMA_PATH.exists():
+        print(f"❌ Schema not found: {SCHEMA_PATH}")
+        sys.exit(1)
+
+    # Decode as UTF-8 explicitly instead of relying on the locale's default encoding.
+    return json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))
+
+
+def validate_jsonl(data: List[dict], schema: dict, filepath: Path) -> bool:
+    """Check every entry against the schema; print the first failures and return True only if all entries pass."""
+    problems = []
+    for entry_no, record in enumerate(data, start=1):
+        try:
+            jsonschema.validate(instance=record, schema=schema)
+        except jsonschema.ValidationError as exc:
+            problems.append(f"Line {entry_no}: {exc.message}")
+
+    if not problems:
+        return True
+
+    # Show at most five messages, then summarise how many were left out.
+    print(f"❌ Validation failed for {filepath}")
+    print("\n".join(f"   {msg}" for msg in problems[:5]))
+    hidden = len(problems) - 5
+    if hidden > 0:
+        print(f"   ... and {hidden} more errors")
+    return False
+
+
+def load_jsonl(filepath: Path) -> List[dict]:
+    """Parse a JSON Lines file into a list of entries, skipping blank lines.
+
+    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
+    """
+    # Decode as UTF-8 explicitly instead of relying on the locale's default encoding.
+    with open(filepath, encoding="utf-8") as f:
+        stripped = (line.strip() for line in f)
+        return [json.loads(line) for line in stripped if line]
+
+
+def find_latest_jsonl() -> Optional[Path]:
+    """Return the most recently modified *.jsonl file in REPORTS_DIR, or None if the directory or files are missing."""
+    if not REPORTS_DIR.exists():
+        return None
+
+    candidates = list(REPORTS_DIR.glob("*.jsonl"))
+    return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None
+
+
+def extract_version_from_filename(filepath: Path) -> Optional[str]:
+    """Return the stem text between the first '-v' and the next '-' (e.g. '2.0.4b3'), or None if the stem has no '-v'."""
+    _, marker, tail = filepath.stem.partition("-v")
+    return tail.split("-")[0] if marker else None
+
+
+def calculate_statistics(data: List[dict]) -> Dict:
+    """Aggregate parsed JSONL entries into the statistics dict used by the report.
+
+    Per-model ("models") and per-test ("tests") statistics only consider passed
+    entries that carry a "model" section; "system_health" data is optional.
+    """
+    # Separate by outcome
+    passed_tests = [e for e in data if e.get("outcome") == "passed"]
+    skipped_tests = [e for e in data if e.get("outcome") == "skipped"]
+    passed_with_model = [e for e in passed_tests if "model" in e]
+    passed_without_model = [e for e in passed_tests if "model" not in e]
+
+    # System health metrics (optional for backward compatibility with older schemas)
+    health = [e["system_health"] for e in data if "system_health" in e]
+    swap_values = [h.get("swap_used_mb", 0) for h in health]
+    ram_values = [h.get("ram_free_gb", 0) for h in health]
+    zombie_values = [h.get("zombie_processes", 0) for h in health]
+    quality_flags = [h.get("quality_flags", ["unknown"]) for h in health]
+
+    clean_count = sum(1 for flags in quality_flags if flags == ["clean"])
+    degraded_swap = sum(1 for flags in quality_flags if "degraded_swap" in flags)
+    degraded_zombies = sum(1 for flags in quality_flags if "degraded_zombies" in flags)
+
+    # Per-model statistics
+    model_stats = {}
+    for entry in passed_with_model:
+        model_id = entry["model"]["id"]
+        if model_id not in model_stats:
+            model_stats[model_id] = {
+                "id": model_id,
+                "size_gb": entry["model"]["size_gb"],
+                "count": 0,
+                "total_time": 0,
+                "ram_min": float("inf"),
+                "ram_max": 0,
+                "swap_max": 0,
+                "zombies_max": 0,
+            }
+
+        stats = model_stats[model_id]
+        stats["count"] += 1
+        stats["total_time"] += entry["duration"]
+        # Handle optional system_health (backward compatibility)
+        if "system_health" in entry:
+            stats["ram_min"] = min(stats["ram_min"], entry["system_health"].get("ram_free_gb", 0))
+            stats["ram_max"] = max(stats["ram_max"], entry["system_health"].get("ram_free_gb", 0))
+            stats["swap_max"] = max(stats["swap_max"], entry["system_health"].get("swap_used_mb", 0))
+            stats["zombies_max"] = max(stats["zombies_max"], entry["system_health"].get("zombie_processes", 0))
+
+    # No system_health samples: replace the +inf sentinel with 0 so the report never prints "inf"
+    for stats in model_stats.values():
+        if stats["ram_min"] == float("inf"):
+            stats["ram_min"] = 0
+
+    # Per-test statistics
+    import statistics
+    test_stats = {}
+    for entry in passed_with_model:
+        # Extract test function name and normalize (remove parametrization)
+        test_full = entry["test"].split("::")[-1]
+        test_name = test_full.split("[")[0]  # Remove [discovered_XX] part
+        model_id = entry["model"]["id"]
+        model_short = model_id.replace("mlx-community/", "").split("-")[0]  # Short name
+
+        if test_name not in test_stats:
+            test_stats[test_name] = {
+                "name": test_name,
+                "models": set(),
+                "runs": [],
+            }
+
+        test_stats[test_name]["models"].add(model_id)
+        test_stats[test_name]["runs"].append({
+            "model": model_id,
+            "model_short": model_short,
+            "duration": entry["duration"],
+        })
+
+    # Calculate aggregates per test
+    for stats in test_stats.values():
+        durations = [r["duration"] for r in stats["runs"]]
+        stats["model_count"] = len(stats["models"])
+        stats["median_time"] = statistics.median(durations) if durations else 0
+
+        # Find fastest and slowest
+        sorted_runs = sorted(stats["runs"], key=lambda r: r["duration"])
+        stats["fastest"] = sorted_runs[0] if sorted_runs else None
+        stats["slowest"] = sorted_runs[-1] if sorted_runs else None
+
+        # Convert set to a sorted list: JSON-serializable and deterministic order
+        stats["models"] = sorted(stats["models"])
+
+    # Hardware profile: first entry that carries one (optional for backward compatibility)
+    profiles = (e["system"]["hardware_profile"] for e in data if "hardware_profile" in e.get("system", {}))
+    hw_profile = next(profiles, {})
+
+    return {
+        "total_tests": len(data),
+        "passed": len(passed_tests),
+        "passed_with_model": len(passed_with_model),
+        "passed_infrastructure": len(passed_without_model),
+        "skipped": len(skipped_tests),
+        "total_duration": sum(e["duration"] for e in passed_tests),
+        "schema_version": data[0]["schema_version"] if data else "unknown",
+        "mlx_knife_version": data[0]["mlx_knife_version"] if data else "unknown",
+        "swap": {
+            "min": min(swap_values) if swap_values else 0,
+            "max": max(swap_values) if swap_values else 0,
+            "avg": sum(swap_values) / len(swap_values) if swap_values else 0,
+        },
+        "ram": {
+            "min": min(ram_values) if ram_values else 0,
+            "max": max(ram_values) if ram_values else 0,
+            "avg": sum(ram_values) / len(ram_values) if ram_values else 0,
+        },
+        "zombies": {
+            "min": min(zombie_values) if zombie_values else 0,
+            "max": max(zombie_values) if zombie_values else 0,
+        },
+        "quality": {
+            "clean": clean_count,
+            "degraded_swap": degraded_swap,
+            "degraded_zombies": degraded_zombies,
+            "clean_percent": 100 * clean_count / len(data) if data else 0,
+        },
+        "hardware": hw_profile,
+        "models": model_stats,
+        "tests": test_stats,
+    }
+
+
+def generate_markdown(stats: Dict, input_file: Path, compare_file: Optional[Path] = None, compare_stats: Optional[Dict] = None) -> str:
+    """Render the Markdown report for one benchmark run.
+
+    `stats` (and `compare_stats`, if given) are dicts as returned by
+    calculate_statistics(); `input_file`/`compare_file` are only used for
+    labels. With `compare_stats`, Old/Δ/Change columns are added.
+    """
+    version = stats["mlx_knife_version"]
+    date = input_file.stem.split("-v")[0]  # Text before the first "-v" (the date for YYYY-MM-DD-v... names)
+    now = datetime.now(timezone.utc).isoformat()
+
+    # Header
+    md = f"""# Benchmark Report v{TEMPLATE_VERSION}: {version}
+
+**Date:** {date}
+**Generated:** {now}
+**Generator:** generate_benchmark_report.py v{TEMPLATE_VERSION}
+**Hardware:** {stats['hardware'].get('model', 'unknown')}, {stats['hardware'].get('cores_physical', '?')} cores
+
+---
+
+## Input Files
+
+- **Primary:** `{input_file}`
+- **Schema:** v{stats['schema_version']}
+"""
+
+    if compare_file:
+        md += f"- **Comparison:** `{compare_file}`\n"
+
+    md += "\n---\n\n"
+
+    # Executive Summary
+    md += "## Executive Summary\n\n"
+    md += f"**Tests:** {stats['total_tests']} total ({stats['passed']} passed, {stats['skipped']} skipped)\n"
+    md += f"**Duration:** {stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f} min)\n"
+    md += f"**Quality:** {stats['quality']['clean_percent']:.1f}% clean ({stats['quality']['clean']}/{stats['total_tests']})\n"
+    md += f"**Models:** {len(stats['models'])} tested\n\n"
+
+    # Comparison Summary
+    if compare_stats:
+        old_duration = compare_stats['total_duration']
+        new_duration = stats['total_duration']
+        duration_delta = new_duration - old_duration
+        duration_pct = (duration_delta / old_duration * 100) if old_duration > 0 else 0
+
+        # Count models by change direction
+        compare_models_dict = {m['id']: m for m in compare_stats['models'].values()}
+        slower_count = 0
+        faster_count = 0
+        for model in stats['models'].values():
+            old_model = compare_models_dict.get(model['id'])
+            if old_model:
+                if model['total_time'] > old_model['total_time']:
+                    slower_count += 1
+                elif model['total_time'] < old_model['total_time']:
+                    faster_count += 1
+        # Models whose total time is unchanged count as neither slower nor faster
+        total_compared = slower_count + faster_count
+        change_icon = "⚠️" if duration_pct > 3 else "✅" if duration_pct < -1 else "➡️"  # NOTE(review): 3% here, 5% in the per-model table
+
+        md += "### Comparison\n\n"
+        md += f"**vs:** `{compare_file.name}`\n"
+        md += f"**Duration:** {old_duration/60:.1f} min → {new_duration/60:.1f} min ({duration_pct:+.1f}%) {change_icon}\n"
+        if total_compared > 0:
+            md += f"**Models:** {slower_count}/{total_compared} slower ({100*slower_count/total_compared:.0f}%), {faster_count}/{total_compared} faster ({100*faster_count/total_compared:.0f}%)\n"
+        md += "\n"
+
+    # Validation Status
+    quality_icon = "✅" if stats['quality']['clean_percent'] == 100 else "⚠️"
+    md += f"{quality_icon} **System Health:** "
+    if stats['quality']['clean_percent'] == 100:
+        md += "All tests clean (0 MB swap, 0 zombies)\n"
+    else:
+        md += f"{stats['quality']['degraded_swap']} degraded (swap), {stats['quality']['degraded_zombies']} degraded (zombies)\n"
+
+    md += "\n---\n\n"
+
+    # Test Summary
+    md += "## Test Summary\n\n"
+    md += f"""```
+Total tests:       {stats['total_tests']}
+Passed:            {stats['passed']}
+  With model:      {stats['passed_with_model']}
+  Infrastructure:  {stats['passed_infrastructure']}
+Skipped:           {stats['skipped']}
+Duration:          {stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f} min)
+```
+
+---
+
+## System Health
+
+"""
+    md += f"""```
+Swap (MB):         min={stats['swap']['min']}, max={stats['swap']['max']}, avg={stats['swap']['avg']:.1f}
+RAM free (GB):     min={stats['ram']['min']:.1f}, max={stats['ram']['max']:.1f}, avg={stats['ram']['avg']:.1f}
+Zombies:           min={stats['zombies']['min']}, max={stats['zombies']['max']}
+
+Quality Flags:
+  Clean:           {stats['quality']['clean']}/{stats['total_tests']} ({stats['quality']['clean_percent']:.1f}%)
+  Degraded (swap): {stats['quality']['degraded_swap']}
+  Degraded (zombies): {stats['quality']['degraded_zombies']}
+```
+
+---
+
+## Per-Model Statistics
+
+"""
+    # Sort models by total time (descending), or by change if comparing
+    sorted_models = sorted(stats['models'].values(), key=lambda m: m['total_time'], reverse=True)
+
+    # Build comparison lookup if available
+    compare_models = {}
+    if compare_stats:
+        compare_models = {m['id']: m for m in compare_stats['models'].values()}
+        # Re-sort by change percentage (biggest regression first)
+        def get_change_pct(model):
+            old = compare_models.get(model['id'])
+            if old and old['total_time'] > 0:
+                return (model['total_time'] - old['total_time']) / old['total_time'] * 100
+            return 0
+        sorted_models = sorted(stats['models'].values(), key=get_change_pct, reverse=True)
+
+    if compare_stats:
+        md += f"""```
+{'Model':<42} {'Size':<7} {'Tests':<5} {'Time':<8} {'Old':<8} {'Δ':<8} {'Change':<10} {'RAM (GB)':<12}
+{'='*42} {'='*7} {'='*5} {'='*8} {'='*8} {'='*8} {'='*10} {'='*12}
+"""
+    else:
+        md += f"""```
+{'Model':<50} {'Size':<8} {'Tests':<6} {'Time':<10} {'RAM (GB)':<20}
+{'='*50} {'='*8} {'='*6} {'='*10} {'='*20}
+"""
+
+    for model in sorted_models:
+        # Shorten model ID (remove mlx-community/ prefix)
+        model_short = model['id'].replace('mlx-community/', '')
+        max_len = 40 if compare_stats else 48
+        if len(model_short) > max_len:
+            model_short = model_short[:max_len-3] + "..."
+
+        ram_range = f"{model['ram_min']:.1f}-{model['ram_max']:.1f}"
+
+        if compare_stats:
+            old_model = compare_models.get(model['id'])
+            if old_model:
+                old_time = old_model['total_time']
+                delta = model['total_time'] - old_time
+                change_pct = (delta / old_time * 100) if old_time > 0 else 0
+                # Status indicator
+                if change_pct > 5:
+                    status = "⚠️"
+                elif change_pct < -1:
+                    status = "✅"
+                else:
+                    status = ""
+                change_str = f"{change_pct:+.1f}% {status}"
+                md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {ram_range:<12}\n"
+            else:
+                md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {'N/A':<8} {'N/A':<8} {'NEW':<10} {ram_range:<12}\n"
+        else:
+            md += f"{model_short:<50} {model['size_gb']:>6.1f}GB {model['count']:<6} {model['total_time']:>8.1f}s  {ram_range:<20}\n"
+
+    md += "```\n\n"
+
+    # Model Categories: summarise each non-empty size class (an empty class is
+    # skipped rather than suppressing the whole section).
+    size_classes = [
+        ("LARGE MODELS (≥20 GB):   ", [m for m in sorted_models if m['size_gb'] >= 20]),
+        ("MEDIUM MODELS (10-20 GB):", [m for m in sorted_models if 10 <= m['size_gb'] < 20]),
+        ("SMALL MODELS (<10 GB):   ", [m for m in sorted_models if m['size_gb'] < 10]),
+    ]
+
+    summaries = []
+    for label, members in size_classes:
+        if not members:
+            continue
+        n = len(members)
+        summaries.append(
+            f"{label} {n} models\n"
+            f"  Avg size:               {sum(m['size_gb'] for m in members) / n:.1f} GB\n"
+            f"  Avg test time:          {sum(m['total_time']/m['count'] for m in members) / n:.1f}s\n"
+            f"  Avg min RAM:            {sum(m['ram_min'] for m in members) / n:.1f} GB\n"
+        )
+
+    # Skip the heading entirely when there is nothing to summarise.
+    if summaries:
+        md += "### Model Categories\n\n```\n" + "\n".join(summaries) + "```\n"
+
+    md += "\n---\n\n"
+
+    # Per-Test Statistics
+    md += "## Per-Test Statistics\n\n"
+    md += "Shows performance range across models for each test.\n\n"
+    # Sort tests by model count (descending) - most representative tests first
+    sorted_tests = sorted(stats['tests'].values(), key=lambda t: t['model_count'], reverse=True)
+
+    # Build comparison lookup for tests
+    compare_tests = {t['name']: t for t in compare_stats['tests'].values()} if compare_stats else {}
+
+    if compare_stats:
+        md += f"""```
+{'Test Name':<40} {'Models':<7} {'Fastest':<20} {'Slowest':<20} {'Med':<6} {'Old':<6} {'Δ Med':<8}
+{'='*40} {'='*7} {'='*20} {'='*20} {'='*6} {'='*6} {'='*8}
+"""
+    else:
+        md += f"""```
+{'Test Name':<50} {'Models':<7} {'Fastest':<25} {'Slowest':<25} {'Med Time'}
+{'='*50} {'='*7} {'='*25} {'='*25} {'='*8}
+"""
+
+    for test in sorted_tests:
+        # Shorten test name if needed
+        max_test_len = 38 if compare_stats else 48
+        test_short = test['name']
+        if len(test_short) > max_test_len:
+            test_short = test_short[:max_test_len-3] + "..."
+
+        # Format fastest/slowest
+        fastest = test['fastest']
+        slowest = test['slowest']
+
+        if fastest and slowest:
+            max_model_len = 18 if compare_stats else 23
+            fastest_str = f"{fastest['model_short']} ({fastest['duration']:.1f}s)"
+            slowest_str = f"{slowest['model_short']} ({slowest['duration']:.1f}s)"
+            if len(fastest_str) > max_model_len:
+                fastest_str = fastest_str[:max_model_len-3] + "..."
+            if len(slowest_str) > max_model_len:
+                slowest_str = slowest_str[:max_model_len-3] + "..."
+
+            med_time = test['median_time']
+
+            if compare_stats:
+                old_test = compare_tests.get(test['name'])
+                if old_test:
+                    old_med = old_test['median_time']
+                    delta_pct = ((med_time - old_med) / old_med * 100) if old_med > 0 else 0
+                    delta_str = f"{delta_pct:+.1f}%"
+                    md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} {slowest_str:<20} {med_time:<5.1f}s {old_med:<5.1f}s {delta_str:<8}\n"
+                else:
+                    md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} {slowest_str:<20} {med_time:<5.1f}s {'N/A':<6} {'NEW':<8}\n"
+            else:
+                md += f"{test_short:<50} {test['model_count']:<7} {fastest_str:<25} {slowest_str:<25} {med_time:.1f}s\n"
+
+    md += "```\n\n"
+
+    md += "\n---\n\n"
+    md += "## Files\n\n"
+    md += f"- **Benchmark report:** `{input_file}`\n"
+    md += f"- **Schema:** `benchmarks/schemas/report-v{stats['schema_version']}.schema.json`\n"
+    return md
+
+
+def main():
+    """CLI entry point: load and validate the JSONL input, compute statistics, and write the Markdown report."""
+    parser = argparse.ArgumentParser(
+        description="Generate benchmark analysis report from JSONL data",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument(
+        'input',
+        nargs='?',
+        type=Path,
+        help='JSONL benchmark file (default: latest in benchmarks/reports/)'
+    )
+    parser.add_argument(
+        '--compare',
+        type=Path,
+        help='Compare with this JSONL file (adds Old/Δ/Change columns)'
+    )
+    parser.add_argument(
+        '--output',
+        type=Path,
+        help='Output markdown file (default: auto-generated in benchmarks/reports/)'
+    )
+
+    args = parser.parse_args()
+
+    # Determine input file
+    if args.input:
+        input_file = args.input
+    else:
+        input_file = find_latest_jsonl()
+        if not input_file:
+            print("❌ No JSONL files found in benchmarks/reports/")
+            sys.exit(1)
+        print(f"📊 Auto-detected: {input_file}")
+
+    if not input_file.exists():
+        print(f"❌ File not found: {input_file}")
+        sys.exit(1)
+
+    # Load and validate
+    print(f"📋 Loading: {input_file}")
+    schema = load_schema()
+    data = load_jsonl(input_file)
+
+    print(f"✓ Loaded {len(data)} entries")
+    # Validate against schema
+    if not validate_jsonl(data, schema, input_file):
+        sys.exit(1)
+
+    print("✓ Schema validation passed")
+
+    # Calculate statistics
+    stats = calculate_statistics(data)
+
+    # Load and calculate comparison statistics if requested
+    compare_stats = None
+    if args.compare:
+        if not args.compare.exists():
+            print(f"❌ Comparison file not found: {args.compare}")
+            sys.exit(1)
+        print(f"📊 Comparing with: {args.compare}")
+        compare_data = load_jsonl(args.compare)
+        if not validate_jsonl(compare_data, schema, args.compare):
+            sys.exit(1)
+        compare_stats = calculate_statistics(compare_data)
+        print(f"✓ Loaded {len(compare_data)} comparison entries")
+
+    # Generate report
+    markdown = generate_markdown(stats, input_file, args.compare, compare_stats)
+
+    # Determine output file
+    if args.output:
+        output_file = args.output
+    else:
+        # Auto-generate: BENCHMARK-v<template version>-<version>-<date>.md
+        version = extract_version_from_filename(input_file) or stats["mlx_knife_version"]
+        date = input_file.stem.split("-v")[0]  # Extract date portion
+        output_file = REPORTS_DIR / f"BENCHMARK-v{TEMPLATE_VERSION}-{version}-{date}.md"
+
+    # Write output as UTF-8: the report contains non-ASCII text (Δ, ≥, emoji)
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_file, 'w', encoding="utf-8") as f:
+        f.write(markdown)
+
+    print(f"✅ Generated: {output_file}")
+    print()
+    print("Summary:")
+    print(f"  Tests: {stats['passed']}/{stats['total_tests']} passed")
+    print(f"  Duration: {stats['total_duration']/60:.1f} min")
+    print(f"  Quality: {stats['quality']['clean_percent']:.1f}% clean")
+    print(f"  Models: {len(stats['models'])}")
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md b/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md
new file mode 100644
index 0000000..87fb85b
--- /dev/null
+++ b/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md
@@ -0,0 +1,125 @@
+# Benchmark Report v1.0: 2.0.4b3
+
+**Date:** 2025-12-20
+**Generated:** 2025-12-20T14:43:01.786689+00:00
+**Generator:** generate_benchmark_report.py v1.0
+**Hardware:** Mac14,13, 12 cores
+
+---
+
+## Input Files
+
+- **Primary:** `benchmarks/reports/2025-12-20-v2.0.4b3-2nd_0.2.0_schema.jsonl`
+- **Schema:** v0.2.0
+
+---
+
+## Executive Summary
+
+**Tests:** 162 total (141 passed, 21 skipped)
+**Duration:** 1169.3s (19.5 min)
+**Quality:** 100.0% clean (162/162)
+**Models:** 22 tested
+
+✅ **System Health:** All tests clean (0 MB swap, 0 zombies)
+
+---
+
+## Test Summary
+
+```
+Total tests:       162
+Passed:            141
+  With model:      84
+  Infrastructure:  57
+Skipped:           21
+Duration:          1169.3s (19.5 min)
+```
+
+---
+
+## System Health
+
+```
+Swap (MB):         min=0, max=0, avg=0.0
+RAM free (GB):     min=0.0, max=46.7, avg=19.0
+Zombies:           min=0, max=0
+
+Quality Flags:
+  Clean:           162/162 (100.0%)
+  Degraded (swap): 0
+  Degraded (zombies): 0
+```
+
+---
+
+## Per-Model Statistics
+
+```
+Model                                              Size     Tests  Time       RAM (GB)            
+================================================== ======== ====== ========== ====================
+Mistral-Small-3.2-24B-Instruct-2506-8bit             23.3GB 4         102.2s  21.4-25.9           
+Qwen3-Coder-30B-A3B-Instruct-6bit-DWQ-lr9e-8         24.9GB 4          97.5s  21.6-26.8           
+Mixtral-8x7B-Instruct-v0.1-4bit                      24.5GB 4          96.9s  2.7-26.4            
+DeepHermes-3-Mistral-24B-Preview-8bit                23.3GB 4          63.0s  0.0-24.6            
+OpenCodeInterpreter-DS-33B-hf-4bit-mlx               17.8GB 4          62.9s  17.9-33.0           
+Qwen3-32B-4bit                                       17.2GB 4          48.7s  17.1-20.3           
+Klear-46B-A2.5B-Instruct-3bit                        18.9GB 4          40.7s  18.9-19.9           
+MiMo-VL-7B-RL-bf16                                   15.5GB 4          38.9s  14.6-19.7           
+gpt-oss-20b-MXFP4-Q8                                 11.3GB 4          36.6s  14.2-36.4           
+Qwen3-30B-A3B-Instruct-2507-4bit                     16.0GB 4          34.2s  16.2-23.7           
+Qwen3-Coder-30B-A3B-Instruct-4bit                    16.0GB 4          33.1s  16.3-17.1           
+Mistral-Small-3.2-24B-Instruct-2506-4bit             12.4GB 4          32.9s  13.0-16.9           
+Mistral-Small-Instruct-2409-4bit                     11.7GB 4          27.6s  12.9-26.2           
+Qwen2.5-Coder-7B-Instruct-8bit                        7.5GB 4          19.9s  8.5-31.2            
+DeepSeek-R1-Distill-Llama-8B-4bit                     4.2GB 4          19.7s  20.2-37.6           
+pixtral-12b-8bit                                     12.6GB 2          15.5s  14.3-14.4           
+Mistral-7B-Instruct-v0.2-4bit                         4.0GB 4          14.1s  8.9-26.2            
+Gabliterated-Qwen3-0.6B-float32                       2.2GB 4          12.7s  16.1-37.3           
+Phi-3-mini-4k-instruct-4bit                           2.0GB 4          11.5s  14.6-46.7           
+Phi-3.5-mini-instruct-4bit                            2.0GB 4          10.2s  12.6-44.6           
+Qwen2.5-0.5B-Instruct-4bit                            0.3GB 4           9.2s  13.8-46.0           
+Llama-3.2-11B-Vision-Instruct-4bit                    5.6GB 2           8.9s  10.3-12.1           
+```
+
+### Model Categories
+
+```
+LARGE MODELS (≥20 GB):    4 models
+  Avg size:               24.0 GB
+  Avg test time:          22.5s
+  Avg min RAM:            11.5 GB
+
+MEDIUM MODELS (10-20 GB): 10 models
+  Avg size:               14.9 GB
+  Avg test time:          9.7s
+  Avg min RAM:            15.6 GB
+
+SMALL MODELS (<10 GB):    8 models
+  Avg size:               3.5 GB
+  Avg test time:          3.6s
+  Avg min RAM:            13.1 GB
+```
+
+---
+
+## Per-Test Statistics
+
+Shows performance range across models for each test.
+
+```
+Test Name                                          Models  Fastest                   Slowest                   Med Time
+================================================== ======= ========================= ========================= ========
+test_run_command                                   22      Qwen2.5 (1.2s)            DeepHermes (22.1s)        7.1s    
+test_run_json_output                               22      Qwen2.5 (1.2s)            Mistral (13.3s)           7.1s    
+test_chat_completions_batch                        20      Phi (3.3s)                Mixtral (30.9s)           8.7s    
+test_chat_completions_streaming                    20      Qwen2.5 (3.4s)            Qwen3 (51.3s)             10.6s   
+```
+
+
+---
+
+## Files
+
+- **Benchmark report:** `benchmarks/reports/2025-12-20-v2.0.4b3-2nd_0.2.0_schema.jsonl`
+- **Schema:** `benchmarks/schemas/report-v0.2.0.schema.json`
diff --git a/benchmarks/tools/memmon.py b/benchmarks/tools/memmon.py
new file mode 100644
index 0000000..bf57c3e
--- /dev/null
+++ b/benchmarks/tools/memmon.py
@@ -0,0 +1,371 @@
+#!/usr/bin/env python3
+"""Memory Monitor - Standalone tool for tracking memory during subprocess execution.
+
+Samples RAM, swap, and memory pressure while running any command.
+Outputs JSONL with per-sample data and final summary.
+
+Usage:
+    # Basic usage
+    python benchmarks/tools/memmon.py -- pytest -m live_e2e tests_2.0/live/
+
+    # With options
+    python benchmarks/tools/memmon.py --interval 200 --output memory.jsonl -- pytest -v
+
+    # Just monitor (no subprocess)
+    python benchmarks/tools/memmon.py --duration 60 --output memory.jsonl
+
+Future: Will be part of mlxk-benchmark kit.
+"""
+
+import argparse
+import json
+import subprocess
+import sys
+import threading
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+
+def get_memory_sample() -> dict:
+    """Return one snapshot of RAM, swap and memory pressure.
+
+    Prefers psutil; without it, defers to get_memory_sample_native().
+    """
+    try:
+        import psutil
+    except ImportError:
+        return get_memory_sample_native()
+
+    # sysctl levels: 1=NORMAL (green), 2=WARN (yellow), 4=CRITICAL (red)
+    pressure_level = 1  # stays NORMAL if sysctl fails or prints garbage
+    try:
+        probe = subprocess.run(
+            ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"],
+            capture_output=True, text=True, timeout=1
+        )
+        pressure_level = int(probe.stdout.strip())
+    except Exception:
+        pass
+
+    ram = psutil.virtual_memory()
+    swp = psutil.swap_memory()
+    return {
+        "ram_free_gb": round(ram.available / 1e9, 2),
+        "ram_used_gb": round(ram.used / 1e9, 2),
+        "ram_percent": ram.percent,
+        "swap_used_mb": round(swp.used / 1e6, 1),
+        "swap_percent": swp.percent,
+        "memory_pressure": pressure_level,
+    }
+
+
+def get_memory_sample_native() -> dict:
+    """Get memory state using native macOS commands (no psutil).
+
+    Every probe (sysctl, vm_stat) is best-effort: on a missing command,
+    timeout or unparseable output the field keeps its default (pressure 1,
+    swap 0, RAM 0). ram_used_gb/ram_percent/swap_percent are always 0.
+    """
+
+    # Get memory pressure (1=NORMAL/green, 2=WARN/yellow, 4=CRITICAL/red)
+    memory_pressure = 1  # Default to NORMAL
+    try:
+        result = subprocess.run(
+            ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"],
+            capture_output=True, text=True, timeout=1
+        )
+        memory_pressure = int(result.stdout.strip())
+    except Exception:
+        pass
+
+    # Get swap usage
+    swap_mb = 0
+    try:
+        result = subprocess.run(
+            ["sysctl", "-n", "vm.swapusage"],
+            capture_output=True, text=True, timeout=1
+        )
+        # Parse: "total = 0.00M  used = 0.00M  free = 0.00M  (encrypted)"
+        # With the "M" suffix stripped, the value sits two tokens after "used"
+        parts = result.stdout.replace("M", "").split()
+        for i, p in enumerate(parts):
+            if p == "used" and i + 2 < len(parts):
+                swap_mb = float(parts[i + 2])
+                break
+    except Exception:
+        pass
+
+    # Get RAM via vm_stat
+    ram_free_gb = 0
+    try:
+        result = subprocess.run(
+            ["vm_stat"],
+            capture_output=True, text=True, timeout=1
+        )
+        # Parse page size and available pages
+        page_size = 16384  # Default for Apple Silicon
+        pages_free = 0
+        pages_inactive = 0
+        pages_purgeable = 0
+        pages_speculative = 0
+
+        for line in result.stdout.splitlines():
+            if "page size of" in line:
+                page_size = int(line.split()[-2])
+            elif "Pages free:" in line:
+                pages_free = int(line.split()[-1].rstrip("."))
+            elif "Pages inactive:" in line:
+                pages_inactive = int(line.split()[-1].rstrip("."))
+            elif "Pages purgeable:" in line:
+                pages_purgeable = int(line.split()[-1].rstrip("."))
+            elif "Pages speculative:" in line:
+                pages_speculative = int(line.split()[-1].rstrip("."))
+
+        # Total available = free + inactive + purgeable + speculative
+        total_available_pages = pages_free + pages_inactive + pages_purgeable + pages_speculative
+        ram_free_gb = round((total_available_pages * page_size) / 1e9, 2)
+    except Exception:
+        pass
+
+    return {
+        "ram_free_gb": ram_free_gb,
+        "ram_used_gb": 0,  # Not available without psutil
+        "ram_percent": 0,
+        "swap_used_mb": swap_mb,
+        "swap_percent": 0,
+        "memory_pressure": memory_pressure,
+    }
+
+
+class MemoryMonitor:
+    """Collects memory samples on a daemon thread until stopped.
+
+    Usage:
+        monitor = MemoryMonitor(interval_ms=200)
+        monitor.start()
+        # ... do work ...
+        summary = monitor.stop()
+    """
+
+    def __init__(self, interval_ms: int = 200):
+        self.interval = interval_ms / 1000  # seconds between samples
+        self.samples: list[dict] = []
+        self.running = False
+        self.thread: Optional[threading.Thread] = None
+        self.start_time: float = 0
+
+    def start(self):
+        """Discard earlier samples and launch the sampler thread."""
+        self.running = True
+        self.samples = []
+        self.start_time = time.time()
+        self.thread = threading.Thread(target=self._sample_loop, daemon=True)
+        self.thread.start()
+
+    def stop(self) -> dict:
+        """Ask the sampler to finish (waits up to 1s) and summarize samples."""
+        self.running = False
+        if self.thread:
+            self.thread.join(timeout=1.0)
+
+        if not self.samples:
+            return {"error": "No samples collected"}
+
+        free_gb = [entry["ram_free_gb"] for entry in self.samples]
+        swap_mb = [entry["swap_used_mb"] for entry in self.samples]
+
+        return {
+            "duration_s": round(time.time() - self.start_time, 2),
+            "samples": len(self.samples),
+            "interval_ms": int(self.interval * 1000),
+            "ram_free_min_gb": min(free_gb),
+            "ram_free_max_gb": max(free_gb),
+            "ram_free_avg_gb": round(sum(free_gb) / len(free_gb), 2),
+            "swap_max_mb": max(swap_mb),
+            "swap_avg_mb": round(sum(swap_mb) / len(swap_mb), 1),
+        }
+
+    def get_samples(self) -> list[dict]:
+        """Return a shallow copy of the samples recorded so far."""
+        return list(self.samples)
+
+    def _sample_loop(self):
+        """Thread body: append one timestamped sample per interval."""
+        while self.running:
+            entry = get_memory_sample()
+            entry.update(ts=round(time.time(), 3),
+                         elapsed_s=round(time.time() - self.start_time, 2))
+            self.samples.append(entry)
+            time.sleep(self.interval)
+
+
+def run_with_monitoring(
+    command: list[str],
+    interval_ms: int = 200,
+    output_file: Optional[Path] = None,
+    verbose: bool = False
+) -> dict:
+    """Run a command while monitoring memory.
+
+    Args:
+        command: Command and arguments to run
+        interval_ms: Sampling interval in milliseconds
+        output_file: Optional JSONL output file
+        verbose: Accepted for CLI compatibility; currently has no effect
+
+    Returns:
+        Summary dict with memory statistics plus exit_code, command and
+        timestamp; holds "error" instead of statistics if nothing was sampled
+    """
+    monitor = MemoryMonitor(interval_ms=interval_ms)
+
+    print(f"Starting memory monitor (interval: {interval_ms}ms)")
+    print(f"Running: {' '.join(command)}")
+    print("-" * 60)
+
+    monitor.start()
+    # Ctrl-C is reported as 130, any other launch failure as 1
+    try:
+        exit_code = subprocess.run(command).returncode
+    except KeyboardInterrupt:
+        exit_code = 130
+        print("\nInterrupted")
+    except Exception as e:
+        exit_code = 1
+        print(f"\nError: {e}")
+
+    summary = monitor.stop()
+    summary["exit_code"] = exit_code
+    summary["command"] = " ".join(command)
+    summary["timestamp"] = datetime.now(timezone.utc).isoformat()
+
+    print("-" * 60)
+    print("Memory Monitor Summary:")
+    if "error" in summary:
+        print(f"  {summary['error']}")
+    else:
+        print(f"  Duration:     {summary['duration_s']:.1f}s ({summary['samples']} samples)")
+        print(f"  RAM free:     {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
+        print(f"  Swap peak:    {summary['swap_max_mb']:.1f} MB")
+    print(f"  Exit code:    {exit_code}")
+
+    if output_file:
+        with open(output_file, "w") as f:
+            # Samples first, one per line; the summary is the last line
+            for sample in monitor.get_samples():
+                f.write(json.dumps(sample) + "\n")
+            f.write(json.dumps({"summary": summary}) + "\n")
+        print(f"  Output:       {output_file}")
+
+    return summary
+
+
+def monitor_only(
+    duration_s: float,
+    interval_ms: int = 200,
+    output_file: Optional[Path] = None
+) -> dict:
+    """Monitor memory for a fixed duration (no subprocess).
+
+    Args:
+        duration_s: How long to monitor
+        interval_ms: Sampling interval in milliseconds
+        output_file: Optional JSONL output file
+
+    Returns:
+        Summary dict; holds "error" instead of statistics if nothing was sampled
+    """
+    monitor = MemoryMonitor(interval_ms=interval_ms)
+    print(f"Monitoring memory for {duration_s}s (interval: {interval_ms}ms)")
+    print("-" * 60)
+
+    monitor.start()
+    try:
+        time.sleep(duration_s)
+    except KeyboardInterrupt:
+        print("\nInterrupted")
+
+    summary = monitor.stop()
+    summary["timestamp"] = datetime.now(timezone.utc).isoformat()
+
+    print("-" * 60)
+    print("Memory Monitor Summary:")
+    if "error" in summary:
+        print(f"  {summary['error']}")
+    else:
+        print(f"  Duration:     {summary['duration_s']:.1f}s ({summary['samples']} samples)")
+        print(f"  RAM free:     {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
+        print(f"  Swap peak:    {summary['swap_max_mb']:.1f} MB")
+
+    if output_file:
+        with open(output_file, "w") as f:
+            f.writelines(json.dumps(s) + "\n" for s in monitor.get_samples())
+            f.write(json.dumps({"summary": summary}) + "\n")
+        print(f"  Output:       {output_file}")
+
+    return summary
+
+
+def main():
+    """CLI entry point: monitor for a fixed duration or around a command.
+
+    --duration takes precedence over a command. With a command, the process
+    exits with that command's exit code; with neither, help is printed and
+    the exit status is 1.
+    """
+    cli = argparse.ArgumentParser(
+        description="Monitor memory while running a command",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    cli.add_argument(
+        "--interval", "-i", type=int, default=200,
+        help="Sampling interval in milliseconds (default: 200)"
+    )
+    cli.add_argument(
+        "--output", "-o", type=Path,
+        help="Output JSONL file for samples and summary"
+    )
+    cli.add_argument(
+        "--duration", "-d", type=float,
+        help="Monitor for fixed duration (seconds), no subprocess"
+    )
+    cli.add_argument(
+        "--verbose", "-v", action="store_true",
+        help="Print samples as they're collected"
+    )
+    cli.add_argument("command", nargs="*", help="Command to run (after --)")
+
+    opts = cli.parse_args()
+
+    # Nothing to do: show usage and fail
+    if not opts.duration and not opts.command:
+        cli.print_help()
+        print("\nExamples:")
+        print("  python benchmarks/tools/memmon.py -- pytest -m live_e2e")
+        print("  python benchmarks/tools/memmon.py --duration 10 --output mem.jsonl")
+        sys.exit(1)
+
+    # Monitor-only mode
+    if opts.duration:
+        monitor_only(
+            duration_s=opts.duration,
+            interval_ms=opts.interval,
+            output_file=opts.output
+        )
+        return
+
+    # Run command with monitoring and propagate its exit code
+    summary = run_with_monitoring(
+        command=opts.command,
+        interval_ms=opts.interval,
+        output_file=opts.output,
+        verbose=opts.verbose
+    )
+    sys.exit(summary.get("exit_code", 0))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/tools/memplot.py b/benchmarks/tools/memplot.py
new file mode 100644
index 0000000..3da6883
--- /dev/null
+++ b/benchmarks/tools/memplot.py
@@ -0,0 +1,522 @@
+#!/usr/bin/env python3
+"""Memory Timeline Visualization - Generate interactive HTML charts from benchmark data.
+
+Correlates memory samples (memmon.py) with test results to show RAM/swap usage
+over time with model markers.
+
+Usage:
+    # Basic usage
+    python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl
+
+    # Custom output
+    python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl -o report.html
+
+    # PNG export (requires kaleido)
+    python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl --format png
+
+Requires: plotly (pip install plotly)
+Optional: kaleido (pip install kaleido) for PNG export
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+
+def parse_memory_samples(path: Path) -> tuple[list[dict], dict]:
+    """Read a memmon JSONL file and split samples from the summary record.
+
+    Blank lines are ignored. A record carrying a "summary" key supplies the
+    summary (the last such record wins); every other record is a sample.
+
+    Returns:
+        Tuple of (samples list, summary dict)
+    """
+    collected: list[dict] = []
+    final_summary: dict = {}
+    with open(path) as handle:
+        for raw in handle:
+            text = raw.strip()
+            if text:
+                record = json.loads(text)
+                if "summary" in record:
+                    final_summary = record["summary"]
+                else:
+                    collected.append(record)
+    return collected, final_summary
+
+
+def parse_benchmark_results(path: Path) -> tuple[list[dict], list[dict]]:
+    """Load passed tests from a benchmark JSONL file, grouped by model presence.
+
+    Records lacking "timestamp" or "duration", and records whose outcome is
+    not "passed", are dropped.
+
+    Returns:
+        Tuple of (tests with models, tests without models)
+    """
+    model_tests: list[dict] = []
+    infra_tests: list[dict] = []
+    with open(path) as handle:
+        for raw in handle:
+            text = raw.strip()
+            if not text:
+                continue
+            record = json.loads(text)
+            has_timing = "timestamp" in record and "duration" in record
+            if not has_timing or record.get("outcome") != "passed":
+                continue
+            bucket = model_tests if "model" in record else infra_tests
+            bucket.append(record)
+
+    return model_tests, infra_tests
+
+
+def parse_iso_timestamp(ts_str: str) -> float:
+    """Return the POSIX timestamp (seconds) for an ISO-8601 string."""
+    # A trailing "Z" is rewritten to an explicit UTC offset before parsing.
+    normalized = ts_str
+    if normalized.endswith("Z"):
+        normalized = f"{normalized[:-1]}+00:00"
+    return datetime.fromisoformat(normalized).timestamp()
+
+
+def correlate_tests_with_timeline(
+    samples: list[dict],
+    tests: list[dict],
+    memory_start_ts: float
+) -> list[dict]:
+    """Place each test on the memory timeline as an elapsed-seconds interval.
+
+    A test's "timestamp" is taken as its end; the start is derived by
+    subtracting "duration". Both are expressed relative to memory_start_ts.
+    Tests missing either field are skipped.
+
+    Returns:
+        List of dicts with model_id, model_short, start_elapsed,
+        end_elapsed, duration and test
+    """
+    markers: list[dict] = []
+    if not samples or not tests:
+        return markers
+
+    for record in tests:
+        if not ("timestamp" in record and "duration" in record):
+            continue
+        length = record["duration"]
+        finished_at = parse_iso_timestamp(record["timestamp"])
+        started_at = finished_at - length
+
+        # Records without a "model" entry yield None for both names
+        full_name = record.get("model", {}).get("id", None)
+        short_name = full_name.split("/")[-1][:20] if full_name else None
+
+        markers.append({
+            "model_id": full_name,
+            "model_short": short_name,
+            "start_elapsed": started_at - memory_start_ts,
+            "end_elapsed": finished_at - memory_start_ts,
+            "duration": length,
+            "test": record.get("test", ""),
+        })
+
+    return markers
+
+
+def get_ram_color(ram_free_gb: float) -> str:
+    """Pick the marker color for a free-RAM reading given in GB."""
+    # Checked from the highest threshold down; below 16 GB is critical.
+    if ram_free_gb >= 32:
+        return "rgb(52, 199, 89)"   # Green - healthy
+    if ram_free_gb >= 16:
+        return "rgb(255, 149, 0)"   # Orange - warning
+    return "rgb(255, 59, 48)"       # Red - critical
+
+
+def create_timeline_chart(
+    samples: list[dict],
+    summary: dict,
+    model_markers: list[dict],
+    infra_markers: list[dict],
+    title: str = "Memory Timeline"
+) -> "Figure":
+    """Create interactive plotly timeline chart.
+
+    Plots free RAM (and swap, if any was used) against elapsed minutes, with
+    shaded test regions and memory-pressure bands. Exits if plotly is missing.
+    """
+    try:
+        import plotly.graph_objects as go
+        from plotly.subplots import make_subplots
+    except ImportError:
+        print("Error: plotly not installed. Run: pip install plotly")
+        sys.exit(1)
+
+    # Extract data series
+    elapsed = [s["elapsed_s"] for s in samples]
+    ram_free = [s["ram_free_gb"] for s in samples]
+    swap_used = [s["swap_used_mb"] for s in samples]
+    memory_pressure = [s.get("memory_pressure", 1) for s in samples]  # Default: 1=NORMAL
+
+    # Convert elapsed to minutes for readability
+    elapsed_min = [e / 60 for e in elapsed]
+
+    # Create figure with secondary y-axis for swap
+    fig = make_subplots(specs=[[{"secondary_y": True}]])
+
+    # RAM trace - each point's marker is colored by its RAM level
+    colors = [get_ram_color(ram) for ram in ram_free]
+
+    fig.add_trace(
+        go.Scatter(
+            x=elapsed_min,
+            y=ram_free,
+            mode="lines+markers",
+            name="RAM Free (GB)",
+            line=dict(color="rgb(52, 150, 235)", width=1.5),  # Blue line
+            marker=dict(
+                color=colors,
+                size=3,
+                line=dict(width=0),
+            ),
+            hovertemplate="Time: %{x:.1f} min<br>RAM Free: %{y:.1f} GB<extra></extra>",
+        ),
+        secondary_y=False,
+    )
+
+    # Threshold lines (assuming 64 GB total RAM)
+    max_elapsed_min = max(elapsed_min) if elapsed_min else 20
+    total_ram = 64  # GB - could be made configurable later
+
+    fig.add_trace(
+        go.Scatter(
+            x=[0, max_elapsed_min],
+            y=[32, 32],
+            mode="lines",
+            name=f"32 GB (50% of {total_ram} GB - healthy)",
+            line=dict(color="green", width=1, dash="dash"),
+            hoverinfo="skip",
+        ),
+        secondary_y=False,
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[0, max_elapsed_min],
+            y=[16, 16],
+            mode="lines",
+            name=f"16 GB (25% of {total_ram} GB - warning)",
+            line=dict(color="orange", width=1, dash="dash"),
+            hoverinfo="skip",
+        ),
+        secondary_y=False,
+    )
+
+    # Swap trace (secondary y-axis)
+    if any(s > 0 for s in swap_used):
+        fig.add_trace(
+            go.Scatter(
+                x=elapsed_min,
+                y=swap_used,
+                mode="lines",
+                name="Swap Used (MB)",
+                line=dict(color="red", width=2),
+                hovertemplate="Time: %{x:.1f} min<br>Swap: %{y:.0f} MB<extra></extra>",
+            ),
+            secondary_y=True,
+        )
+
+    # Model test regions (gray background), visited in start-time order
+    model_markers_sorted = sorted(model_markers, key=lambda m: m["start_elapsed"])
+
+    test_shapes = []
+    prev_model_id = None  # Track previous model for switch detection
+
+    for marker in model_markers_sorted:
+        start_min = marker["start_elapsed"] / 60
+        end_min = marker["end_elapsed"] / 60
+
+        if start_min < 0 or start_min > max_elapsed_min:
+            continue
+
+        # Add gray rectangle for this individual test
+        test_shapes.append(dict(
+            type="rect",
+            xref="x", yref="y",
+            x0=start_min,
+            x1=end_min,
+            y0=0, y1=70,
+            fillcolor="rgba(200, 200, 200, 0.3)",  # Gray for model tests
+            layer="below",
+            line=dict(width=0),
+        ))
+
+        # Add model label when model CHANGES (not just first occurrence)
+        model_id = marker["model_id"]
+        if model_id != prev_model_id:
+            fig.add_annotation(
+                x=start_min,
+                y=1.0,
+                xref="x", yref="paper",
+                text=marker["model_short"],
+                textangle=-90,
+                font=dict(size=9, color="rgba(0, 0, 0, 0.7)"),
+                showarrow=False,
+                xanchor="left",
+                yanchor="top",
+                xshift=2,
+            )
+            prev_model_id = model_id
+
+    # Infrastructure test regions (light blue background)
+    infra_markers_sorted = sorted(infra_markers, key=lambda m: m["start_elapsed"])
+
+    for marker in infra_markers_sorted:
+        start_min = marker["start_elapsed"] / 60
+        end_min = marker["end_elapsed"] / 60
+
+        if start_min < 0 or start_min > max_elapsed_min:
+            continue
+
+        # Add very light blue rectangle for infrastructure tests
+        test_shapes.append(dict(
+            type="rect",
+            xref="x", yref="y",
+            x0=start_min,
+            x1=end_min,
+            y0=0, y1=70,
+            fillcolor="rgba(173, 216, 230, 0.2)",  # Very light blue for infra tests
+            layer="below",
+            line=dict(width=0),
+        ))
+
+    region_shapes = test_shapes
+
+    # Add test markers (small vertical lines) and labels at bottom for both marker types
+    all_markers = model_markers_sorted + infra_markers_sorted
+    all_markers_sorted = sorted(all_markers, key=lambda m: m["start_elapsed"])
+
+    for marker in all_markers_sorted:
+        start_min = marker["start_elapsed"] / 60
+
+        if start_min < 0 or start_min > max_elapsed_min:
+            continue
+
+        # Extract test name (shorten if needed)
+        test_name = marker["test"].split("::")[-1].split("[")[0]
+        if len(test_name) > 25:
+            test_name = test_name[:22] + "..."
+
+        fig.add_vline(
+            x=start_min,
+            line=dict(color="rgba(128, 128, 128, 0.2)", width=0.5),
+        )
+
+        # Add test label at bottom (aligned with start time like model labels)
+        fig.add_annotation(
+            x=start_min,
+            y=0.0,
+            xref="x", yref="paper",
+            text=test_name,
+            textangle=-90,
+            font=dict(size=9, color="rgba(0, 0, 0, 0.6)"),  # Same size as model labels
+            showarrow=False,
+            xanchor="left",  # Same as model labels (aligned at start)
+            yanchor="bottom",
+            xshift=2,  # Same offset as model labels
+        )
+
+    # Add memory pressure backgrounds (1=normal/white, 2=warn/yellow, 4=critical/red)
+    pressure_shapes = []
+    i = 0
+    while i < len(memory_pressure):
+        pressure = memory_pressure[i]
+
+        if pressure > 1:  # 2=WARN or 4=CRITICAL
+            # End at the next sample so a one-sample spike still has width
+            start_min = elapsed_min[i]
+            j = i
+            while j < len(memory_pressure) and memory_pressure[j] == pressure:
+                j += 1
+            end_min = elapsed_min[j] if j < len(elapsed_min) else elapsed_min[j - 1]
+
+            if pressure == 2:
+                color = "rgba(255, 204, 0, 0.15)"  # Yellow (WARN)
+            else:  # pressure == 4
+                color = "rgba(255, 59, 48, 0.15)"  # Red (CRITICAL)
+
+            pressure_shapes.append(dict(
+                type="rect",
+                xref="x", yref="y",  # Changed from "paper" to "y" for rangeslider compatibility
+                x0=start_min, x1=end_min,
+                y0=0, y1=70,  # Use actual y-axis values
+                fillcolor=color,
+                layer="below",
+                line=dict(width=0),
+            ))
+            i = j
+        else:
+            i += 1
+
+    # Combine all shapes (regions first, then pressure on top)
+    shapes = region_shapes + pressure_shapes
+
+    # Debug output
+    print(f"  Test shapes (gray): {len(region_shapes)}")
+    print(f"  Pressure shapes (yellow/red): {len(pressure_shapes)}")
+    print(f"  Total shapes: {len(shapes)}")
+    if region_shapes:
+        print(f"  Sample test shape: {region_shapes[0]}")
+
+    # Layout (without shapes - we'll add them individually)
+    fig.update_layout(
+        title=dict(
+            text=title,
+            font=dict(size=16),
+        ),
+        xaxis=dict(
+            title="Time (minutes)",
+            showgrid=True,
+            gridcolor="rgba(128,128,128,0.2)",
+            rangeslider=dict(visible=True, yaxis=dict(rangemode="match")),
+        ),
+        yaxis=dict(
+            title="RAM Free (GB)",
+            showgrid=True,
+            gridcolor="rgba(128,128,128,0.2)",
+            range=[0, 70],  # Typical max for 64GB system
+        ),
+        yaxis2=dict(
+            title="Swap Used (MB)",
+            showgrid=False,
+            range=[0, max(swap_used) * 1.2] if any(s > 0 for s in swap_used) else [0, 100],
+        ),
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=1.02,
+            xanchor="right",
+            x=1,
+        ),
+        hovermode="x unified",
+        template="plotly_white",
+        plot_bgcolor="rgba(0,0,0,0)",  # Transparent plot background so shapes show through
+        height=500,
+        margin=dict(t=80, b=60, l=60, r=60),
+    )
+
+    # Add shapes one by one via fig.add_shape() rather than a layout shapes array
+    for shape in shapes:
+        fig.add_shape(**shape)
+
+    # Debug: Check shapes after adding individually
+    print(f"  Shapes in fig.layout after add_shape: {len(fig.layout.shapes)}")
+
+    # Add summary annotation
+    if summary:
+        summary_text = (
+            f"Duration: {summary.get('duration_s', 0)/60:.1f} min | "
+            f"Samples: {summary.get('samples', 0)} | "
+            f"RAM: {summary.get('ram_free_min_gb', 0):.1f}-{summary.get('ram_free_max_gb', 0):.1f} GB | "
+            f"Swap peak: {summary.get('swap_max_mb', 0):.0f} MB"
+        )
+        fig.add_annotation(
+            text=summary_text,
+            xref="paper", yref="paper",
+            x=0, y=-0.12,
+            showarrow=False,
+            font=dict(size=10, color="gray"),
+            align="left",
+        )
+
+    return fig
+
+
+def main():
+    """CLI entry point: read memmon (and optional benchmark) JSONL, write the chart."""
+    parser = argparse.ArgumentParser(
+        description="Generate memory timeline visualization from benchmark data",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "memory_file",
+        type=Path,
+        help="Memory samples JSONL from memmon.py",
+    )
+    parser.add_argument(
+        "benchmark_file",
+        type=Path,
+        nargs="?",
+        help="Benchmark results JSONL (optional, for model markers)",
+    )
+    parser.add_argument(
+        "-o", "--output",
+        type=Path,
+        help="Output file (default: memory_timeline.html)",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["html", "png", "svg"],
+        default="html",
+        help="Output format (default: html)",
+    )
+    parser.add_argument(
+        "--title",
+        default="Memory Timeline",
+        help="Chart title",
+    )
+
+    args = parser.parse_args()
+
+    # Default output filename
+    if not args.output:
+        args.output = Path(f"memory_timeline.{args.format}")
+
+    print(f"Reading memory samples: {args.memory_file}")
+    samples, summary = parse_memory_samples(args.memory_file)
+    print(f"  Found {len(samples)} samples")
+
+    model_markers = []
+    infra_markers = []
+    if args.benchmark_file:
+        print(f"Reading benchmark results: {args.benchmark_file}")
+        tests_with_model, tests_without_model = parse_benchmark_results(args.benchmark_file)
+        print(f"  Found {len(tests_with_model)} test entries with models")
+        print(f"  Found {len(tests_without_model)} infrastructure test entries")
+
+        # Chart x-axis is elapsed_s, so anchor markers at ts - elapsed_s of the first sample
+        if samples:
+            memory_start_ts = samples[0]["ts"] - samples[0].get("elapsed_s", 0)
+            model_markers = correlate_tests_with_timeline(samples, tests_with_model, memory_start_ts)
+            infra_markers = correlate_tests_with_timeline(samples, tests_without_model, memory_start_ts)
+            print(f"  Correlated {len(model_markers)} model test markers")
+            print(f"  Correlated {len(infra_markers)} infrastructure test markers")
+
+    # Create chart
+    print(f"Generating {args.format.upper()} chart...")
+    fig = create_timeline_chart(samples, summary, model_markers, infra_markers, title=args.title)
+
+    # Export
+    if args.format == "html":
+        fig.write_html(
+            args.output,
+            include_plotlyjs="cdn",
+            full_html=True,
+        )
+    else:
+        try:
+            fig.write_image(args.output, scale=2)
+        except Exception as e:
+            print(f"Error: PNG/SVG export requires kaleido: pip install kaleido")
+            print(f"Details: {e}")
+            sys.exit(1)
+
+    print(f"Output: {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/validate_reports.py b/benchmarks/validate_reports.py
index 8f13c75..e91e4b0 100644
--- a/benchmarks/validate_reports.py
+++ b/benchmarks/validate_reports.py
@@ -77,14 +77,16 @@ def main():
         print("Usage: python benchmarks/validate_reports.py <jsonl_file> [<jsonl_file> ...]")
         sys.exit(1)
 
-    # Load schema
-    schema_path = Path("benchmarks/schemas/report-v0.1.schema.json")
+    # Load schema (always use current version)
+    schema_path = Path("benchmarks/schemas/report-current.schema.json")
     if not schema_path.exists():
         print(f"Error: Schema not found at {schema_path}")
         sys.exit(1)
 
     schema = load_schema(schema_path)
-    print(f"📋 Loaded schema: {schema_path}")
+    # Resolve symlink for display
+    resolved = schema_path.resolve()
+    print(f"📋 Loaded schema: {schema_path} → {resolved.name}")
     print()
 
     # Validate each file
diff --git a/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md b/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md
index bf4c4c1..601ae26 100644
--- a/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md
+++ b/docs/ADR/ADR-016-Memory-Aware-Model-Loading.md
@@ -89,27 +89,11 @@ This is a **hardware fact** (from `sysctl -n hw.memsize`), not a heuristic.
 - Vision >70%: HTTP 507 Insufficient Storage + JSON error response
 - Text >70%: `logger.warning("Model size XX.X GB exceeds 70% of YY.Y GB system memory. Expect extreme slowness due to swapping.")` → visible via `--log-level warning` (default) and `--log-json` if enabled
 
-## TODO
+## Status
 
-### Phase 1 (2.0.4-beta.1) ✅ COMPLETE
-- [x] Add `system.memory_total_bytes` to JSON-API
-- [x] Schema bump to 0.1.6
-- [x] Document in json-api-specification.md
+**Phase 1+2:** ✅ Complete (2.0.4-beta.1) - See CHANGELOG.md
 
-### Phase 2 (2.0.4-beta.1) ✅ COMPLETE
-- [x] Implement pre-load memory check in `run.py`
-  - `_get_system_memory_bytes()` via `sysctl -n hw.memsize`
-  - `check_memory_before_load()` for CLI path
-  - `check_memory_for_server()` for server path
-- [x] Vision: ERROR + abort if size > 70% total (empirically confirmed: crash at 73%)
-  - CLI: stderr error + exit 1
-  - Server: HTTP 507 + JSON error (via `ErrorType.INSUFFICIENT_MEMORY`)
-- [x] Text: Internal log only if size > 70% total (empirically confirmed: no crash at 95-97%)
-  - CLI: No user-facing action (backwards compatible)
-  - Server: `logger.warning()` only (uses existing `--log-level`/`--log-json` infrastructure)
-- [x] Unit tests: 18 tests in `tests_2.0/test_memory_checks.py`
-
-### Phase 3 (Future)
+**Phase 3 (Future):** Issue #46
 - [ ] Configurable threshold (env var or CLI flag)
 - [ ] Vision overhead estimation based on model architecture
 - [ ] KV-Cache size estimation based on context length
diff --git a/docs/ADR/README.md b/docs/ADR/README.md
index 4c5497b..30ef4a5 100644
--- a/docs/ADR/README.md
+++ b/docs/ADR/README.md
@@ -15,16 +15,16 @@ This directory contains Architecture Decision Records (ADRs) that document signi
 | [ADR-005](ADR-005-Clone-Implementation-Beta3.md) | Clone Implementation Beta3 | Superseded by ADR-007 | 2025-09-18 |
 | [ADR-006](ADR-006-Clone-Implementation-Revised.md) | Clone Implementation Revised | Superseded by ADR-007 | 2025-09-18 |
 | [ADR-007](ADR-007-Clone-Implementation-Fixed.md) | Clone Implementation Fixed Strategy | Accepted | 2025-09-18 |
-| [ADR-008](ADR-008-MLXModel-Package-Format.md) | MLXModel Package Format | Proposed | 2025-10-17 |
+| ADR-008 | MLXModel Package Format | Proposed | (not committed) |
 | [ADR-009](ADR-009-Stop-Token-Detection-Fix.md) | Stop Token Detection Fix | Implemented | 2025-10-21 |
-| [ADR-010](ADR-010-Reasoning-Content-API.md) | Reasoning Content API | Draft | 2025-10-21 |
+| ADR-010 | Reasoning Content API | Draft | (not committed) |
 | [ADR-011](ADR-011-E2E-Live-Test-Architecture.md) | E2E Live Test Architecture | Implemented | 2025-10-21 |
 | [ADR-012](ADR-012-Vision-Support-Roadmap.md) | Vision Support Roadmap | Implemented (Phase 1-3) | 2025-11-12 |
-| [ADR-013](ADR-013-Community-Model-Quality-Database.md) | Community Model Quality Database | Planned | 2025-11-13 |
+| ADR-013 | Community Model Quality Database | Planned | (not committed) |
 | [ADR-014](ADR-014-Unix-Pipe-Integration.md) | Unix Pipe Integration | Implemented (Phase 1) | 2025-11-16 |
-| [ADR-016](ADR-016-Memory-Aware-Model-Loading.md) | Memory-Aware Model Loading | Implemented | 2025-11-20 |
-| ADR-015 | Embeddings API | Planned | (future) |
-| ADR-017 | Image Metadata & RAG | Proposed | (future) |
+| ADR-015 | Embeddings API | Planned | (not committed) |
+| [ADR-016](ADR-016-Memory-Aware-Model-Loading.md) | Memory-Aware Model Loading | Implemented (Phase 1-2) | 2025-12-05 |
+| ADR-017 | Image Metadata Extraction (EXIF) | Implemented | (not committed) |
 
 ## ADR Format
 
diff --git a/docs/SERVER-HANDBOOK.md b/docs/SERVER-HANDBOOK.md
index 78986f8..54c1559 100644
--- a/docs/SERVER-HANDBOOK.md
+++ b/docs/SERVER-HANDBOOK.md
@@ -26,7 +26,7 @@ mlxk serve --host 0.0.0.0 --port 8000
 - Python 3.9+ (Text models)
 - Python 3.10+ (Vision models)
 - mlx-lm 0.28.4+
-- mlx-vlm 0.3.9+ (optional, for vision)
+- mlx-vlm 0.3.9+ (optional, for vision; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
 
 ---
 
@@ -380,6 +380,9 @@ python -m mlxk2.core.server_base
 pyenv install 3.10
 pyenv local 3.10
 pip install mlx-lm mlx-vlm
+
+# Beta.3 (pre-0.3.10 fix)
+pip install mlx-lm "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37"
 ```
 
 ### Memory Constraint Errors (HTTP 507)
diff --git a/docs/json-api-specification.md b/docs/json-api-specification.md
index f69ed01..8bc0592 100644
--- a/docs/json-api-specification.md
+++ b/docs/json-api-specification.md
@@ -1,8 +1,8 @@
 # MLX-Knife 2.0 JSON API Specification
 
 **Specification Version:** 0.1.6
-**Status:** Alpha - Subject to change
-**Target:** MLX-Knife 2.0.4-beta.1
+**Status:** Stable (backward-compatible)
+**Released:** MLX-Knife 2.0.4-beta.1
 
 > Based on [GitHub Issue #8](https://github.com/mzau/mlx-knife/issues/8) - Comprehensive JSON output support for all commands
 
@@ -551,29 +551,31 @@ mlxk-json show "Phi-3-mini" --config --json      # Include config.json content
 }
 ```
 
-## Changes in 0.1.6 (Alpha)
+## Changes in 0.1.6 (Stable, 2.0.4-beta.1)
 
-**ADR-016 Preparation: System Memory Information**
+**System Memory Information**
 
 - Added `system` object to `version` command response
 - `system.memory_total_bytes`: Total physical RAM in bytes (from `sysctl hw.memsize`)
 - `system` is `null` on non-macOS platforms where sysctl is unavailable
-- Enables ADR-016 Memory-Aware Model Loading (pre-load memory checks)
+- Enables memory-aware model loading (ADR-016)
 
-**ADR-012: Vision Support - Model Discovery**
+**Model Discovery: Vision capability flag**
 
 - Vision models detected via `preprocessor_config.json` presence
-- `vision` capability added to model discovery (backward-compatible enum extension)
+- `vision` added to `capabilities` enum (backward-compatible extension)
 - Visible in `mlxk list --json`, `mlxk show --json`, `mlxk health --json`
 - Example: `"capabilities": ["text-generation", "chat", "vision"]`
 
-**Note on `mlxk run --image` (CLI):**
-- `mlxk run --image` command exists for vision models (ADR-012 Phase 1b)
-- Current output: Text mode only (Markdown table with filename mapping)
-- JSON output: Deferred to ADR-017 Phase 2 (requires formal schema extension)
-- Server OpenAI Vision API documented in `docs/SERVER-HANDBOOK.md`
+**Note:** Vision runtime support (`mlxk run --image`, Server API) is documented in README.md "Multi-Modal Support" and `docs/SERVER-HANDBOOK.md`.
 
-## Changes in 0.1.5 (Alpha)
+## Changes in 0.1.5 (Stable, 2.0.0)
+
+**Foundation: Model Object Schema**
+
+- Standardized `modelObject` across all commands
+- Machine-readable fields: `size_bytes`, `last_modified` (ISO-8601 UTC with `Z`)
+- No human-readable `size` or `modified` fields (JSON consumers parse structured data)
 
 **Issue #36: Separate Integrity and Runtime Compatibility Checks**
 
@@ -587,12 +589,6 @@ mlxk-json show "Phi-3-mini" --config --json      # Include config.json content
 - Gate logic: Runtime check requires passing integrity check first
 - `reason` field describes first problem found (integrity > runtime priority)
 
-## Changes in 0.1.2 (Alpha)
-
-- Introduced a common minimal Model Object for consistency across commands.
-- Replaced human-readable `size` with machine-friendly `size_bytes`.
-- Removed human-readable `modified`; `last_modified` (ISO-8601 UTC) is authoritative.
-
 ## Operations
 
 ### `mlxk-json pull <model> --json`
diff --git a/mlxk2/__init__.py b/mlxk2/__init__.py
index d68e38b..8b9039b 100644
--- a/mlxk2/__init__.py
+++ b/mlxk2/__init__.py
@@ -7,4 +7,4 @@ import warnings
 # Issue parity with 1.1.0 (Issue #22)
 warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+')
 
-__version__ = "2.0.4b2"
+__version__ = "2.0.4b3"
diff --git a/mlxk2/core/capabilities.py b/mlxk2/core/capabilities.py
index 09427a2..e8516fd 100644
--- a/mlxk2/core/capabilities.py
+++ b/mlxk2/core/capabilities.py
@@ -167,7 +167,11 @@ def _has_any(path: Path, patterns: Tuple[str, ...]) -> bool:
 
 
 def _detect_vision_from_config(config: Optional[Dict[str, Any]]) -> bool:
-    """Detect vision capability from config.json content."""
+    """Detect vision capability from config.json content.
+
+    Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision.
+    mlx-vlm only supports image vision models (AutoImageProcessor).
+    """
     if not isinstance(config, dict):
         return False
 
@@ -181,15 +185,29 @@ def _detect_vision_from_config(config: Optional[Dict[str, Any]]) -> bool:
         return True
 
     # Check for embedded preprocessor_config
-    if isinstance(config.get("preprocessor_config"), dict):
+    preprocessor_cfg = config.get("preprocessor_config")
+    if isinstance(preprocessor_cfg, dict):
+        # Exclude video processors (requires PyTorch/Torchvision)
+        if preprocessor_cfg.get("processor_class") == "AutoVideoProcessor":
+            return False
+        if "temporal_patch_size" in preprocessor_cfg:
+            return False
         return True
 
     return False
 
 
 def _detect_vision_from_files(model_path: Path) -> bool:
-    """Detect vision capability from file presence."""
-    return _has_any(
+    """Detect vision capability from file presence.
+
+    Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision.
+    mlx-vlm only supports image vision models (AutoImageProcessor).
+    """
+    # Check if it's a video model (requires PyTorch/Torchvision)
+    if (model_path / "video_preprocessor_config.json").exists():
+        return False
+
+    if _has_any(
         model_path,
         (
             "preprocessor_config.json",
@@ -199,7 +217,25 @@ def _detect_vision_from_files(model_path: Path) -> bool:
             "**/processor_config.json",
             "**/image_processor_config.json",
         ),
-    )
+    ):
+        # Found vision-related files, but check if it's a video processor
+        preprocessor_path = model_path / "preprocessor_config.json"
+        if preprocessor_path.exists():
+            try:
+                import json
+                with open(preprocessor_path) as f:
+                    preprocessor_data = json.load(f)
+                if isinstance(preprocessor_data, dict):
+                    # Video model indicators
+                    if preprocessor_data.get("processor_class") == "AutoVideoProcessor":
+                        return False
+                    if "temporal_patch_size" in preprocessor_data:
+                        return False
+            except Exception:
+                pass
+        return True
+
+    return False
 
 
 def _check_mlx_vlm_available() -> bool:
diff --git a/mlxk2/core/server_base.py b/mlxk2/core/server_base.py
index f9d6cfd..be7db39 100644
--- a/mlxk2/core/server_base.py
+++ b/mlxk2/core/server_base.py
@@ -118,8 +118,6 @@ def get_or_load_model(model_spec: str, verbose: bool = False) -> Any:
             raise HTTPException(status_code=503, detail="Server is shutting down")
         # Simple approach like run command - let MLXRunner handle everything
         if _current_model_path != model_spec:
-            logger.info(f"Switching to model: {model_spec}", model=model_spec)
-
             # Clean up previous model
             if _model_cache:
                 try:
@@ -229,8 +227,7 @@ def get_or_load_model(model_spec: str, verbose: bool = False) -> Any:
                 _model_cache[model_spec] = runner
                 _current_model_path = model_spec
 
-                backend_name = "vision" if policy.backend == Backend.MLX_VLM else "text"
-                logger.info(f"Model loaded successfully ({backend_name}): {model_spec}", model=model_spec)
+                logger.info(f"Switched to model: {model_spec}", model=model_spec)
 
             except HTTPException:
                 # Re-raise HTTP exceptions (501, 507, etc.) from vision/memory checks
@@ -767,11 +764,10 @@ async def list_models():
     """List available MLX models in the cache.
 
     Returns models sorted with preloaded model first (if set), then alphabetically.
-    Filters to healthy MLX models (runtime compatibility deferred to P2 refactoring).
+    Filters to healthy + runtime_compatible models.
     """
     from .cache import cache_dir_to_hf
-    from ..operations.common import detect_framework, read_front_matter
-    from ..operations.health import is_model_healthy
+    from ..operations.common import build_model_object
 
     model_list = []
     model_cache = get_current_model_cache()
@@ -783,8 +779,7 @@ async def list_models():
         model_name = cache_dir_to_hf(model_dir.name)
 
         try:
-            # Check if it's a healthy MLX model
-            # Get the latest snapshot for detection
+            # Get snapshot path
             snapshots_dir = model_dir / "snapshots"
             selected_path = None
             if snapshots_dir.exists():
@@ -792,27 +787,21 @@ async def list_models():
                 if snapshots:
                     selected_path = snapshots[0]
 
-            # Read front-matter for framework detection (align with CLI behavior)
-            probe = selected_path if selected_path is not None else model_dir
-            fm = read_front_matter(probe)
+            # Use shared build_model_object (single source of truth)
+            model_obj = build_model_object(model_name, model_dir, selected_path)
 
-            framework = detect_framework(model_name, model_dir, selected_path, fm)
-            healthy, _ = is_model_healthy(model_name)
-
-            # Filter: Only MLX + healthy models
-            # TODO P2: Add runtime_compatible check (needs refactoring to avoid duplication)
-            if framework != "MLX" or not healthy:
+            # Filter: healthy AND runtime_compatible
+            if model_obj.get("health") != "healthy":
+                continue
+            if not model_obj.get("runtime_compatible"):
                 continue
 
             # Get model context length (best effort)
             context_length = None
             try:
-                snapshots_dir = model_dir / "snapshots"
-                if snapshots_dir.exists():
-                    snapshots = [d for d in snapshots_dir.iterdir() if d.is_dir()]
-                    if snapshots:
-                        from .runner import get_model_context_length
-                        context_length = get_model_context_length(str(snapshots[0]))
+                if selected_path:
+                    from .runner import get_model_context_length
+                    context_length = get_model_context_length(str(selected_path))
             except Exception:
                 pass
 
diff --git a/mlxk2/core/vision_runner.py b/mlxk2/core/vision_runner.py
index bce9071..e41409d 100644
--- a/mlxk2/core/vision_runner.py
+++ b/mlxk2/core/vision_runner.py
@@ -97,9 +97,8 @@ class VisionRunner:
             raise RuntimeError("mlx-vlm is missing load()/generate() API")
 
         # mlx-vlm expects HF repo_id, not local path
-        # fix_mistral_regex=True: Suppress tokenizer regex warning for Mistral-based models
         # local_files_only=True: Use mlx-knife's cache only, never download (pull's responsibility)
-        loaded = self._load(self.model_name, fix_mistral_regex=True, local_files_only=True)
+        loaded = self._load(self.model_name, local_files_only=True)
         if isinstance(loaded, tuple):
             # Common pattern: (model, processor)
             self.model = loaded[0] if len(loaded) > 0 else None
@@ -256,9 +255,9 @@ class VisionRunner:
                 lat = convert_to_degrees(gps_dict.get("GPSLatitude"))
                 lon = convert_to_degrees(gps_dict.get("GPSLongitude"))
 
-                if lat and gps_dict.get("GPSLatitudeRef") == "S":
+                if lat is not None and gps_dict.get("GPSLatitudeRef") == "S":
                     lat = -lat
-                if lon and gps_dict.get("GPSLongitudeRef") == "W":
+                if lon is not None and gps_dict.get("GPSLongitudeRef") == "W":
                     lon = -lon
 
                 exif.gps_lat = lat
@@ -280,7 +279,7 @@ class VisionRunner:
                 exif.camera = str(camera).strip()
 
             # Return None if no useful EXIF found
-            if not any([exif.gps_lat, exif.gps_lon, exif.datetime, exif.camera]):
+            if all(x is None for x in [exif.gps_lat, exif.gps_lon, exif.datetime, exif.camera]):
                 return None
 
             return exif
diff --git a/mlxk2/operations/common.py b/mlxk2/operations/common.py
index 3e96519..1b16d5a 100644
--- a/mlxk2/operations/common.py
+++ b/mlxk2/operations/common.py
@@ -144,7 +144,8 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
     MLX if:
     - org is mlx-community/*, or
     - README front-matter tags include 'mlx', or
-    - README front-matter library_name == 'mlx'.
+    - README front-matter library_name == 'mlx', or
+    - config.json contains 'quantization' key (MLX-specific).
 
     Else GGUF if any *.gguf present under selected_path or snapshots.
     Else PyTorch if any *.safetensors or pytorch_model.bin present under snapshots.
@@ -154,6 +155,13 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
         if "mlx-community/" in hf_name:
             return "MLX"
 
+        # Search location preference: selected snapshot, else model root
+        root = selected_path if selected_path is not None else model_root
+
+        # Read front-matter if not provided (Issue #48: self-contained detection)
+        if fm is None:
+            fm = read_front_matter(root)
+
         # Front-matter signals
         if fm is not None:
             tags = [t.lower() for t in (fm.tags or [])]
@@ -161,8 +169,10 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
             if "mlx" in tags or lib == "mlx":
                 return "MLX"
 
-        # Search location preference: selected snapshot, else model root
-        root = selected_path if selected_path is not None else model_root
+        # Config-based detection: 'quantization' key is MLX-specific (Issue #48)
+        config = _load_config_json(root)
+        if config and "quantization" in config:
+            return "MLX"
 
         if _has_any(root, ("**/*.gguf",)):
             return "GGUF"
@@ -176,7 +186,7 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat
     return "Unknown"
 
 
-def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints: Dict[str, Any]) -> str:
+def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints: Dict[str, Any], probe: Optional[Path] = None) -> str:
     name = hf_name.lower()
     if "embed" in name:
         return "embedding"
@@ -190,13 +200,20 @@ def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints:
     ct = tok_hints.get("chat_template")
     if isinstance(ct, str) and ct.strip():
         return "chat"
+    # Check for chat_template.json file (Issue #48: reliable indicator)
+    if probe is not None and (probe / "chat_template.json").exists():
+        return "chat"
     if "instruct" in name or "chat" in name:
         return "chat"
     return "base"
 
 
 def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> bool:
-    """Detect whether the model snapshot supports vision inputs."""
+    """Detect whether the model snapshot supports vision inputs.
+
+    Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision.
+    mlx-vlm only supports image vision models (AutoImageProcessor).
+    """
     try:
         if isinstance(config, dict):
             mt = config.get("model_type")
@@ -208,6 +225,9 @@ def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> b
 
             preprocessor_cfg = config.get("preprocessor_config")
             if isinstance(preprocessor_cfg, dict):
+                # Exclude video processors (requires PyTorch/Torchvision)
+                if preprocessor_cfg.get("processor_class") == "AutoVideoProcessor":
+                    return False
                 return True
 
         if _has_any(
@@ -221,6 +241,25 @@ def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> b
                 "**/image_processor_config.json",
             ),
         ):
+            # Check if it's a video processor (requires PyTorch/Torchvision)
+            # Video models have video_preprocessor_config.json or temporal_patch_size
+            if (probe / "video_preprocessor_config.json").exists():
+                return False
+
+            preprocessor_path = probe / "preprocessor_config.json"
+            if preprocessor_path.exists():
+                try:
+                    import json
+                    with open(preprocessor_path) as f:
+                        preprocessor_data = json.load(f)
+                    if isinstance(preprocessor_data, dict):
+                        # Video model indicators
+                        if preprocessor_data.get("processor_class") == "AutoVideoProcessor":
+                            return False
+                        if "temporal_patch_size" in preprocessor_data:
+                            return False
+                except Exception:
+                    pass
             return True
     except Exception:
         return False
@@ -308,7 +347,7 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P
     config = _load_config_json(probe)
 
     framework = detect_framework(hf_name, model_root, selected_path=selected_path, fm=fm)
-    model_type = detect_model_type(hf_name, config, tok)
+    model_type = detect_model_type(hf_name, config, tok, probe)
     capabilities = detect_capabilities(model_type, hf_name, tok, config, probe)
     has_vision = "vision" in capabilities
 
@@ -316,17 +355,21 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P
     healthy, health_reason = is_model_healthy(hf_name)
 
     # Runtime compatibility: ALWAYS computed (gate logic applies)
-    # Gate: Only check runtime if file integrity is healthy
+    # Gate 1: File integrity must be healthy
+    # Gate 2: Framework must be MLX (only backend supported)
     runtime_reason: Optional[str] = None
-    if healthy:
-        if has_vision:
-            runtime_compatible, runtime_reason = vision_runtime_compatibility()
-        else:
-            runtime_compatible, runtime_reason = check_runtime_compatibility(probe, framework)
-    else:
+    if not healthy:
         # File integrity failed → skip runtime check
         runtime_compatible = False
         runtime_reason = None  # health_reason takes precedence
+    elif framework != "MLX":
+        # Non-MLX frameworks not supported (PyTorch, GGUF, etc.)
+        runtime_compatible = False
+        runtime_reason = f"Incompatible framework: {framework}"
+    elif has_vision:
+        runtime_compatible, runtime_reason = vision_runtime_compatibility()
+    else:
+        runtime_compatible, runtime_reason = check_runtime_compatibility(probe, framework)
 
     # Reason field: First problem encountered (health → runtime)
     reason = health_reason if not healthy else runtime_reason
diff --git a/mlxk2/output/human.py b/mlxk2/output/human.py
index 7c56dc3..ef59c9b 100644
--- a/mlxk2/output/human.py
+++ b/mlxk2/output/human.py
@@ -134,25 +134,20 @@ def render_list(data: Dict[str, Any], show_health: bool, show_all: bool, verbose
             headers.append("Health")
 
     # Human filter:
-    # - --all: show everything
-    # - default: show only MLX chat models (safer for run/server selection)
-    # - --verbose (without --all): show all MLX models (chat + base)
+    # - --all: show everything (no filter)
+    # - default/verbose: only healthy + runtime_compatible (runnable models)
+    # Same filter as Server /v1/models - single source of truth via build_model_object
     filtered: List[Dict[str, Any]] = []
     for m in models:
-        fw = str(m.get("framework", "")).upper()
-        typ = str(m.get("model_type", "")).lower()
         if show_all:
             filtered.append(m)
         else:
-            if fw != "MLX":
+            # Filter: healthy AND runtime_compatible
+            if m.get("health") != "healthy":
                 continue
-            if verbose:
-                # In verbose mode, show all MLX models
-                filtered.append(m)
-            else:
-                # Default compact mode: only MLX chat
-                if typ == "chat":
-                    filtered.append(m)
+            if not m.get("runtime_compatible"):
+                continue
+            filtered.append(m)
 
     rows: List[List[str]] = []
     for m in filtered:
diff --git a/pyproject.toml b/pyproject.toml
index f397ce9..7dea8f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ classifiers = [
     "License :: OSI Approved :: Apache Software License",
 ]
 dependencies = [
-    "huggingface-hub>=0.34.0,<1.0",
+    "huggingface-hub>=0.34.0",
     "requests>=2.32.0",
     "mlx-lm>=0.28.4",
     "mlx>=0.29.0",
@@ -66,7 +66,7 @@ dev = [
     "mypy>=1.5.0",
 ]
 vision = [
-    "mlx-vlm>=0.3.9",  # Vision Language Models support (ADR-012, requires Python 3.10+)
+    "mlx-vlm>=0.3.9",  # Vision Language Models support (ADR-012, requires Python 3.10+; beta.3 recommends mlx-vlm commit c4ea290e47e2155b67d94c708c662f8ab64e1b37)
 ]
 
 [tool.setuptools]
diff --git a/tests_2.0/live/conftest.py b/tests_2.0/live/conftest.py
index dda3ac2..b7ff59e 100644
--- a/tests_2.0/live/conftest.py
+++ b/tests_2.0/live/conftest.py
@@ -21,6 +21,7 @@ from .test_utils import (
     discover_mlx_models_in_user_cache,
     discover_text_models,
     discover_vision_models,
+    parse_vm_stat_page_size,
     TEST_MODELS,
 )
 
@@ -499,9 +500,176 @@ def report_benchmark(request):
 
 
 # ============================================================================
-# Benchmark Reporting (ADR-013 Phase 0)
+# Benchmark Reporting (ADR-013 Phase 0 + 0.5)
 # ============================================================================
 
+def _get_macos_system_health() -> Dict[str, Any]:
+    """Collect macOS system health metrics (ADR-013 Phase 0.5 - v0.2.0).
+
+    Uses macOS-native tools (sysctl, vm_stat, ps) - ZERO new dependencies.
+    Enables automatic regression quality assessment via quality_flags.
+
+    Returns:
+        dict: System health metrics with keys:
+            - swap_used_mb: Current swap usage in MB
+            - ram_free_gb: Free RAM in GB (vm_stat "Pages free" only)
+            - zombie_processes: Count of zombie processes
+            - quality_flags: List of quality indicators
+                ["clean"] = healthy system
+                ["degraded_swap"] = swap usage detected (memory pressure)
+                ["degraded_zombies"] = zombie processes detected
+
+    Quality Thresholds (empirically derived from Session 43 analysis):
+        - Swap: >100 MB indicates memory pressure (beta2→beta3: 1.8 GB swap = +3.4% slowdown)
+        - Zombies: >0 indicates stuck processes (REGRESSION-2025-12-08: 14 zombies = +90% slowdown)
+    """
+    import subprocess
+
+    health = {
+        "swap_used_mb": 0,
+        "ram_free_gb": 0.0,
+        "zombie_processes": 0,
+        "quality_flags": []
+    }
+
+    try:
+        # Get swap usage via sysctl (macOS native)
+        # sysctl vm.swapusage returns: "vm.swapusage: total = 0.00M  used = 0.00M  free = 0.00M  (encrypted)"
+        result = subprocess.run(
+            ["sysctl", "vm.swapusage"],
+            capture_output=True,
+            text=True,
+            timeout=2
+        )
+        if result.returncode == 0:
+            # Parse: "total = X.XXM  used = Y.YYM  free = Z.ZZM"
+            # Take the token right after "used = " directly. The previous
+            # per-token loop only ran this when some token ended in "M",
+            # which made the G-suffix branch unreachable for all-G output.
+            _, marker, tail = result.stdout.partition("used = ")
+            tokens = tail.split() if marker else []
+            if tokens:
+                used_str = tokens[0]
+                # Parse size (can be M or G suffix)
+                if used_str.endswith("G"):
+                    health["swap_used_mb"] = int(float(used_str[:-1]) * 1024)
+                elif used_str.endswith("M"):
+                    health["swap_used_mb"] = int(float(used_str[:-1]))
+    except Exception:
+        pass  # Swap metric is optional (not critical if it fails)
+
+    try:
+        # Get free RAM via vm_stat (macOS native)
+        # vm_stat reports page size in the header (Apple Silicon uses 16KB pages).
+        result = subprocess.run(
+            ["vm_stat"],
+            capture_output=True,
+            text=True,
+            timeout=2
+        )
+        if result.returncode == 0:
+            page_size = parse_vm_stat_page_size(result.stdout)
+            # Parse "Pages free: 12345."
+            for line in result.stdout.splitlines():
+                if "Pages free:" in line:
+                    pages_free = int(line.split(":")[1].strip().rstrip("."))
+                    health["ram_free_gb"] = round(pages_free * page_size / (1024**3), 2)
+                    break
+    except Exception:
+        pass  # RAM metric is optional
+
+    try:
+        # Get zombie process count via ps aux (macOS native)
+        # Zombies show as "<defunct>" in ps output
+        result = subprocess.run(
+            ["ps", "aux"],
+            capture_output=True,
+            text=True,
+            timeout=2
+        )
+        if result.returncode == 0:
+            # Count occurrences of "<defunct>" in the ps output
+            health["zombie_processes"] = result.stdout.count("<defunct>")
+    except Exception:
+        pass  # Zombie count is optional
+
+    # Determine quality flags (empirical thresholds from regression analysis)
+    flags = []
+    if health["swap_used_mb"] > 100:
+        flags.append("degraded_swap")
+    if health["zombie_processes"] > 0:
+        flags.append("degraded_zombies")
+
+    # If no degradation detected, mark as clean
+    if not flags:
+        flags.append("clean")
+
+    health["quality_flags"] = flags
+    return health
+
+
+def _get_macos_hardware_profile() -> Dict[str, Any]:
+    """Collect macOS hardware profile (ADR-013 Phase 0.5 - v0.2.0).
+
+    Uses macOS-native sysctl - ZERO new dependencies.
+    Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4).
+
+    Returns:
+        dict: Hardware profile with keys:
+            - model: Mac model identifier from sysctl hw.model (e.g., "Mac14,9")
+            - cores_physical: Physical CPU core count (sysctl hw.physicalcpu)
+            - cores_logical: Logical CPU core count (sysctl hw.logicalcpu)
+    """
+    import subprocess
+
+    profile = {
+        "model": "unknown",
+        "cores_physical": 0,
+        "cores_logical": 0,
+    }
+
+    try:
+        # Get Mac model identifier
+        result = subprocess.run(
+            ["sysctl", "-n", "hw.model"],
+            capture_output=True,
+            text=True,
+            timeout=2
+        )
+        if result.returncode == 0:
+            profile["model"] = result.stdout.strip()
+    except Exception:
+        pass  # Best effort: keep the "unknown" default
+
+    try:
+        # Get physical core count (hw.physicalcpu)
+        result = subprocess.run(
+            ["sysctl", "-n", "hw.physicalcpu"],
+            capture_output=True,
+            text=True,
+            timeout=2
+        )
+        if result.returncode == 0:
+            profile["cores_physical"] = int(result.stdout.strip())
+    except Exception:
+        pass  # Best effort: keep the 0 default
+
+    try:
+        # Get logical core count (hw.logicalcpu)
+        result = subprocess.run(
+            ["sysctl", "-n", "hw.logicalcpu"],
+            capture_output=True,
+            text=True,
+            timeout=2
+        )
+        if result.returncode == 0:
+            profile["cores_logical"] = int(result.stdout.strip())
+    except Exception:
+        pass  # Best effort: keep the 0 default
+
+    return profile
+
+
 def pytest_addoption(parser):
     """Add --report-output option for benchmark reporting."""
     parser.addoption(
@@ -509,7 +677,7 @@ def pytest_addoption(parser):
         action="store",
         default=None,
         metavar="PATH",
-        help="Generate benchmark reports to JSONL file (ADR-013 Phase 0)"
+        help="Generate benchmark reports to JSONL file (ADR-013 Phase 0.5)"
     )
 
 
@@ -534,11 +702,16 @@ def pytest_runtest_makereport(item, call):
     Reports are written as JSONL (one JSON object per line) to allow
     streaming and easy appending across test runs.
 
-    Schema version: 0.1.0 (Phase 0 - Experimental)
-    See: benchmarks/schemas/report-v0.1.schema.json
+    Schema version: 0.2.0 (Phase 0.5 - System Health + Hardware Profile)
+    See: ADR-013 Phase 0.5 implementation
+
+    Changelog from 0.1.0 → 0.2.0:
+        - Added: system.hardware_profile (Mac model, cores)
+        - Added: system_health (swap, RAM, zombies, quality_flags)
+        - Backward compatible: All 0.1.0 fields preserved (timestamp now ends in "+00:00", not "Z")
     """
     import json
-    from datetime import datetime
+    from datetime import datetime, timezone
 
     outcome = yield
     report = outcome.get_result()
@@ -553,8 +726,8 @@ def pytest_runtest_makereport(item, call):
 
         # Build report data (required fields)
         data = {
-            "schema_version": "0.1.0",
-            "timestamp": datetime.utcnow().isoformat() + "Z",
+            "schema_version": "0.2.0",
+            "timestamp": datetime.now(timezone.utc).isoformat(),
             "mlx_knife_version": __version__,
             "test": item.nodeid,
             "outcome": report.outcome,
@@ -581,6 +754,20 @@ def pytest_runtest_makereport(item, call):
                 # Everything else goes to metadata
                 data.setdefault("metadata", {})[key] = value
 
+        # ADR-013 Phase 0.5: Collect system health metrics (v0.2.0)
+        # Enables automatic regression quality assessment
+        system_health = _get_macos_system_health()
+        data["system_health"] = system_health
+
+        # ADR-013 Phase 0.5: Collect hardware profile (v0.2.0)
+        # Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4)
+        hardware_profile = _get_macos_hardware_profile()
+
+        # Add hardware_profile to system section (create if not exists)
+        if "system" not in data:
+            data["system"] = {}
+        data["system"]["hardware_profile"] = hardware_profile
+
         # Write JSONL (one line per report)
         try:
             item.config.report_file.write(json.dumps(data) + "\n")
@@ -588,4 +775,3 @@ def pytest_runtest_makereport(item, call):
         except Exception as e:
             # Don't fail tests if reporting fails
             print(f"\n⚠️  Benchmark report write failed: {e}")
-
diff --git a/tests_2.0/live/test_server_e2e.py b/tests_2.0/live/test_server_e2e.py
index a877fa7..5803ee6 100644
--- a/tests_2.0/live/test_server_e2e.py
+++ b/tests_2.0/live/test_server_e2e.py
@@ -42,6 +42,8 @@ from .test_utils import (
 # Server request timeout (increased from 30s to 45s in Session 22)
 # Accounts for: baseline (15s) + probe/policy overhead (2.7s) + generation + safety margin
 SERVER_REQUEST_TIMEOUT = 45.0
+# /v1/models can exceed httpx's default 5 s timeout (cache scans + runtime checks)
+MODEL_LIST_TIMEOUT = 20.0
 
 # Opt-in markers
 pytestmark = [
@@ -100,7 +102,7 @@ class TestServerHealthEndpoints:
             pytest.skip("No text models available within RAM budget")
 
         with LocalServer(test_model) as server_url:
-            response = httpx.get(f"{server_url}/v1/models")
+            response = httpx.get(f"{server_url}/v1/models", timeout=MODEL_LIST_TIMEOUT)
 
             assert response.status_code == 200
             data = response.json()
diff --git a/tests_2.0/live/test_utils.py b/tests_2.0/live/test_utils.py
index 3022f23..c6f2141 100644
--- a/tests_2.0/live/test_utils.py
+++ b/tests_2.0/live/test_utils.py
@@ -8,6 +8,7 @@ Provides:
 
 from __future__ import annotations
 
+import re
 import sys
 from pathlib import Path
 from typing import Dict, Any, Tuple
@@ -108,6 +109,14 @@ def get_system_memory_bytes() -> int:
     return 0
 
 
+def parse_vm_stat_page_size(output: str) -> int:
+    """Return the page size in bytes from a vm_stat header, or 4096 if absent."""
+    match = re.search(r"page size of (\d+) bytes", output)
+    if match:
+        return int(match.group(1))
+    return 4096  # fallback when no "page size of N bytes" text is found
+
+
 def discover_text_models() -> list[Dict[str, Any]]:
     """Discover text-only models (filter out Vision models).
 
diff --git a/tests_2.0/live/test_vm_stat_parsing.py b/tests_2.0/live/test_vm_stat_parsing.py
new file mode 100644
index 0000000..500a6ab
--- /dev/null
+++ b/tests_2.0/live/test_vm_stat_parsing.py
@@ -0,0 +1,13 @@
+"""Unit tests for vm_stat parsing helpers."""
+
+from .test_utils import parse_vm_stat_page_size
+
+
+def test_parse_vm_stat_page_size_apple_silicon():
+    output = "Mach Virtual Memory Statistics: (page size of 16384 bytes)\nPages free: 12345."
+    assert parse_vm_stat_page_size(output) == 16384  # value is read from the header line
+
+
+def test_parse_vm_stat_page_size_fallback():
+    output = "Pages free: 12345."  # no "page size of N bytes" header present
+    assert parse_vm_stat_page_size(output) == 4096
diff --git a/tests_2.0/test_human_output.py b/tests_2.0/test_human_output.py
index bcd3d21..fec5e06 100644
--- a/tests_2.0/test_human_output.py
+++ b/tests_2.0/test_human_output.py
@@ -18,6 +18,7 @@ def sample_list_data():
                     "model_type": "chat",
                     "capabilities": ["text-generation", "chat"],
                     "health": "healthy",
+                    "runtime_compatible": True,
                     "cached": True,
                 },
                 {
@@ -29,6 +30,7 @@ def sample_list_data():
                     "model_type": "base",
                     "capabilities": ["text-generation"],
                     "health": "unhealthy",
+                    "runtime_compatible": False,
                     "cached": True,
                 },
             ],
@@ -98,6 +100,7 @@ def test_list_human_filters_mlx_base_default():
                     "model_type": "chat",
                     "capabilities": ["text-generation", "chat"],
                     "health": "healthy",
+                    "runtime_compatible": True,
                     "cached": True,
                 },
                 {
@@ -109,26 +112,42 @@ def test_list_human_filters_mlx_base_default():
                     "model_type": "base",
                     "capabilities": ["text-generation"],
                     "health": "healthy",
+                    "runtime_compatible": True,
+                    "cached": True,
+                },
+                {
+                    "name": "org/Unhealthy",
+                    "hash": None,
+                    "size_bytes": 500,
+                    "last_modified": "2025-08-30T12:00:00Z",
+                    "framework": "MLX",
+                    "model_type": "chat",
+                    "capabilities": ["text-generation"],
+                    "health": "unhealthy",
+                    "runtime_compatible": False,
                     "cached": True,
                 },
             ],
-            "count": 2,
+            "count": 3,
         },
         "error": None,
     }
 
-    # Default (compact) should hide MLX base
+    # Default: shows healthy + runtime_compatible models (both MLXChat and MLXBase)
     out_default = render_list(data, show_health=False, show_all=False, verbose=False)
     assert "MLXChat" in out_default
-    assert "MLXBase" not in out_default
+    assert "MLXBase" in out_default
+    assert "Unhealthy" not in out_default
 
-    # Verbose (without --all) shows all MLX (chat + base)
+    # Verbose: same filter, more columns
     out_verbose = render_list(data, show_health=False, show_all=False, verbose=True)
     assert "MLXChat" in out_verbose
     assert "MLXBase" in out_verbose
+    assert "Unhealthy" not in out_verbose
 
 
-def test_list_human_verbose_shows_all_mlx_only():
+def test_list_human_filters_by_healthy_and_runtime_compatible():
+    """Test that default/verbose filters by healthy + runtime_compatible."""
     from mlxk2.output.human import render_list
 
     data = {
@@ -136,9 +155,9 @@ def test_list_human_verbose_shows_all_mlx_only():
         "command": "list",
         "data": {
             "models": [
-                {"name": "org/MLXChat", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", "cached": True},
-                {"name": "org/MLXBase", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "cached": True},
-                {"name": "org/OtherPT", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "PyTorch", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "cached": True},
+                {"name": "org/Runnable", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", "runtime_compatible": True, "cached": True},
+                {"name": "org/Unhealthy", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "base", "capabilities": ["text-generation"], "health": "unhealthy", "runtime_compatible": True, "cached": True},
+                {"name": "org/NotCompatible", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "PyTorch", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "runtime_compatible": False, "cached": True},
             ],
             "count": 3,
         },
@@ -146,11 +165,12 @@ def test_list_human_verbose_shows_all_mlx_only():
     }
 
     out_verbose = render_list(data, show_health=False, show_all=False, verbose=True)
-    # Shows both MLX models (chat+base)
-    assert "MLXChat" in out_verbose
-    assert "MLXBase" in out_verbose
-    # Hides non-MLX
-    assert "OtherPT" not in out_verbose
+    # Shows only healthy + runtime_compatible
+    assert "Runnable" in out_verbose
+    # Hides unhealthy
+    assert "Unhealthy" not in out_verbose
+    # Hides not runtime_compatible
+    assert "NotCompatible" not in out_verbose
 
 
 def test_list_human_all_shows_all_frameworks():
diff --git a/tests_2.0/test_issue_30_preflight.py b/tests_2.0/test_issue_30_preflight.py
index 722a57e..5496bed 100644
--- a/tests_2.0/test_issue_30_preflight.py
+++ b/tests_2.0/test_issue_30_preflight.py
@@ -4,6 +4,21 @@ import pytest
 from mlxk2.operations.pull import preflight_repo_access, pull_operation
 
 
+def _create_mock_response(status_code=403):
+    """Build an httpx.Response to pass as `response=` to huggingface-hub errors.
+
+    Hub 1.x requires a real httpx.Response there; returns None if httpx is missing.
+    """
+    try:
+        import httpx
+        # Minimal response: just a status code plus the request it answers
+        request = httpx.Request("GET", "https://huggingface.co/api/models/test")
+        return httpx.Response(status_code=status_code, request=request)
+    except ImportError:
+        # Fallback for older hub versions that don't need it
+        return None
+
+
 def test_preflight_private_model_without_token(monkeypatch):
     """Test preflight check with a known private model without token.
     
@@ -29,7 +44,8 @@ def test_preflight_private_model_without_token(monkeypatch):
         from huggingface_hub import errors as _hub_errors
         GatedRepoError = _hub_errors.GatedRepoError
         def _fake_model_info(self, repo_id, token=None):
-            raise GatedRepoError("Gated/private repository")
+            response = _create_mock_response(status_code=403)
+            raise GatedRepoError("Gated/private repository", response=response)
         monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
 
         success, error = preflight_repo_access("org/private-model")
@@ -53,7 +69,8 @@ def test_preflight_nonexistent_model(monkeypatch):
     from huggingface_hub import errors as _hub_errors
     RepositoryNotFoundError = _hub_errors.RepositoryNotFoundError
     def _fake_model_info(self, repo_id, token=None):
-        raise RepositoryNotFoundError("Not found")
+        response = _create_mock_response(status_code=404)
+        raise RepositoryNotFoundError("Not found", response=response)
     monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
 
     success, error = preflight_repo_access("definitely-not-existing-model-12345-xyz")
@@ -78,7 +95,8 @@ def test_preflight_integration_in_pull(isolated_cache, monkeypatch):
     from huggingface_hub import errors as _hub_errors
     RepositoryNotFoundError = _hub_errors.RepositoryNotFoundError
     def _fake_model_info(self, repo_id, token=None):
-        raise RepositoryNotFoundError("Not found")
+        response = _create_mock_response(status_code=404)
+        raise RepositoryNotFoundError("Not found", response=response)
     monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
 
     # Test with a non-existent model - should fail at preflight stage
@@ -145,7 +163,8 @@ def test_preflight_prevents_cache_pollution(isolated_cache, monkeypatch):
     from huggingface_hub import errors as _hub_errors
     GatedRepoError = _hub_errors.GatedRepoError
     def _fake_model_info(self, repo_id, token=None):
-        raise GatedRepoError("Gated/private repository")
+        response = _create_mock_response(status_code=403)
+        raise GatedRepoError("Gated/private repository", response=response)
     monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True)
 
     # Attempt to pull a gated/private model
diff --git a/tests_2.0/test_server_models_and_errors.py b/tests_2.0/test_server_models_and_errors.py
index 5f96191..a5d3733 100644
--- a/tests_2.0/test_server_models_and_errors.py
+++ b/tests_2.0/test_server_models_and_errors.py
@@ -63,45 +63,57 @@ def test_unknown_model_maps_to_404():
         assert resp.status_code == 404
 
 
-def test_models_endpoint_filters_non_mlx_and_unhealthy():
-    """Ensure /v1/models excludes non-MLX and unhealthy entries."""
+def test_models_endpoint_filters_unhealthy_and_not_runtime_compatible():
+    """Ensure /v1/models excludes unhealthy and non-runtime-compatible entries.
+
+    Filter logic: healthy == True AND runtime_compatible == True
+    Uses shared build_model_object from common.py (single source of truth).
+    """
     client = TestClient(app)
 
     with patch('mlxk2.core.server_base.get_current_model_cache') as mock_cache, \
          patch('mlxk2.core.cache.cache_dir_to_hf') as mock_cache_to_hf, \
-         patch('mlxk2.operations.common.detect_framework') as mock_framework, \
-         patch('mlxk2.operations.health.is_model_healthy') as mock_healthy:
+         patch('mlxk2.operations.common.build_model_object') as mock_build:
 
-        # Two cached dirs
-        d1 = MagicMock(); d1.name = "models--org--mlx"
-        d2 = MagicMock(); d2.name = "models--org--pt"
-        mock_cache.return_value.iterdir.return_value = [d1, d2]
+        # Three cached dirs with proper snapshot structure
+        d1 = MagicMock(); d1.name = "models--org--healthy-compatible"
+        d2 = MagicMock(); d2.name = "models--org--unhealthy"
+        d3 = MagicMock(); d3.name = "models--org--not-compatible"
+
+        # Set up snapshot mocks per model dir; the lambda defaults bind them per iteration
+        for d in [d1, d2, d3]:
+            snapshot_dir = MagicMock()
+            snapshot_path = MagicMock()
+            snapshot_dir.exists.return_value = True
+            snapshot_dir.iterdir.return_value = [snapshot_path]
+            snapshot_path.is_dir.return_value = True
+            d.__truediv__ = lambda self, x, snap=snapshot_dir, spath=snapshot_path: snap if x == "snapshots" else spath
+
+        mock_cache.return_value.iterdir.return_value = [d1, d2, d3]
 
         # Map names
         def map_name(n):
-            if n == "models--org--mlx":
-                return "org/mlx"
-            return "org/pt"
-
+            return n.replace("models--", "").replace("--", "/")
         mock_cache_to_hf.side_effect = map_name
 
-        # Framework detection: d1 is MLX, d2 is not
-        def detect_fw(model_name, *_args, **_kwargs):
-            return "MLX" if model_name.endswith("/mlx") else "PyTorch"
-
-        mock_framework.side_effect = detect_fw
-
-        # Health: return False for the MLX one to ensure it is filtered, too
-        def health(model_name):
-            return (False, None) if model_name.endswith("/mlx") else (True, None)
-
-        mock_healthy.side_effect = health
+        # build_model_object returns different health/runtime_compatible
+        def build(model_name, model_dir, selected_path):
+            if "unhealthy" in model_name:
+                return {"health": "unhealthy", "runtime_compatible": True}
+            elif "not-compatible" in model_name:
+                return {"health": "healthy", "runtime_compatible": False}
+            else:
+                return {"health": "healthy", "runtime_compatible": True}
+        mock_build.side_effect = build
 
         resp = client.get("/v1/models")
         assert resp.status_code == 200
         data = resp.json()
-        # Both should be filtered: one not MLX, one unhealthy
-        assert data.get("data") == []
+        # Only d1 (healthy + runtime_compatible) should pass
+        model_ids = [m["id"] for m in data.get("data", [])]
+        assert "org/healthy-compatible" in model_ids
+        assert "org/unhealthy" not in model_ids
+        assert "org/not-compatible" not in model_ids
 
 
 def test_chat_unknown_model_maps_to_404():