diff --git a/.gitignore b/.gitignore index 1487cdf..a33ade5 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ openwebui311/bin/ *_report.json test-img-collection/ small-img-collection +benchmarks/reports/*.html # Benchmark reports (ADR-013 Phase 0) # These reports ARE tracked in git for historical data diff --git a/CHANGELOG.md b/CHANGELOG.md index 49effbc..df79049 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,97 @@ # Changelog -## [2.0.4-beta.1] - WIP +## [2.0.4-beta.3] - 2025-12-23 + +### Added + +- **Benchmark Infrastructure v1.0 (ADR-013 Phase 0):** + - Template-based report generator: `benchmarks/generate_benchmark_report.py` + - Per-model statistics, per-test statistics, system health summary + - Schema validation: `benchmarks/validate_reports.py` + - Documentation: `benchmarks/README.md`, `benchmarks/TESTING.md` + - Quality tracking: Schema v0.2.0 with system_health (swap, RAM, zombies, quality_flags) + - Page-size fix: Corrected Apple Silicon 16KB page size (RAM values were 4x too low) + - Files: `tests_2.0/live/conftest.py`, `test_utils.py`, `test_vm_stat_parsing.py` + +- **Memory Timeline Visualization:** + - Interactive HTML visualizer: `benchmarks/tools/memplot.py` (500+ lines) + - Memory monitor enhanced: `benchmarks/tools/memmon.py` (memory pressure capture) + - Visual legend: Activity Monitor colors, memory pressure, test regions, model markers + - Documentation: Complete interpretation guide in `benchmarks/README.md` + - Schema learnings: Server test attribution problem + log-parsing solution documented + - File: `benchmarks/schemas/LEARNINGS-FOR-v1.0.md` + +### Fixed + +- **Server model switch log timing:** "Switched to model" now emitted only after successful load (past tense reflects completed action) + - File: `mlxk2/core/server_base.py:230` + +- **Unified model filter (Server + CLI):** Both `/v1/models` and `mlxk list` now use `build_model_object()` as single source of truth + - Filter: `healthy AND runtime_compatible` (no 
more code duplication) + - Framework gate: Non-MLX models (PyTorch, GGUF) now correctly marked `runtime_compatible=false` + - WebUI clients get consistent, runnable model lists + - Files: `mlxk2/core/server_base.py`, `mlxk2/output/human.py`, `mlxk2/operations/common.py` + +- **transformers 5.0 compatibility for vision models:** Removed `fix_mistral_regex` parameter from mlx-vlm load call + - transformers 5.0.0rc1 changed tokenizer initialization - `fix_mistral_regex` no longer accepted as kwarg + - Error was: `TypeError: _patch_mistral_regex() got multiple values for keyword argument 'fix_mistral_regex'` + - Removed deprecated parameter from vision model loading - all vision models now work with transformers 5.0 + - File: `mlxk2/core/vision_runner.py:101` + +- **huggingface-hub 1.x compatibility:** Updated preflight test mocks for hub 1.x exception API changes + - Hub 1.x changed exception signatures: `GatedRepoError/RepositoryNotFoundError` now require `response` parameter + - Added `_create_mock_response()` helper to create proper httpx.Response objects for test mocks + - **Test-only changes** - preflight production code works unchanged with hub 0.x and 1.x + - **Result:** mlx-knife now fully compatible with mlx 0.30.x, mlx-lm 0.30.0, transformers 5.0, hub 1.x + - All 494 unit tests pass, vision models functional with newest dependencies + - Files: `tests_2.0/test_issue_30_preflight.py`, `mlxk2/core/vision_runner.py` + +- **EXIF GPS 0° coordinate handling:** Fixed truthiness checks in `VisionRunner._extract_exif` that incorrectly dropped valid GPS coordinates + - Equator (0° latitude) and Prime Meridian (0° longitude) now correctly preserved + - Changed latitude/longitude negation checks from `if lat` to `if lat is not None` + - Changed EXIF retention check from `not any([...])` to `all(x is None for x in [...])` + - Ensures 0.0 is treated as valid coordinate, not as missing data + - File: `mlxk2/core/vision_runner.py:259-262, 283` + +- **Framework/Type detection 
for non-mlx-community models (Issue #48):** + - `detect_framework()`: Now reads front-matter internally and checks config.json `quantization` key (MLX-specific) + - `detect_model_type()`: Added `probe` parameter and checks for `chat_template.json` file (reliable chat indicator) + - Removed redundant PR #42 code from server_base.py (cleaner architecture) + - Fixes: Models like locally converted quantized models now correctly show "MLX" + "chat" instead of "PyTorch" + "base" + - Files: `mlxk2/operations/common.py:118-157, 180-208`, `mlxk2/core/server_base.py:114-120` + +- **Video model detection and exclusion:** + - Video models (require PyTorch/Torchvision) now excluded from vision capability detection + - mlx-vlm only supports image vision models, not video models + - Video indicators: `video_preprocessor_config.json`, `temporal_patch_size`, `AutoVideoProcessor` + - Video models fall back to mlx-lm for text-only (consistent with vision architecture) + - Example: `mlx-community/MiMo-VL-7B-RL-bf16` now classified as "chat" (not "chat+vision") + - Files: `mlxk2/operations/common.py:211-266`, `mlxk2/core/capabilities.py:169-238` + +### Documentation + +- **mlx-vlm beta.3 install guidance:** Recommend upstream commit `c4ea290e47e2155b67d94c708c662f8ab64e1b37` until mlx-vlm 0.3.10 is released + - Files: `README.md`, `docs/SERVER-HANDBOOK.md` + +## [2.0.4-beta.2] - 2025-12-16 + +**PyPI-only release** - Fixes Git dependency issue for PyPI compatibility. Not tagged on GitHub. 
+ +### Fixed + +- **PyPI compatibility:** Changed `mlx-vlm` dependency from Git URL to PyPI version `mlx-vlm>=0.3.9` + - PyPI does not allow Git dependencies + - mlx-vlm 0.3.9 is available on PyPI + - File: `pyproject.toml:69` + +### Documentation + +- **Installation instructions:** Added Vision-specific installation to README.md + - Clear separation: Text models (Python 3.9+) vs Vision models (Python 3.10+) + - Installation command: `pip install mlx-knife[vision]` + - Updated all version references from 2.0.4-beta.1 → 2.0.4-beta.2 + +## [2.0.4-beta.1] - 2025-12-16 **Focus:** Unix Pipe Integration + Vision Support + Memory-Aware Loading + Python 3.14 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 18e6e53..3d2a54f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -129,18 +129,15 @@ For detailed testing options, troubleshooting, and advanced workflows, see **[TE ### Before Submitting PRs -Please ensure all tests pass locally: -```bash -# Complete test workflow -ruff check mlxk2/ --fix # Fix code style -mypy mlxk2/ # Check types -pytest -v # Run all 2.0 tests -``` +**All tests must pass:** +- ✅ Code quality: `ruff check mlxk2/ --fix && mypy mlxk2/` +- ✅ Unit tests: `pytest tests_2.0/ -v` (always required) +- ✅ Live E2E tests: Required for model/inference changes -Since we don't have CI/CD (MLX requires Apple Silicon), we rely on contributors to verify their changes locally. Please mention in your PR: -- Which Python version you tested with -- Which Mac model you tested on (M1/M2/M3) -- Test results summary +**PR requirements:** +- State your Python version + Mac chip in PR description +- For model/inference changes: Document which live tests you ran +- **Important:** Unit tests alone are NOT sufficient - see **[TESTING.md](TESTING.md)** for why and how ## Python Version Requirements diff --git a/README.md b/README.md index 3c3862c..804ebb2 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ MLX Knife Demo

-**Current Version: 2.0.4-beta.2** (Stable: 2.0.3) +**Current Version: 2.0.4-beta.3** (Stable: 2.0.3) -[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.2-blue.svg)](https://github.com/mzau/mlx-knife/releases) +[![GitHub Release](https://img.shields.io/badge/version-2.0.4--beta.3-blue.svg)](https://github.com/mzau/mlx-knife/releases) [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/) [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-green.svg)](https://support.apple.com/en-us/HT211814) @@ -20,7 +20,7 @@ - **Model Information**: Detailed model metadata including quantization info - **Download Models**: Pull models from HuggingFace with progress tracking - **Run Models**: Native MLX execution with streaming and chat modes -- **Vision Models**: Image analysis (Python 3.10+, alpha) +- **Vision Models**: Image analysis (Python 3.10+, beta) - **Unix Pipes**: Chain models via stdin/stdout - no temp files (beta) - **Health Checks**: Verify model integrity and MLX runtime compatibility - **Cache Management**: Clean up and organize your model storage @@ -67,7 +67,7 @@ This license applies **only** to the `mlx-knife` code and **does not extend** to MLX Knife has been comprehensively tested and verified on: ✅ **Python 3.9.6 - 3.14** - Text LLMs fully supported (mlx-lm 0.28.4+) -✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+) +✅ **Python 3.10 - 3.14** - Vision models supported (mlx-vlm 0.3.9+; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37) **Note:** Vision features require Python 3.10+. Native macOS Python 3.9.6 users need to upgrade (e.g., via Homebrew). 
@@ -85,12 +85,17 @@ pip install mlx-knife pip install mlx-knife[vision] # Verify installation -mlxk --version # → mlxk 2.0.3 (stable) or 2.0.4-beta.2 (dev) +mlxk --version # → mlxk 2.0.3 (stable) or 2.0.4-beta.3 (dev) ``` **Python Requirements:** - **Text models:** Python 3.9-3.14 -- **Vision models:** Python 3.10-3.14 (requires `mlx-vlm>=0.3.9`) +- **Vision models:** Python 3.10-3.14 (requires `mlx-vlm>=0.3.9`; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37) + +**Beta.3 note:** Until mlx-vlm 0.3.10 is released, install the upstream commit before mlx-knife if you need the fix: +```bash +pip install "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37" +``` ### Development Installation @@ -106,7 +111,7 @@ pip install -e ".[dev,test]" pip install -e ".[dev,test,vision]" # Verify installation -mlxk --version # → mlxk 2.0.4-beta.2 +mlxk --version # → mlxk 2.0.4-beta.3 # Run tests and quality checks (before committing) pytest -v @@ -182,6 +187,100 @@ open index.html | 🔒 `pipe mode` | **Beta feature** - Unix pipes with `mlxk run - ...`; requires `MLXK2_ENABLE_PIPES=1` | +## Multi-Modal Support + +MLX Knife supports multiple input modalities beyond text. All multi-modal features share a **common output pattern**: model responses are followed by collapsible metadata tables for transparency and traceability. + +### Vision (Beta) + +Image analysis via the `--image` flag (CLI and server). Requires Python 3.10+. 
+ +#### Requirements + +- **Python 3.10+** (mlx-vlm dependency) +- **Installation:** `pip install mlx-knife[vision]` +- **Backend:** mlx-vlm 0.3.9+ from PyPI +- **Beta.3 note:** For upstream bugfixes, install commit `c4ea290e47e2155b67d94c708c662f8ab64e1b37` before mlx-knife: + ```bash + pip install "mlx-vlm @ git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37" + pip install mlx-knife[vision] + ``` + +#### Usage + +```bash +# Image analysis with custom prompt +mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" \ + --image photo.jpg "Describe what you see in detail" + +# Multiple images (space-separated or glob) +mlxk run vision-model --image img1.jpg img2.jpg img3.jpg "Compare these images" +mlxk run vision-model --image photos/*.jpg "Which images show outdoor scenes?" + +# Auto-prompt (default: "Describe the image.") +mlxk run vision-model --image cat.jpg + +# Text-only on vision model (no --image flag) +mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" "What is 2+2?" +``` + +#### Metadata Output Format + +When processing images, MLX Knife automatically appends metadata in a **collapsible table** (collapsed by default): + +``` +A beach with palm trees and clear blue water. + +
+📸 Image Metadata (2 images) + +| Image | Filename | Original | Location | Date | Camera | +|-------|----------|----------|----------|------|--------| +| 1 | image_abc123.jpeg | beach.jpg | 📍 32.79°N, 16.92°W | 📅 2023-12-06 12:19 | 📷 Apple iPhone SE | +| 2 | image_def456.jpeg | mountain.jpg | 📍 32.87°N, 17.17°W | 📅 2023-12-10 15:42 | 📷 Apple iPhone SE | + +
+``` + +**Metadata includes:** +- **Image ID** → **Filename mapping** (identify which description belongs to which file) +- **GPS coordinates** (latitude/longitude, if available in EXIF) +- **Capture date/time** (ISO 8601 format) +- **Camera model** (device info) + +**Privacy control:** + +EXIF extraction is **enabled by default**. To disable (e.g., for privacy-sensitive images): + +```bash +export MLXK2_EXIF_METADATA=0 +mlxk run vision-model --image photo.jpg "describe" +``` + +**Output is the same for CLI and server** - metadata tables work in terminals, web UIs (nChat), and can be parsed programmatically. + +#### Limitations + +- **Non-streaming:** Vision runs always use batch mode (no streaming output) +- **Image limits:** 5 images max per request, 20 MB per image, 50 MB total + +#### Server API + +Vision models work with OpenAI-compatible `/v1/chat/completions` endpoint using base64-encoded images: + +```bash +curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "llama-vision", + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} + ] + }] +}' +``` + ## JSON API @@ -211,28 +310,6 @@ mlxk show "Phi-3-mini" --json | jq '.data.model' ### Examples -#### Pipe mode (Alpha: set `MLXK2_ENABLE_PIPES=1`) - -```bash -# Read prompt from stdin and append trailing text (auto batch in pipes) -echo "from stdin" | MLXK2_ENABLE_PIPES=1 mlxk run "" - "append extra context" - -# JSON interactive guard (no prompt) emits JSON error on stdout, exit!=0 -MLXK2_ENABLE_PIPES=1 mlxk run "" --json - -# Pipe list JSON into run for summarization -MLXK2_ENABLE_PIPES=1 mlxk list --json \ - | MLXK2_ENABLE_PIPES=1 mlxk run "" - "Summarize the model list as a concise table." 
- -# Shortcut wrapper (same semantics) -MLXK2_ENABLE_PIPES=1 mlx-run "" - "translate into german" < README.md -``` - -Notes: -- Stdin requires `MLXK2_ENABLE_PIPES=1` (alpha gate). Without it, `-` is rejected. -- When stdout is a pipe (non-TTY), streaming is disabled automatically to keep clean output. -- Use full model IDs in place of ``; HF_HOME should point to your cache for live runs. - #### List Models ```bash mlxk list --json @@ -656,7 +733,7 @@ mlxk health --json | jq '.data.summary' ``` -## Hidden Alpha Features: `clone`, `push`, and pipe mode +## Feature Gates: `clone`, `push` (Alpha), `pipe mode` (Beta) ### `clone` - Model Workspace Creation @@ -710,38 +787,31 @@ These features are not final and may change or be removed in future releases. Pipe mode is beta (feature complete) and requires `MLXK2_ENABLE_PIPES=1`. It lets `mlxk run` (and `mlx-run`) read stdin when you pass `-` as the prompt. -- Gate: `MLXK2_ENABLE_PIPES=1` (will become default in a future stable release). -- Auto-batch: When stdout is a pipe (non-TTY), streaming is disabled automatically for clean output. -- Robust: Handles SIGPIPE and BrokenPipeError gracefully (`| head`, `| grep -m1` work correctly). -- Scope: Applies to `mlxk run` and `mlx-run`; other commands unchanged. 
+- **Status:** Beta (feature complete), API stable (syntax will not change) +- **Gate:** `MLXK2_ENABLE_PIPES=1` (will become default in a future stable release) +- **Auto-batch:** When stdout is a pipe (non-TTY), streaming is disabled automatically for clean output +- **Robust:** Handles SIGPIPE and BrokenPipeError gracefully (`| head`, `| grep -m1` work correctly) +- **Scope:** Applies to `mlxk run` and `mlx-run`; other commands unchanged - Usage examples (replace `` with a cached MLX chat model): ```bash # stdin + trailing text (batch when piped) MLXK2_ENABLE_PIPES=1 echo "from stdin" | mlxk run "" - "append extra context" -# JSON interactive guard (no prompt) → JSON error on stdout, exit 1 -MLXK2_ENABLE_PIPES=1 mlxk run "" --json - # list → run summarization MLXK2_ENABLE_PIPES=1 mlxk list --json \ - | MLXK2_ENABLE_PIPES=1 mlxk run "" - "Summarize the model list as a concise table." + | MLXK2_ENABLE_PIPES=1 mlxk run "" - "Summarize the model list as a concise table." >my-hf-table.md # Wrapper shorthand MLXK2_ENABLE_PIPES=1 mlx-run "" - "translate into german" < README.md + +# Vision → Text chain: Photo tour review +MLXK2_ENABLE_PIPES=1 mlxk run pixtral --image photos/*.jpg "Describe each picture" \ + | MLXK2_ENABLE_PIPES=1 mlxk run qwen3 - \ + "Write a tour review. Create a table with picture names, metadata, and descriptions." \ + > tour-review.md ``` -Pipe mode API is stable. - -### `vision` - mlx-vlm (Python 3.10+, non-streaming) - -- Install extras: `pip install -e .[vision]` (requires `mlx-vlm>=0.3.9` from PyPI, Python 3.10+). -- Backend: Uses `mlx-vlm` (vision); streaming is disabled for vision runs. -- Usage: - - Text-only on a vision model: `mlxk run "mlx-community/Llama-3.2-11B-Vision-Instruct-4bit" "what is 2+2"` - - Image + text: `mlxk run "" --image cat.jpg "describe the cat"` - - Image-only (auto prompt): `mlxk run "" --image cat.jpg` - ## Testing @@ -817,7 +887,7 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.

Made with ❤️ by The BROKE team BROKE Logo
- Version 2.0.4-beta.2 | December 2025
+ Version 2.0.4-beta.3 | December 2025
💬 Web UI: nChat - lightweight chat interface🔮 Multi-node: BROKE Cluster

diff --git a/TESTING-DETAILS.md b/TESTING-DETAILS.md index d07fb2c..a2f5d7d 100644 --- a/TESTING-DETAILS.md +++ b/TESTING-DETAILS.md @@ -847,7 +847,7 @@ MLXK2_LIVE_PUSH=1 \ --- -### Complete Test File Structure (2.0.4-beta.1) +### Complete Test File Structure (2.0.4-beta.3) ``` tests_2.0/ @@ -885,7 +885,8 @@ tests_2.0/ │ ├── test_server_e2e.py # Server E2E tests with TEXT models (ADR-011 + Portfolio Separation, parametrized: text_XX) │ ├── test_streaming_parity.py # Streaming vs batch parity tests (Issue #20, ADR-011, parametrized) │ ├── test_vision_e2e_live.py # Vision CLI E2E tests with real models (ADR-012, 5 deterministic vision queries) -│ └── test_vision_server_e2e.py # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX) +│ ├── test_vision_server_e2e.py # Vision Server E2E tests with VISION models (ADR-012 Phase 3 + Portfolio Separation, parametrized: vision_XX) +│ └── test_vm_stat_parsing.py # vm_stat output parsing validation (macOS memory metrics) ├── test_adr004_error_logging.py # ADR-004 error logging and redaction (tokens, paths) ├── test_capabilities.py # Probe/Policy architecture (ADR-012, ADR-016, Session 18-19, 45 tests) ├── test_cli_log_json_flag.py # CLI --log-json flag behavior and JSON log format diff --git a/TESTING.md b/TESTING.md index 5d5903e..cb027c5 100644 --- a/TESTING.md +++ b/TESTING.md @@ -19,6 +19,17 @@ For current test counts, version-specific details, and complete file listings, s - Delete operations fail if not in test cache (`MLXK2_STRICT_TEST_DELETE=1`) - Live tests never modify user cache without explicit environment variables +**Unit Test Limitations:** + +MLX Knife has two test categories: +1. **Unit tests** (~500 tests, fast, mocked) - verify code structure +2. 
**Live E2E tests** (real models, slow) - verify actual functionality + +**Why both are needed:** +When dependencies like `transformers` or `mlx-lm` update their APIs, unit tests (which mock these libraries) continue to pass, but real model loading breaks. Only live E2E tests catch these issues. + +**Example:** transformers 5.0 changed tokenizer initialization - unit tests passed (mocked API), but vision models failed to load in production. Live E2E tests caught the issue immediately. + ## Quick Start ```bash diff --git a/benchmarks/README.md b/benchmarks/README.md index e44417c..437ee63 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,68 +1,288 @@ # MLX Knife Benchmarks -**Status:** Phase 0 - Organic Data Collection +**Status:** Phase 0 - Organic Data Collection (WIP) -## Architecture +## What's Here? -This directory tracks empirical performance and compatibility data from mlx-knife's test suite. +This directory contains benchmark infrastructure for mlx-knife: +- Empirical performance and compatibility data from E2E tests +- Tools for analysis and visualization +- Schema definitions for structured reports -### Phase 0 Goals (2.0.3+) +## Directory Structure + +``` +benchmarks/ +├── reports/ # JSONL test reports + Markdown analyses +│ ├── 2025-12-20-v2.0.4b3.jsonl # Raw data (one file per test run) +│ └── BENCHMARK-v1.0-*.md # Generated analysis reports +├── schemas/ # JSON Schema definitions +│ ├── report-v0.1.0.schema.json # Legacy schema +│ ├── report-v0.2.0.schema.json # Current schema +│ └── report-current.schema.json # Symlink → current schema +├── tools/ # Standalone tools +│ ├── memmon.py # Memory monitor (background sampling) +│ └── memplot.py # Memory timeline visualizer +├── generate_benchmark_report.py # Report generator (Template v1.0) +├── validate_reports.py # Schema validation +├── README.md # ← You are here +└── TESTING.md # Benchmark handbook (How-To) +``` + +## Tools + +| Tool | Purpose | +|------|---------| +| 
`generate_benchmark_report.py` | JSONL → Markdown report (Template v1.0) | +| `validate_reports.py` | Schema validation of JSONL files | +| `tools/memmon.py` | Memory monitoring during test runs | +| `tools/memplot.py` | Interactive memory timeline visualization (HTML) | + +## Schema + +**Current:** v0.2.0 (Phase 0 - Test Infrastructure) + +| Version | Release | Content | +|---------|---------|---------| +| v0.1.0 | 2.0.3 | Minimal: test, outcome, duration, model | +| v0.2.0 | 2.0.4 | + hardware_profile, system_health, quality_flags | +| v1.0.0 | Future | Model benchmarks (mlxk-benchmark package) | + +**Schema Strategy:** No v0.3.x planned. v0.2.0 → v1.0.0 directly. +- v0.x = Test infrastructure ("Was the test run clean?") +- v1.x = Model benchmarks ("How good is the model?") + +See `schemas/LEARNINGS-FOR-v1.0.md` for details. + +## Current Baseline + +**Report:** `reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md` + +- Version: 2.0.4-beta.3 +- Hardware: Mac14,13 (M2 Max, 64 GB) +- Tests: 141/162 passed, 19.5 min +- Quality: 100% clean (0 MB swap, 0 zombies) + +## Phase 0 Goals 1. **Collect data organically** from E2E tests 2. **No perfect schema** - schema evolves with data 3. **Git-tracked reports** - historical trends -4. **Foundation for future** - community contributions, public database +4. 
**Foundation for Phase 1** - mlxk-benchmark package -### Directory Structure +## Memory Timeline Visualization -- `reports/` - JSONL test reports (one file per release) -- `schemas/` - JSON Schema definitions (versioned) +**Tool:** `tools/memplot.py` | **Created:** Session 45 (2025-12-21) -### Current Schema - -**Version:** 0.2.0 (Phase 0 - Scheduling-Enhanced) - -- **v0.1.0** (2.0.3+): Minimal schema - basic performance metrics -- **v0.2.0** (2.0.4+): Hardware profiling + detailed metrics for cluster scheduling - - `system.hardware_profile`: Mac model, cores, Metal version - - `performance.*_time_s`: model_load, time_to_first_token, cleanup - - `system_health`: swap, zombies, quality_flags - - Backward compatible: v0.1.0 reports still valid - -**Schema Files:** -- `schemas/report-current.schema.json` → always points to latest version -- `schemas/report-v0.2.schema.json` → current schema (2.0.4+) -- `schemas/report-v0.1.schema.json` → legacy schema (2.0.3) - -**Required fields:** -- `schema_version`, `timestamp`, `mlx_knife_version`, `test`, `outcome` - -**Optional sections:** -- `model` - Model metadata -- `performance` - tokens/sec, RAM usage -- `stop_tokens` - ADR-009 validation data -- `system` - Platform info -- `metadata` - Extensible (anything) - -### Generating Reports +### Quick Start ```bash -# During E2E tests -pytest -m live_e2e tests_2.0/live/ \ - --report-output benchmarks/reports/$(date +%Y-%m-%d)-v$(mlxk --version | cut -d' ' -f2).jsonl +# Collect data (memmon runs in background) +python benchmarks/tools/memmon.py --output memory.jsonl -- \ + pytest -m live_e2e tests_2.0/live/ --report-output benchmark.jsonl + +# Generate interactive HTML +python benchmarks/tools/memplot.py memory.jsonl benchmark.jsonl -o timeline.html ``` -### Schema Evolution +### Visual Legend -As we collect more data, the schema will evolve: -- New fields added (backward compatible) -- Optional → Required (when stable) -- Breaking changes documented in `schemas/MIGRATIONS.md` 
+#### Main Graph: RAM Free (GB) -### Future Phases +**Blue line with colored markers:** +- 🟢 **Green markers:** Healthy (≥32 GB free, ≥50% of 64 GB) +- 🟠 **Orange markers:** Warning (16-32 GB free, 25-50%) +- 🔴 **Red markers:** Critical (<16 GB free, <25%) -- **Phase 1 (2.1+):** Schema formalization, validation tooling -- **Phase 2 (2.2+):** `mlxk report` CLI for manual submissions -- **Phase 3 (2.3+):** Public database, community contributions +**Dashed threshold lines:** +- **Green line (32 GB):** 50% threshold - system healthy +- **Orange line (16 GB):** 25% threshold - warning level -See `docs/ADR/ADR-013-Community-Model-Quality-Database.md` for full roadmap. +#### Background Rectangles: Test Regions + +**Gray (rgba(200, 200, 200, 0.3)):** +- Model tests that load an LLM model +- Example: `test_run_command[text_00]`, `test_chat_completion[vision_01]` +- **Meaning:** Model is loaded in RAM during this time + +**Light Blue (rgba(173, 216, 230, 0.2)):** +- Infrastructure tests without model +- Example: `test_portfolio_discovery`, `test_health_check` +- **Meaning:** No model loaded, only test infrastructure active + +⚠️ **Known limitation (v0.2.0):** Server tests appear as "light blue" even when loading models (LocalServer fixture doesn't record model metadata). Recognizable by: high RAM usage + long duration in blue region. Example: `test_text_request_still_works_on_vision_model` (57 GB used, 16s duration). 
+ +#### Memory Pressure Overlay + +**Yellow (rgba(255, 204, 0, 0.15)):** +- macOS Memory Pressure: WARN +- Source: `sysctl kern.memorystatus_vm_pressure_level = 2` + +**Red (rgba(255, 59, 48, 0.15)):** +- macOS Memory Pressure: CRITICAL +- Source: `sysctl kern.memorystatus_vm_pressure_level = 4` +- **Meaning:** System begins swapping, performance degradation + +**White/Transparent:** +- macOS Memory Pressure: NORMAL (level = 1) + +#### Labels + +**Top (90° rotated, black):** +- Model names at each model switch +- Example: `DeepHermes-3-Mistral`, `pixtral-12b-8bit` +- Position: Left-aligned with test start + +**Bottom (90° rotated, gray):** +- Test names for each test (model + infrastructure) +- Example: `test_run_command`, `test_chat_completion` +- Position: Left-aligned with test start + +**Vertical helper lines:** +- Thin gray lines at each test start +- Help correlate labels with timeline + +#### Secondary Y-Axis: Swap Used (MB) + +**Red line (right axis):** +- Only visible when swap > 0 MB +- **Meaning:** System paging RAM to SSD → performance loss +- **Normal:** 0 MB +- **Problematic:** >100 MB + +### Interpretation Patterns + +**Typical model load:** +``` +Pattern: RAM Free drops suddenly (e.g., 52 GB → 28 GB) +Duration: 2-5 seconds +Color: Gray rectangle begins +Label: Model name appears at top +→ Model loaded into RAM (24 GB) +``` + +**Typical model unload:** +``` +Pattern: RAM Free rises suddenly (e.g., 28 GB → 52 GB) +Duration: <1 second +Color: Gray rectangle ends (or switches to next) +Label: New model name (or none) +→ Model removed from RAM +``` + +**Memory pressure without swap:** +``` +Pattern: Yellow/Red background WITHOUT swap line +RAM Free: Still >10 GB +→ macOS preparing to swap, not yet active +→ Often during large model loads (temporary) +``` + +**Memory pressure with swap:** +``` +Pattern: Red background + Red swap line rises +RAM Free: <10 GB +Swap: >100 MB +→ System actually at limit +→ Performance significantly worse +→ Typical: Multiple 
large models in short time +``` + +**Infrastructure test with high RAM usage:** +``` +Pattern: Light blue rectangle + RAM drops significantly (>20 GB) +Duration: >10 seconds +Example: 57 GB used in test_text_request_still_works_on_vision_model +→ ⚠️ Schema bug: Server test loads model but "model": null +→ Should be gray, not light blue +→ Fix: v1.0 schema with log parsing +``` + +### Data Sources + +**RAM Free:** +- Source: `vm_stat` (macOS native) +- Calculation: `(free + inactive + purgeable + speculative) * page_size / 1e9` +- Sample rate: 500ms (2 samples/second) + +**Memory Pressure:** +- Source: `sysctl kern.memorystatus_vm_pressure_level` +- Values: 1=NORMAL, 2=WARN, 4=CRITICAL +- Sample rate: 500ms (synchronized with RAM) + +**Swap Used:** +- Source: `sysctl vm.swapusage` +- Unit: MB +- Sample rate: 500ms + +**Test Metadata:** +- Source: Benchmark JSONL (pytest-json-report format) +- Fields: `timestamp`, `duration`, `test`, `model` (optional), `outcome` +- Correlation: ISO timestamp → Unix timestamp → elapsed seconds + +### Known Limitations (v0.2.0) + +1. **Model load/unload events missing** + - Gray regions show "test with model", not "model is loaded" + - Pytest runs through ALL models 4x → each model loaded/unloaded 4x + - Regions may appear to overlap visually even though the tests run sequentially + - **Fix planned:** v1.0 schema with explicit events + +2. **Server tests without model attribution** + - Server tests (LocalServer fixture) load models internally + - Appear as "infrastructure" (light blue) instead of "model" (gray) + - Recognizable: High RAM + long duration in blue region + - **Fix planned:** Log parsing in v1.0 schema (no v0.3.x is planned) + +3. **Dense test sequences** + - Tests shorter than 500ms sample rate → no coloring + - Typical: Fast infrastructure tests (<100ms) + - **Workaround:** Test labels show all tests + +4. 
**Label overlap** + - Many tests in short time (>10 tests/min) + - Labels may overlap (90° rotated) + - **Mitigation:** Zoom for detailed view + - **Future:** Adaptive label density or collapsing + +### Interactive Features + +- **Zoom & Pan:** Mouse wheel (vertical), Shift+wheel (horizontal), click+drag +- **Range Slider:** Quick navigation in long (>20 min) timelines +- **Hover:** X-axis unified mode shows all values at same time + +### Future Extensions (Ideas) + +**For plot:** +- [ ] Embedded legend in plot (not external file) +- [ ] Toggle show/hide infrastructure tests +- [ ] Hover shows full test names (not truncated) +- [ ] Color-blind mode (alternative palette) + +**For schema v1.0:** +- [ ] Model load/unload events → precise "in RAM" regions +- [ ] Log parsing for server tests → correct attribution +- [ ] GPU activity (Metal performance) +- [ ] Net T/S (tokens/second, pure inference) + +**For analysis:** +- [ ] Automatic anomaly detection (memory leaks, zombies) +- [ ] Per-model memory profiling (min/max/avg RAM) +- [ ] Scheduling optimization (avoid model-switch overlap) + +--- + +## Roadmap + +| Phase | Release | Description | +|-------|---------|-------------| +| **Phase 0** | 2.0.3-2.0.4 | Organic Data Collection ✅ | +| Phase 1 | 2.1+ | `mlxk-benchmark` package (separate tool) | +| Phase 2 | 2.2+ | Report aggregation, hardware correlation | +| Phase 3 | 2.3+ | Public database, community contributions | + +## Further Documentation + +- **[TESTING.md](TESTING.md)** - Benchmark handbook (How-To) +- **[schemas/LEARNINGS-FOR-v1.0.md](schemas/LEARNINGS-FOR-v1.0.md)** - Learnings for Phase 1 +- **[docs/ADR/ADR-013-Community-Model-Quality-Database.md](../docs/ADR/ADR-013-Community-Model-Quality-Database.md)** - Architecture vision diff --git a/benchmarks/TESTING.md b/benchmarks/TESTING.md index 59e44f2..6d8d063 100644 --- a/benchmarks/TESTING.md +++ b/benchmarks/TESTING.md @@ -1,155 +1,263 @@ -# Testing with Benchmark Reports (ADR-013 Phase 0) +# Benchmark 
Handbook -This document explains how to generate benchmark reports during E2E tests. +Step-by-step guide for running benchmarks and generating reports. + +## Quick Start + +```bash +# 1. Run E2E tests with report output +pytest -m live_e2e tests_2.0/live/ \ + --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.4b3.jsonl + +# 2. Generate analysis report +python benchmarks/generate_benchmark_report.py + +# 3. View results +cat benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-*.md +``` + +--- + +## Running Benchmarks + +### Basic Test Run + +```bash +# Run all E2E tests, output to JSONL +pytest -m live_e2e tests_2.0/live/ \ + --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.4b3.jsonl +``` + +### With Custom HuggingFace Cache + +```bash +HF_HOME=/path/to/huggingface/cache \ + pytest -m live_e2e tests_2.0/live/ -v \ + --report-output benchmarks/reports/2025-12-20-v2.0.4b3.jsonl +``` + +### With Memory Monitoring + +```bash +# Run memmon in parallel to capture memory profile +python benchmarks/tools/memmon.py \ + --output benchmarks/reports/2025-12-20-memory.jsonl \ + -- pytest -m live_e2e tests_2.0/live/ \ + --report-output benchmarks/reports/2025-12-20-v2.0.4b3.jsonl +``` + +--- ## Generating Reports -### Basic Usage +### Auto-Detect Latest JSONL ```bash -# Run E2E tests with reporting -pytest -m live_e2e tests_2.0/live/ \ - --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.3.jsonl +python benchmarks/generate_benchmark_report.py +# → Finds most recent .jsonl in benchmarks/reports/ +# → Outputs: BENCHMARK-v1.0--.md ``` -### With Full Environment +### Explicit Input File ```bash -# Use specific HF cache + generate reports -HF_HOME=/Volumes/mz-SSD/huggingface/cache \ - pytest -m live_e2e tests_2.0/live/ -v \ - --report-output benchmarks/reports/2025-11-16-v2.0.3.jsonl +python benchmarks/generate_benchmark_report.py \ + benchmarks/reports/2025-12-20-v2.0.4b3.jsonl ``` -## Adding Report Data to Tests +### With Comparison (Regression Detection) -Tests can add 
structured data to reports using `request.node.user_properties`: +```bash +python benchmarks/generate_benchmark_report.py \ + benchmarks/reports/2025-12-20-new.jsonl \ + --compare benchmarks/reports/2025-12-19-old.jsonl +``` + +Output includes: +- Duration change (e.g., 20.5 min → 19.7 min, -3.8%) +- Per-model changes with Old/Δ/Change columns +- Per-test median time changes +- Status indicators: ⚠️ (>5% slower), ✅ (>1% faster) + +### Custom Output Location + +```bash +python benchmarks/generate_benchmark_report.py \ + --output /tmp/my-report.md \ + benchmarks/reports/2025-12-20-v2.0.4b3.jsonl +``` + +--- + +## Memory Monitoring + +### Standalone Monitor (Fixed Duration) + +```bash +python benchmarks/tools/memmon.py \ + --duration 60 \ + --interval 200 \ + --output memory.jsonl +``` + +### Wrap Any Command + +```bash +python benchmarks/tools/memmon.py \ + --output memory.jsonl \ + -- ./my-benchmark-script.sh +``` + +### Output Format + +```jsonl +{"ts": 1734567890.1, "ram_free_gb": 45.2, "swap_used_mb": 0, "elapsed_s": 0.2} +{"ts": 1734567890.3, "ram_free_gb": 42.1, "swap_used_mb": 0, "elapsed_s": 0.4} +... +{"summary": {"ram_free_min_gb": 21.3, "ram_free_max_gb": 45.2, "swap_max_mb": 0}} +``` + +### Correlating with Test Results + +Memory samples can be correlated with test results via timestamps: ```python -def test_example(model_info, request): - # ... test logic ... 
+# Test entry has: timestamp (end time), duration +# Calculate: started_at = timestamp - duration - # Add model info - request.node.user_properties.append(("model", { - "id": model_info["id"], - "size_gb": model_info["ram_needed_gb"], - "family": extract_family(model_info["id"]), - "variant": extract_variant(model_info["id"]) - })) +test_start = parse_iso(entry["timestamp"]) - entry["duration"] +test_end = parse_iso(entry["timestamp"]) - # Add performance metrics - request.node.user_properties.append(("performance", { - "tokens_per_sec": measure_tokens_per_sec(response), - "ram_peak_mb": get_peak_ram_usage(), - "duration_s": response.elapsed - })) - - # Add stop token data (ADR-009) - request.node.user_properties.append(("stop_tokens", { - "configured": model_stop_tokens, - "detected": find_stop_tokens_in_response(response), - "workaround": get_workaround_name(model_info["id"]), - "leaked": check_for_leaked_tokens(response) - })) - - # Add system info (optional) - request.node.user_properties.append(("system", { - "platform": platform.system().lower(), - "platform_version": get_os_version(), - "python_version": platform.python_version(), - "mlx_version": get_mlx_version(), - "hardware": get_hardware_model(), - "ram_total_gb": get_total_ram_gb() - })) - - # Anything else goes to metadata - request.node.user_properties.append(("custom_metric", "value")) +# Find matching memory samples +matching = [s for s in samples if test_start <= s["ts"] <= test_end] ``` -## Structured Sections +--- -Reports have predefined structured sections that map to schema fields: +## Validating Reports -| user_properties key | Maps to report field | Description | -|---------------------|----------------------|-------------| -| `model` | `model` object | Model metadata (id, size, family, variant) | -| `performance` | `performance` object | Performance metrics (tokens/sec, RAM, duration) | -| `stop_tokens` | `stop_tokens` object | Stop token behavior (ADR-009 validation) | -| `system` | 
`system` object | Platform information (OS, Python, MLX, hardware) | -| _anything else_ | `metadata` object | Extensible catch-all for experiments | - -## Schema Validation +### Validate Against Current Schema ```bash -# Validate reports against schema (requires jsonschema) -pip install jsonschema - -# Validate all reports -for report in benchmarks/reports/*.jsonl; do - echo "Validating $report..." - cat "$report" | while read line; do - echo "$line" | python3 -c " -import sys, json -from jsonschema import validate - -with open('benchmarks/schemas/report-v0.1.schema.json') as f: - schema = json.load(f) - -report = json.load(sys.stdin) -validate(instance=report, schema=schema) -print('✓ Valid') -" - done -done +python benchmarks/validate_reports.py benchmarks/reports/*.jsonl ``` -## Example Report +### Validate Specific File +```bash +python benchmarks/validate_reports.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl +``` + +--- + +## Schema Reference + +### Current Schema: v0.2.0 + +Required fields: ```json { - "schema_version": "0.1.0", - "timestamp": "2025-11-16T10:30:00Z", - "mlx_knife_version": "2.0.3", - "test": "tests_2.0/live/test_stop_tokens_live.py::test_stop_tokens[phi-3-mini]", + "schema_version": "0.2.0", + "timestamp": "2025-12-20T02:26:10.722510+00:00", + "mlx_knife_version": "2.0.4-beta.3", + "test": "tests_2.0/live/test_cli_e2e.py::test_run_command[discovered_00]", "outcome": "passed", - "duration": 12.3, + "duration": 12.3 +} +``` + +Optional sections: +```json +{ "model": { - "id": "mlx-community/phi-3-mini-4k-instruct", - "size_gb": 2.8, - "family": "phi-3", - "variant": "mini-4k-instruct" + "id": "mlx-community/Qwen3-32B-4bit", + "size_gb": 17.2, + "family": "qwen3" }, - "performance": { - "tokens_per_sec": 45.2, - "ram_peak_mb": 3200, - "prompt_tokens": 15, - "completion_tokens": 42 + "system": { + "hardware_profile": { + "model": "Mac14,13", + "cores_physical": 12 + } }, - "stop_tokens": { - "configured": ["<|end|>", "<|endoftext|>"], - 
"detected": ["<|end|>"], - "workaround": "phi-3-dual-eos", - "leaked": false + "system_health": { + "swap_used_mb": 0, + "ram_free_gb": 45.2, + "zombie_processes": 0, + "quality_flags": ["clean"] } } ``` -## Analyzing Reports +### Quality Flags -See `reports/README.md` for analysis examples (jq queries, statistics, trends). +| Flag | Meaning | Threshold | +|------|---------|-----------| +| `clean` | Test ran without issues | swap=0, zombies=0 | +| `degraded_swap` | Memory pressure detected | swap > 100 MB | +| `degraded_zombies` | Zombie processes present | zombies > 0 | + +--- ## Best Practices 1. **File Naming:** Use `YYYY-MM-DD-vX.Y.Z.jsonl` format 2. **Append Only:** Never edit existing reports (historical data) 3. **Commit Reports:** Reports are git-tracked for trend analysis -4. **Schema Version:** Always include `schema_version` for evolution tracking -5. **Optional Data:** Only add what you can measure reliably -6. **No PII:** Never include personal information in reports +4. **Clean State:** Reboot before important benchmark runs +5. **Close Apps:** Minimize background processes during tests +6. 
**Multiple Runs:** Run 2-3 times, compare for consistency -## Future Enhancements (Phase 1+) +--- -- Automatic validation during `pytest --report-output` -- Performance regression detection -- Report comparison tools (`mlxk report diff`) -- Schema migration utilities +## Troubleshooting + +### "No JSONL files found" + +```bash +# Check if reports exist +ls -la benchmarks/reports/*.jsonl + +# Run tests with output +pytest -m live_e2e tests_2.0/live/ --report-output benchmarks/reports/test.jsonl +``` + +### Schema Validation Fails + +```bash +# Check schema version in file +head -1 benchmarks/reports/file.jsonl | jq .schema_version + +# Validate manually +python -c " +import json +from jsonschema import validate +with open('benchmarks/schemas/report-current.schema.json') as f: + schema = json.load(f) +with open('benchmarks/reports/file.jsonl') as f: + for line in f: + validate(json.loads(line), schema) +print('OK') +" +``` + +### Comparison Shows "N/A" + +Model not found in comparison file. Check: +- Same models tested in both runs? +- Model ID spelling matches exactly? + +--- + +## Future: Phase 1 (mlxk-benchmark) + +Phase 1 will introduce a standalone benchmark package: + +```bash +pip install mlxk-benchmark +mlx-benchmark --model llama-3.2-3b --contribute +``` + +No pytest, no fixtures, no conftest.py - just simple CLI for community contributions. + +See `schemas/LEARNINGS-FOR-v1.0.md` for design notes. diff --git a/benchmarks/generate_benchmark_report.py b/benchmarks/generate_benchmark_report.py new file mode 100644 index 0000000..e49d79f --- /dev/null +++ b/benchmarks/generate_benchmark_report.py @@ -0,0 +1,575 @@ +#!/usr/bin/env python3 +"""Generate benchmark analysis report from JSONL test data. + +Reads JSONL benchmark reports and generates structured Markdown analysis. 
+ +Usage: + # Auto-detect latest JSONL + python benchmarks/generate_benchmark_report.py + + # Explicit file + python benchmarks/generate_benchmark_report.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl + + # With comparison + python benchmarks/generate_benchmark_report.py new.jsonl --compare old.jsonl +""" + +import argparse +import json +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +try: + import jsonschema +except ImportError: + print("Error: jsonschema not installed. Install with: pip install jsonschema") + sys.exit(1) + + +# Template version +TEMPLATE_VERSION = "1.0" +REPORTS_DIR = Path("benchmarks/reports") +SCHEMA_PATH = Path("benchmarks/schemas/report-current.schema.json") + + +def load_schema() -> dict: + """Load current JSON schema.""" + if not SCHEMA_PATH.exists(): + print(f"❌ Schema not found: {SCHEMA_PATH}") + sys.exit(1) + + with open(SCHEMA_PATH) as f: + return json.load(f) + + +def validate_jsonl(data: List[dict], schema: dict, filepath: Path) -> bool: + """Validate JSONL data against schema.""" + errors = [] + for i, entry in enumerate(data, 1): + try: + jsonschema.validate(instance=entry, schema=schema) + except jsonschema.ValidationError as e: + errors.append(f"Line {i}: {e.message}") + + if errors: + print(f"❌ Validation failed for {filepath}") + for error in errors[:5]: # Show first 5 errors + print(f" {error}") + if len(errors) > 5: + print(f" ... 
and {len(errors) - 5} more errors") + return False + + return True + + +def load_jsonl(filepath: Path) -> List[dict]: + """Load JSONL file.""" + data = [] + with open(filepath) as f: + for line in f: + line = line.strip() + if line: + data.append(json.loads(line)) + return data + + +def find_latest_jsonl() -> Optional[Path]: + """Find the most recent JSONL file in reports directory.""" + if not REPORTS_DIR.exists(): + return None + + jsonl_files = sorted(REPORTS_DIR.glob("*.jsonl"), key=lambda p: p.stat().st_mtime, reverse=True) + return jsonl_files[0] if jsonl_files else None + + +def extract_version_from_filename(filepath: Path) -> Optional[str]: + """Extract version string from filename like '2025-12-20-v2.0.4b3.jsonl'.""" + parts = filepath.stem.split("-v") + return parts[1].split("-")[0] if len(parts) > 1 else None + + +def calculate_statistics(data: List[dict]) -> Dict: + """Calculate all benchmark statistics from JSONL data.""" + # Separate by outcome + passed_tests = [e for e in data if e.get("outcome") == "passed"] + skipped_tests = [e for e in data if e.get("outcome") == "skipped"] + passed_with_model = [e for e in passed_tests if "model" in e] + passed_without_model = [e for e in passed_tests if "model" not in e] + + # System health metrics (optional for backward compatibility with older schemas) + swap_values = [] + ram_values = [] + zombie_values = [] + quality_flags = [] + + for e in data: + if "system_health" in e: + swap_values.append(e["system_health"].get("swap_used_mb", 0)) + ram_values.append(e["system_health"].get("ram_free_gb", 0)) + zombie_values.append(e["system_health"].get("zombie_processes", 0)) + quality_flags.append(e["system_health"].get("quality_flags", ["unknown"])) + + clean_count = sum(1 for flags in quality_flags if flags == ["clean"]) + degraded_swap = sum(1 for flags in quality_flags if "degraded_swap" in flags) + degraded_zombies = sum(1 for flags in quality_flags if "degraded_zombies" in flags) + + # Per-model statistics + 
model_stats = {} + for entry in passed_with_model: + model_id = entry["model"]["id"] + if model_id not in model_stats: + model_stats[model_id] = { + "id": model_id, + "size_gb": entry["model"]["size_gb"], + "count": 0, + "total_time": 0, + "ram_min": float("inf"), + "ram_max": 0, + "swap_max": 0, + "zombies_max": 0, + } + + stats = model_stats[model_id] + stats["count"] += 1 + stats["total_time"] += entry["duration"] + # Handle optional system_health (backward compatibility) + if "system_health" in entry: + stats["ram_min"] = min(stats["ram_min"], entry["system_health"].get("ram_free_gb", 0)) + stats["ram_max"] = max(stats["ram_max"], entry["system_health"].get("ram_free_gb", 0)) + stats["swap_max"] = max(stats["swap_max"], entry["system_health"].get("swap_used_mb", 0)) + stats["zombies_max"] = max(stats["zombies_max"], entry["system_health"].get("zombie_processes", 0)) + + # Per-test statistics + import statistics + test_stats = {} + for entry in passed_with_model: + # Extract test function name and normalize (remove parametrization) + test_full = entry["test"].split("::")[-1] + test_name = test_full.split("[")[0] # Remove [discovered_XX] part + + model_id = entry["model"]["id"] + model_short = model_id.replace("mlx-community/", "").split("-")[0] # Short name + duration = entry["duration"] + + if test_name not in test_stats: + test_stats[test_name] = { + "name": test_name, + "models": set(), + "runs": [], + } + + test_stats[test_name]["models"].add(model_id) + test_stats[test_name]["runs"].append({ + "model": model_id, + "model_short": model_short, + "duration": duration + }) + + # Calculate aggregates per test + for test_name, stats in test_stats.items(): + durations = [r["duration"] for r in stats["runs"]] + stats["model_count"] = len(stats["models"]) + stats["median_time"] = statistics.median(durations) if durations else 0 + + # Find fastest and slowest + sorted_runs = sorted(stats["runs"], key=lambda r: r["duration"]) + stats["fastest"] = sorted_runs[0] if 
sorted_runs else None + stats["slowest"] = sorted_runs[-1] if sorted_runs else None + + # Convert set to list for JSON serialization + stats["models"] = list(stats["models"]) + + # Hardware profile (from first entry, optional for backward compatibility) + hw_profile = {} + if data and "system" in data[0] and "hardware_profile" in data[0]["system"]: + hw_profile = data[0]["system"]["hardware_profile"] + + return { + "total_tests": len(data), + "passed": len(passed_tests), + "passed_with_model": len(passed_with_model), + "passed_infrastructure": len(passed_without_model), + "skipped": len(skipped_tests), + "total_duration": sum(e["duration"] for e in passed_tests), + "schema_version": data[0]["schema_version"] if data else "unknown", + "mlx_knife_version": data[0]["mlx_knife_version"] if data else "unknown", + "swap": { + "min": min(swap_values) if swap_values else 0, + "max": max(swap_values) if swap_values else 0, + "avg": sum(swap_values) / len(swap_values) if swap_values else 0, + }, + "ram": { + "min": min(ram_values) if ram_values else 0, + "max": max(ram_values) if ram_values else 0, + "avg": sum(ram_values) / len(ram_values) if ram_values else 0, + }, + "zombies": { + "min": min(zombie_values) if zombie_values else 0, + "max": max(zombie_values) if zombie_values else 0, + }, + "quality": { + "clean": clean_count, + "degraded_swap": degraded_swap, + "degraded_zombies": degraded_zombies, + "clean_percent": 100 * clean_count / len(data) if data else 0, + }, + "hardware": hw_profile, + "models": model_stats, + "tests": test_stats, + } + + +def generate_markdown(stats: Dict, input_file: Path, compare_file: Optional[Path] = None, compare_stats: Optional[Dict] = None) -> str: + """Generate Markdown report from statistics.""" + version = stats["mlx_knife_version"] + date = input_file.stem.split("-v")[0] # Extract date from filename + now = datetime.now(timezone.utc).isoformat() + + # Header + md = f"""# Benchmark Report v{TEMPLATE_VERSION}: {version} + +**Date:** 
{date} +**Generated:** {now} +**Generator:** generate_benchmark_report.py v{TEMPLATE_VERSION} +**Hardware:** {stats['hardware'].get('model', 'unknown')}, {stats['hardware'].get('cores_physical', '?')} cores + +--- + +## Input Files + +- **Primary:** `{input_file}` +- **Schema:** v{stats['schema_version']} +""" + + if compare_file: + md += f"- **Comparison:** `{compare_file}`\n" + + md += "\n---\n\n" + + # Executive Summary + md += "## Executive Summary\n\n" + md += f"**Tests:** {stats['total_tests']} total ({stats['passed']} passed, {stats['skipped']} skipped)\n" + md += f"**Duration:** {stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f} min)\n" + md += f"**Quality:** {stats['quality']['clean_percent']:.1f}% clean ({stats['quality']['clean']}/{stats['total_tests']})\n" + md += f"**Models:** {len(stats['models'])} tested\n\n" + + # Comparison Summary + if compare_stats: + old_duration = compare_stats['total_duration'] + new_duration = stats['total_duration'] + duration_delta = new_duration - old_duration + duration_pct = (duration_delta / old_duration * 100) if old_duration > 0 else 0 + + # Count models by change direction + compare_models_dict = {m['id']: m for m in compare_stats['models'].values()} + slower_count = 0 + faster_count = 0 + for model in stats['models'].values(): + old_model = compare_models_dict.get(model['id']) + if old_model: + if model['total_time'] > old_model['total_time']: + slower_count += 1 + elif model['total_time'] < old_model['total_time']: + faster_count += 1 + + total_compared = slower_count + faster_count + change_icon = "⚠️" if duration_pct > 3 else "✅" if duration_pct < -1 else "➡️" + + md += f"### Comparison\n\n" + md += f"**vs:** `{compare_file.name}`\n" + md += f"**Duration:** {old_duration/60:.1f} min → {new_duration/60:.1f} min ({duration_pct:+.1f}%) {change_icon}\n" + if total_compared > 0: + md += f"**Models:** {slower_count}/{total_compared} slower ({100*slower_count/total_compared:.0f}%), 
{faster_count}/{total_compared} faster ({100*faster_count/total_compared:.0f}%)\n" + md += "\n" + + # Validation Status + quality_icon = "✅" if stats['quality']['clean_percent'] == 100 else "⚠️" + md += f"{quality_icon} **System Health:** " + if stats['quality']['clean_percent'] == 100: + md += "All tests clean (0 MB swap, 0 zombies)\n" + else: + md += f"{stats['quality']['degraded_swap']} degraded (swap), {stats['quality']['degraded_zombies']} degraded (zombies)\n" + + md += "\n---\n\n" + + # Test Summary + md += "## Test Summary\n\n" + md += f"""``` +Total tests: {stats['total_tests']} +Passed: {stats['passed']} + With model: {stats['passed_with_model']} + Infrastructure: {stats['passed_infrastructure']} +Skipped: {stats['skipped']} +Duration: {stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f} min) +``` + +--- + +## System Health + +""" + md += f"""``` +Swap (MB): min={stats['swap']['min']}, max={stats['swap']['max']}, avg={stats['swap']['avg']:.1f} +RAM free (GB): min={stats['ram']['min']:.1f}, max={stats['ram']['max']:.1f}, avg={stats['ram']['avg']:.1f} +Zombies: min={stats['zombies']['min']}, max={stats['zombies']['max']} + +Quality Flags: + Clean: {stats['quality']['clean']}/{stats['total_tests']} ({stats['quality']['clean_percent']:.1f}%) + Degraded (swap): {stats['quality']['degraded_swap']} + Degraded (zombies): {stats['quality']['degraded_zombies']} +``` + +--- + +## Per-Model Statistics + +""" + + # Sort models by total time (descending), or by change if comparing + sorted_models = sorted(stats['models'].values(), key=lambda m: m['total_time'], reverse=True) + + # Build comparison lookup if available + compare_models = {} + if compare_stats: + compare_models = {m['id']: m for m in compare_stats['models'].values()} + # Re-sort by change percentage (biggest regression first) + def get_change_pct(model): + old = compare_models.get(model['id']) + if old and old['total_time'] > 0: + return (model['total_time'] - old['total_time']) / 
old['total_time'] * 100 + return 0 + sorted_models = sorted(stats['models'].values(), key=get_change_pct, reverse=True) + + if compare_stats: + md += f"""``` +{'Model':<42} {'Size':<7} {'Tests':<5} {'Time':<8} {'Old':<8} {'Δ':<8} {'Change':<10} {'RAM (GB)':<12} +{'='*42} {'='*7} {'='*5} {'='*8} {'='*8} {'='*8} {'='*10} {'='*12} +""" + else: + md += f"""``` +{'Model':<50} {'Size':<8} {'Tests':<6} {'Time':<10} {'RAM (GB)':<20} +{'='*50} {'='*8} {'='*6} {'='*10} {'='*20} +""" + + for model in sorted_models: + # Shorten model ID (remove mlx-community/ prefix) + model_short = model['id'].replace('mlx-community/', '') + max_len = 40 if compare_stats else 48 + if len(model_short) > max_len: + model_short = model_short[:max_len-3] + "..." + + ram_range = f"{model['ram_min']:.1f}-{model['ram_max']:.1f}" + + if compare_stats: + old_model = compare_models.get(model['id']) + if old_model: + old_time = old_model['total_time'] + delta = model['total_time'] - old_time + change_pct = (delta / old_time * 100) if old_time > 0 else 0 + # Status indicator + if change_pct > 5: + status = "⚠️" + elif change_pct < -1: + status = "✅" + else: + status = "" + change_str = f"{change_pct:+.1f}% {status}" + md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {ram_range:<12}\n" + else: + md += f"{model_short:<42} {model['size_gb']:>5.1f}GB {model['count']:<5} {model['total_time']:>6.1f}s {'N/A':<8} {'N/A':<8} {'NEW':<10} {ram_range:<12}\n" + else: + md += f"{model_short:<50} {model['size_gb']:>6.1f}GB {model['count']:<6} {model['total_time']:>8.1f}s {ram_range:<20}\n" + + md += "```\n\n" + + # Model Categories + large_models = [m for m in sorted_models if m['size_gb'] >= 20] + medium_models = [m for m in sorted_models if 10 <= m['size_gb'] < 20] + small_models = [m for m in sorted_models if m['size_gb'] < 10] + + md += "### Model Categories\n\n" + md += f"""``` +LARGE MODELS (≥20 GB): 
{len(large_models)} models + Avg size: {sum(m['size_gb'] for m in large_models) / len(large_models):.1f} GB + Avg test time: {sum(m['total_time']/m['count'] for m in large_models) / len(large_models):.1f}s + Avg min RAM: {sum(m['ram_min'] for m in large_models) / len(large_models):.1f} GB + +MEDIUM MODELS (10-20 GB): {len(medium_models)} models + Avg size: {sum(m['size_gb'] for m in medium_models) / len(medium_models):.1f} GB + Avg test time: {sum(m['total_time']/m['count'] for m in medium_models) / len(medium_models):.1f}s + Avg min RAM: {sum(m['ram_min'] for m in medium_models) / len(medium_models):.1f} GB + +SMALL MODELS (<10 GB): {len(small_models)} models + Avg size: {sum(m['size_gb'] for m in small_models) / len(small_models):.1f} GB + Avg test time: {sum(m['total_time']/m['count'] for m in small_models) / len(small_models):.1f}s + Avg min RAM: {sum(m['ram_min'] for m in small_models) / len(small_models):.1f} GB +``` +""" if large_models and medium_models and small_models else "" + + md += "\n---\n\n" + + # Per-Test Statistics + md += "## Per-Test Statistics\n\n" + md += "Shows performance range across models for each test.\n\n" + + # Sort tests by model count (descending) - most representative tests first + sorted_tests = sorted(stats['tests'].values(), key=lambda t: t['model_count'], reverse=True) + + # Build comparison lookup for tests + compare_tests = {} + if compare_stats: + compare_tests = {t['name']: t for t in compare_stats['tests'].values()} + + if compare_stats: + md += f"""``` +{'Test Name':<40} {'Models':<7} {'Fastest':<20} {'Slowest':<20} {'Med':<6} {'Old':<6} {'Δ Med':<8} +{'='*40} {'='*7} {'='*20} {'='*20} {'='*6} {'='*6} {'='*8} +""" + else: + md += f"""``` +{'Test Name':<50} {'Models':<7} {'Fastest':<25} {'Slowest':<25} {'Med Time'} +{'='*50} {'='*7} {'='*25} {'='*25} {'='*8} +""" + + for test in sorted_tests: + # Shorten test name if needed + max_test_len = 38 if compare_stats else 48 + test_short = test['name'] + if len(test_short) > 
max_test_len: + test_short = test_short[:max_test_len-3] + "..." + + # Format fastest/slowest + fastest = test['fastest'] + slowest = test['slowest'] + + if fastest and slowest: + max_model_len = 18 if compare_stats else 23 + fastest_str = f"{fastest['model_short']} ({fastest['duration']:.1f}s)" + slowest_str = f"{slowest['model_short']} ({slowest['duration']:.1f}s)" + if len(fastest_str) > max_model_len: + fastest_str = fastest_str[:max_model_len-3] + "..." + if len(slowest_str) > max_model_len: + slowest_str = slowest_str[:max_model_len-3] + "..." + + med_time = test['median_time'] + + if compare_stats: + old_test = compare_tests.get(test['name']) + if old_test: + old_med = old_test['median_time'] + delta_pct = ((med_time - old_med) / old_med * 100) if old_med > 0 else 0 + delta_str = f"{delta_pct:+.1f}%" + md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} {slowest_str:<20} {med_time:<5.1f}s {old_med:<5.1f}s {delta_str:<8}\n" + else: + md += f"{test_short:<40} {test['model_count']:<7} {fastest_str:<20} {slowest_str:<20} {med_time:<5.1f}s {'N/A':<6} {'NEW':<8}\n" + else: + md += f"{test_short:<50} {test['model_count']:<7} {fastest_str:<25} {slowest_str:<25} {med_time:.1f}s\n" + + md += "```\n\n" + + md += "\n---\n\n" + md += "## Files\n\n" + md += f"- **Benchmark report:** `{input_file}`\n" + md += f"- **Schema:** `benchmarks/schemas/report-v{stats['schema_version']}.schema.json`\n" + + return md + + +def main(): + parser = argparse.ArgumentParser( + description="Generate benchmark analysis report from JSONL data", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument( + 'input', + nargs='?', + type=Path, + help='JSONL benchmark file (default: latest in benchmarks/reports/)' + ) + parser.add_argument( + '--compare', + type=Path, + help='Compare with this JSONL file (adds Old/Δ/Change columns)' + ) + parser.add_argument( + '--output', + type=Path, + help='Output markdown file (default: auto-generated in 
benchmarks/reports/)' + ) + + args = parser.parse_args() + + # Determine input file + if args.input: + input_file = args.input + else: + input_file = find_latest_jsonl() + if not input_file: + print("❌ No JSONL files found in benchmarks/reports/") + sys.exit(1) + print(f"📊 Auto-detected: {input_file}") + + if not input_file.exists(): + print(f"❌ File not found: {input_file}") + sys.exit(1) + + # Load and validate + print(f"📋 Loading: {input_file}") + schema = load_schema() + data = load_jsonl(input_file) + + print(f"✓ Loaded {len(data)} entries") + + # Validate against schema + if not validate_jsonl(data, schema, input_file): + sys.exit(1) + + print(f"✓ Schema validation passed") + + # Calculate statistics + stats = calculate_statistics(data) + + # Load and calculate comparison statistics if requested + compare_stats = None + if args.compare: + if not args.compare.exists(): + print(f"❌ Comparison file not found: {args.compare}") + sys.exit(1) + print(f"📊 Comparing with: {args.compare}") + compare_data = load_jsonl(args.compare) + if not validate_jsonl(compare_data, schema, args.compare): + sys.exit(1) + compare_stats = calculate_statistics(compare_data) + print(f"✓ Loaded {len(compare_data)} comparison entries") + + # Generate report + markdown = generate_markdown(stats, input_file, args.compare, compare_stats) + + # Determine output file + if args.output: + output_file = args.output + else: + # Auto-generate: BENCHMARK-v1--.md + version = extract_version_from_filename(input_file) or stats["mlx_knife_version"] + date = input_file.stem.split("-v")[0] # Extract date portion + output_file = REPORTS_DIR / f"BENCHMARK-v{TEMPLATE_VERSION}-{version}-{date}.md" + + # Write output + output_file.parent.mkdir(parents=True, exist_ok=True) + with open(output_file, 'w') as f: + f.write(markdown) + + print(f"✅ Generated: {output_file}") + print() + print(f"Summary:") + print(f" Tests: {stats['passed']}/{stats['total_tests']} passed") + print(f" Duration: 
{stats['total_duration']/60:.1f} min") + print(f" Quality: {stats['quality']['clean_percent']:.1f}% clean") + print(f" Models: {len(stats['models'])}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md b/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md new file mode 100644 index 0000000..87fb85b --- /dev/null +++ b/benchmarks/reports/BENCHMARK-v1.0-2.0.4b3-2025-12-20.md @@ -0,0 +1,125 @@ +# Benchmark Report v1.0: 2.0.4b3 + +**Date:** 2025-12-20 +**Generated:** 2025-12-20T14:43:01.786689+00:00 +**Generator:** generate_benchmark_report.py v1.0 +**Hardware:** Mac14,13, 12 cores + +--- + +## Input Files + +- **Primary:** `benchmarks/reports/2025-12-20-v2.0.4b3-2nd_0.2.0_schema.jsonl` +- **Schema:** v0.2.0 + +--- + +## Executive Summary + +**Tests:** 162 total (141 passed, 21 skipped) +**Duration:** 1169.3s (19.5 min) +**Quality:** 100.0% clean (162/162) +**Models:** 22 tested + +✅ **System Health:** All tests clean (0 MB swap, 0 zombies) + +--- + +## Test Summary + +``` +Total tests: 162 +Passed: 141 + With model: 84 + Infrastructure: 57 +Skipped: 21 +Duration: 1169.3s (19.5 min) +``` + +--- + +## System Health + +``` +Swap (MB): min=0, max=0, avg=0.0 +RAM free (GB): min=0.0, max=46.7, avg=19.0 +Zombies: min=0, max=0 + +Quality Flags: + Clean: 162/162 (100.0%) + Degraded (swap): 0 + Degraded (zombies): 0 +``` + +--- + +## Per-Model Statistics + +``` +Model Size Tests Time RAM (GB) +================================================== ======== ====== ========== ==================== +Mistral-Small-3.2-24B-Instruct-2506-8bit 23.3GB 4 102.2s 21.4-25.9 +Qwen3-Coder-30B-A3B-Instruct-6bit-DWQ-lr9e-8 24.9GB 4 97.5s 21.6-26.8 +Mixtral-8x7B-Instruct-v0.1-4bit 24.5GB 4 96.9s 2.7-26.4 +DeepHermes-3-Mistral-24B-Preview-8bit 23.3GB 4 63.0s 0.0-24.6 +OpenCodeInterpreter-DS-33B-hf-4bit-mlx 17.8GB 4 62.9s 17.9-33.0 +Qwen3-32B-4bit 17.2GB 4 48.7s 17.1-20.3 +Klear-46B-A2.5B-Instruct-3bit 18.9GB 4 40.7s 18.9-19.9 
+MiMo-VL-7B-RL-bf16 15.5GB 4 38.9s 14.6-19.7 +gpt-oss-20b-MXFP4-Q8 11.3GB 4 36.6s 14.2-36.4 +Qwen3-30B-A3B-Instruct-2507-4bit 16.0GB 4 34.2s 16.2-23.7 +Qwen3-Coder-30B-A3B-Instruct-4bit 16.0GB 4 33.1s 16.3-17.1 +Mistral-Small-3.2-24B-Instruct-2506-4bit 12.4GB 4 32.9s 13.0-16.9 +Mistral-Small-Instruct-2409-4bit 11.7GB 4 27.6s 12.9-26.2 +Qwen2.5-Coder-7B-Instruct-8bit 7.5GB 4 19.9s 8.5-31.2 +DeepSeek-R1-Distill-Llama-8B-4bit 4.2GB 4 19.7s 20.2-37.6 +pixtral-12b-8bit 12.6GB 2 15.5s 14.3-14.4 +Mistral-7B-Instruct-v0.2-4bit 4.0GB 4 14.1s 8.9-26.2 +Gabliterated-Qwen3-0.6B-float32 2.2GB 4 12.7s 16.1-37.3 +Phi-3-mini-4k-instruct-4bit 2.0GB 4 11.5s 14.6-46.7 +Phi-3.5-mini-instruct-4bit 2.0GB 4 10.2s 12.6-44.6 +Qwen2.5-0.5B-Instruct-4bit 0.3GB 4 9.2s 13.8-46.0 +Llama-3.2-11B-Vision-Instruct-4bit 5.6GB 2 8.9s 10.3-12.1 +``` + +### Model Categories + +``` +LARGE MODELS (≥20 GB): 4 models + Avg size: 24.0 GB + Avg test time: 22.5s + Avg min RAM: 11.5 GB + +MEDIUM MODELS (10-20 GB): 10 models + Avg size: 14.9 GB + Avg test time: 9.7s + Avg min RAM: 15.6 GB + +SMALL MODELS (<10 GB): 8 models + Avg size: 3.5 GB + Avg test time: 3.6s + Avg min RAM: 13.1 GB +``` + +--- + +## Per-Test Statistics + +Shows performance range across models for each test. 
+ +``` +Test Name Models Fastest Slowest Med Time +================================================== ======= ========================= ========================= ======== +test_run_command 22 Qwen2.5 (1.2s) DeepHermes (22.1s) 7.1s +test_run_json_output 22 Qwen2.5 (1.2s) Mistral (13.3s) 7.1s +test_chat_completions_batch 20 Phi (3.3s) Mixtral (30.9s) 8.7s +test_chat_completions_streaming 20 Qwen2.5 (3.4s) Qwen3 (51.3s) 10.6s +``` + + +--- + +## Files + +- **Benchmark report:** `benchmarks/reports/2025-12-20-v2.0.4b3-2nd_0.2.0_schema.jsonl` +- **Schema:** `benchmarks/schemas/report-v0.2.0.schema.json` diff --git a/benchmarks/tools/memmon.py b/benchmarks/tools/memmon.py new file mode 100644 index 0000000..bf57c3e --- /dev/null +++ b/benchmarks/tools/memmon.py @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 +"""Memory Monitor - Standalone tool for tracking memory during subprocess execution. + +Samples RAM, swap, and memory pressure while running any command. +Outputs JSONL with per-sample data and final summary. + +Usage: + # Basic usage + python benchmarks/tools/memmon.py -- pytest -m live_e2e tests_2.0/live/ + + # With options + python benchmarks/tools/memmon.py --interval 200 --output memory.jsonl -- pytest -v + + # Just monitor (no subprocess) + python benchmarks/tools/memmon.py --duration 60 --output memory.jsonl + +Future: Will be part of mlxk-benchmark kit. 
def _read_command_output(argv: list[str]) -> str:
    """Run *argv* with a one-second timeout and return its captured stdout."""
    import subprocess

    return subprocess.run(argv, capture_output=True, text=True, timeout=1).stdout


def _parse_swap_used_mb(swapusage: str) -> float:
    """Return the "used" figure from ``sysctl -n vm.swapusage`` output.

    The expected shape is
    ``total = 0.00M used = 0.00M free = 0.00M (encrypted)``; the value sits
    two tokens after the literal ``used``. Returns 0 when no such token exists.
    """
    tokens = swapusage.split()
    for index, token in enumerate(tokens):
        if token == "used" and index + 2 < len(tokens):
            return float(tokens[index + 2].rstrip("M"))
    return 0


def _parse_vm_stat_available_bytes(vm_stat: str) -> int:
    """Return free + inactive + purgeable + speculative memory, in bytes.

    The page size is taken from the ``page size of N bytes`` header when
    present; otherwise 16384 is assumed.
    """
    page_size = 16384  # Default for Apple Silicon
    page_counts = {
        "Pages free:": 0,
        "Pages inactive:": 0,
        "Pages purgeable:": 0,
        "Pages speculative:": 0,
    }

    for line in vm_stat.splitlines():
        if "page size of" in line:
            page_size = int(line.split()[-2])
            continue
        for label in page_counts:
            if label in line:
                # Counts are printed with a trailing period, e.g. "12345."
                page_counts[label] = int(line.split()[-1].rstrip("."))
                break

    return sum(page_counts.values()) * page_size


def get_memory_sample_native() -> dict:
    """Get memory state using native macOS commands (no psutil).

    Each probe (``sysctl`` for memory pressure and swap, ``vm_stat`` for RAM)
    is best-effort: if the command is missing, times out, or prints something
    unparsable, that field keeps its default instead of raising.

    Returns:
        Dict with the same keys as ``get_memory_sample()``. ``ram_used_gb``,
        ``ram_percent`` and ``swap_percent`` are always 0 here because this
        code path does not compute them.
    """
    # Memory pressure: 1=NORMAL/green, 2=WARN/yellow, 4=CRITICAL/red
    memory_pressure = 1  # Default to NORMAL
    try:
        memory_pressure = int(
            _read_command_output(
                ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"]
            ).strip()
        )
    except Exception:
        # Deliberate best-effort: keep the default on any failure.
        pass

    swap_mb = 0
    try:
        swap_mb = _parse_swap_used_mb(
            _read_command_output(["sysctl", "-n", "vm.swapusage"])
        )
    except Exception:
        pass

    ram_free_gb = 0
    try:
        available_bytes = _parse_vm_stat_available_bytes(
            _read_command_output(["vm_stat"])
        )
        ram_free_gb = round(available_bytes / 1e9, 2)
    except Exception:
        pass

    return {
        "ram_free_gb": ram_free_gb,
        "ram_used_gb": 0,  # Not available without psutil
        "ram_percent": 0,
        "swap_used_mb": swap_mb,
        "swap_percent": 0,
        "memory_pressure": memory_pressure,
    }
def run_with_monitoring(
    command: list[str],
    interval_ms: int = 200,
    output_file: Optional[Path] = None,
    verbose: bool = False
) -> dict:
    """Run a command while monitoring memory.

    Args:
        command: Command and arguments to run
        interval_ms: Sampling interval in milliseconds
        output_file: Optional JSONL output file (one sample per line, then a
            final ``{"summary": ...}`` line)
        verbose: Accepted for CLI compatibility; currently has no effect

    Returns:
        Summary dict with memory statistics plus ``exit_code``, ``command``
        and ``timestamp``. If the monitor collected no samples, the dict
        carries an ``error`` key instead of the statistics.
    """
    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Starting memory monitor (interval: {interval_ms}ms)")
    print(f"Running: {' '.join(command)}")
    print("-" * 60)

    monitor.start()

    # Run subprocess; any failure is mapped to an exit code so the monitor
    # is always stopped and summarized below.
    try:
        exit_code = subprocess.run(command).returncode
    except KeyboardInterrupt:
        exit_code = 130
        print("\nInterrupted")
    except Exception as e:
        exit_code = 1
        print(f"\nError: {e}")

    summary = monitor.stop()
    summary["exit_code"] = exit_code
    summary["command"] = " ".join(command)
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    print("-" * 60)
    print("Memory Monitor Summary:")
    if "error" in summary:
        # stop() returns only {"error": ...} when no sample was taken, so the
        # statistic keys are absent and must not be formatted.
        print(f" {summary['error']}")
    else:
        print(f" Duration: {summary['duration_s']:.1f}s ({summary['samples']} samples)")
        print(f" RAM free: {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
        print(f" Swap peak: {summary['swap_max_mb']:.1f} MB")
    print(f" Exit code: {exit_code}")

    # Write output
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            # Write samples
            for sample in monitor.get_samples():
                f.write(json.dumps(sample) + "\n")
            # Write summary as last line
            f.write(json.dumps({"summary": summary}) + "\n")
        print(f" Output: {output_file}")

    return summary
def main():
    """Command-line entry point for the memory monitor.

    Modes:
        * ``--duration N``: sample memory for N seconds without a subprocess.
        * ``-- CMD ...``: run CMD while sampling, then exit with CMD's exit code.
        * neither: print help and examples, exit with status 1.

    Non-positive ``--interval`` or ``--duration`` values are rejected with an
    argparse usage error (exit status 2).
    """
    parser = argparse.ArgumentParser(
        description="Monitor memory while running a command",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "--interval", "-i",
        type=int,
        default=200,
        help="Sampling interval in milliseconds (default: 200)"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output JSONL file for samples and summary"
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        help="Monitor for fixed duration (seconds), no subprocess"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print samples as they're collected"
    )
    parser.add_argument(
        "command",
        nargs="*",
        help="Command to run (after --)"
    )

    args = parser.parse_args()

    if args.interval <= 0:
        parser.error("--interval must be a positive number of milliseconds")

    # Compare against None: a truthiness test would treat "--duration 0" as
    # "option not given" and silently fall through to the other modes.
    if args.duration is not None:
        if args.duration <= 0:
            parser.error("--duration must be a positive number of seconds")
        # Monitor-only mode
        monitor_only(
            duration_s=args.duration,
            interval_ms=args.interval,
            output_file=args.output
        )
    elif args.command:
        # Run command with monitoring
        summary = run_with_monitoring(
            command=args.command,
            interval_ms=args.interval,
            output_file=args.output,
            verbose=args.verbose
        )
        sys.exit(summary.get("exit_code", 0))
    else:
        parser.print_help()
        print("\nExamples:")
        print(" python benchmarks/tools/memmon.py -- pytest -m live_e2e")
        print(" python benchmarks/tools/memmon.py --duration 10 --output mem.jsonl")
        sys.exit(1)
def parse_benchmark_results(path: Path) -> tuple[list[dict], list[dict]]:
    """Parse benchmark JSONL output.

    Only entries that have both ``timestamp`` and ``duration`` and whose
    ``outcome`` is ``"passed"`` are kept. Blank lines and lines whose JSON
    value is not an object are skipped.

    Args:
        path: Benchmark report in JSONL format (one JSON object per line).

    Returns:
        Tuple of (tests with models, tests without models), split on whether
        the entry has a ``model`` key.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    tests_with_model = []
    tests_without_model = []

    # Read as UTF-8 explicitly rather than the platform default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            if not isinstance(entry, dict):
                # A bare number/list/string cannot be a test record.
                continue
            if "timestamp" not in entry or "duration" not in entry:
                continue
            if entry.get("outcome") != "passed":
                continue

            if "model" in entry:
                tests_with_model.append(entry)
            else:
                tests_without_model.append(entry)

    return tests_with_model, tests_without_model
def get_ram_color(ram_free_gb: float, total_ram_gb: float = 64) -> str:
    """Get color based on RAM availability.

    Thresholds scale with the machine's total RAM: at least 50% free is
    healthy, at least 25% free is a warning, anything lower is critical.
    With the default ``total_ram_gb=64`` these are 32 GB and 16 GB.

    Args:
        ram_free_gb: Currently available RAM in GB.
        total_ram_gb: Total physical RAM in GB used to derive the thresholds.

    Returns:
        CSS ``rgb(...)`` color string (green, orange or red).
    """
    healthy_threshold_gb = total_ram_gb * 0.5
    warning_threshold_gb = total_ram_gb * 0.25

    if ram_free_gb >= healthy_threshold_gb:
        return "rgb(52, 199, 89)"  # Green - healthy
    if ram_free_gb >= warning_threshold_gb:
        return "rgb(255, 149, 0)"  # Orange - warning
    return "rgb(255, 59, 48)"  # Red - critical
Run: pip install plotly") + sys.exit(1) + + # Extract data series + elapsed = [s["elapsed_s"] for s in samples] + ram_free = [s["ram_free_gb"] for s in samples] + swap_used = [s["swap_used_mb"] for s in samples] + memory_pressure = [s.get("memory_pressure", 1) for s in samples] # Default: 1=NORMAL + + # Convert elapsed to minutes for readability + elapsed_min = [e / 60 for e in elapsed] + + # Create figure with secondary y-axis for swap + fig = make_subplots(specs=[[{"secondary_y": True}]]) + + # RAM trace - use marker color based on threshold + # Color each point based on RAM level + colors = [get_ram_color(ram) for ram in ram_free] + + fig.add_trace( + go.Scatter( + x=elapsed_min, + y=ram_free, + mode="lines+markers", + name="RAM Free (GB)", + line=dict(color="rgb(52, 150, 235)", width=1.5), # Blue line + marker=dict( + color=colors, + size=3, + line=dict(width=0), + ), + hovertemplate="Time: %{x:.1f} min
RAM Free: %{y:.1f} GB", + ), + secondary_y=False, + ) + + # Threshold lines (assuming 64 GB total RAM) + max_elapsed_min = max(elapsed_min) if elapsed_min else 20 + total_ram = 64 # GB - could be made configurable later + + fig.add_trace( + go.Scatter( + x=[0, max_elapsed_min], + y=[32, 32], + mode="lines", + name=f"32 GB (50% of {total_ram} GB - healthy)", + line=dict(color="green", width=1, dash="dash"), + hoverinfo="skip", + ), + secondary_y=False, + ) + + fig.add_trace( + go.Scatter( + x=[0, max_elapsed_min], + y=[16, 16], + mode="lines", + name=f"16 GB (25% of {total_ram} GB - warning)", + line=dict(color="orange", width=1, dash="dash"), + hoverinfo="skip", + ), + secondary_y=False, + ) + + # Swap trace (secondary y-axis) + if any(s > 0 for s in swap_used): + fig.add_trace( + go.Scatter( + x=elapsed_min, + y=swap_used, + mode="lines", + name="Swap Used (MB)", + line=dict(color="red", width=2), + hovertemplate="Time: %{x:.1f} min
Swap: %{y:.0f} MB", + ), + secondary_y=True, + ) + + # Model test regions (gray background for each test with model) + # Sort markers by time + model_markers_sorted = sorted(model_markers, key=lambda m: m["start_elapsed"]) + + test_shapes = [] + prev_model_id = None # Track previous model for switch detection + + for i, marker in enumerate(model_markers_sorted): + start_min = marker["start_elapsed"] / 60 + end_min = marker["end_elapsed"] / 60 + + if start_min < 0 or start_min > max_elapsed_min: + continue + + # Add gray rectangle for this individual test + test_shapes.append(dict( + type="rect", + xref="x", yref="y", + x0=start_min, + x1=end_min, + y0=0, y1=70, + fillcolor="rgba(200, 200, 200, 0.3)", # Gray for model tests + layer="below", + line=dict(width=0), + )) + + # Add model label when model CHANGES (not just first occurrence) + model_id = marker["model_id"] + if model_id != prev_model_id: + fig.add_annotation( + x=start_min, + y=1.0, + xref="x", yref="paper", + text=marker["model_short"], + textangle=-90, + font=dict(size=9, color="rgba(0, 0, 0, 0.7)"), + showarrow=False, + xanchor="left", + yanchor="top", + xshift=2, + ) + prev_model_id = model_id + + # Infrastructure test regions (light blue background) + infra_markers_sorted = sorted(infra_markers, key=lambda m: m["start_elapsed"]) + + for marker in infra_markers_sorted: + start_min = marker["start_elapsed"] / 60 + end_min = marker["end_elapsed"] / 60 + + if start_min < 0 or start_min > max_elapsed_min: + continue + + # Add very light blue rectangle for infrastructure tests + test_shapes.append(dict( + type="rect", + xref="x", yref="y", + x0=start_min, + x1=end_min, + y0=0, y1=70, + fillcolor="rgba(173, 216, 230, 0.2)", # Very light blue for infra tests + layer="below", + line=dict(width=0), + )) + + region_shapes = test_shapes + + # Add test markers (small vertical lines) and labels at bottom for both marker types + all_markers = model_markers_sorted + infra_markers_sorted + all_markers_sorted = 
sorted(all_markers, key=lambda m: m["start_elapsed"]) + + for marker in all_markers_sorted: + start_min = marker["start_elapsed"] / 60 + + if start_min < 0 or start_min > max_elapsed_min: + continue + + # Extract test name (shorten if needed) + test_name = marker["test"].split("::")[-1].split("[")[0] + if len(test_name) > 25: + test_name = test_name[:22] + "..." + + fig.add_vline( + x=start_min, + line=dict(color="rgba(128, 128, 128, 0.2)", width=0.5), + ) + + # Add test label at bottom (aligned with start time like model labels) + fig.add_annotation( + x=start_min, + y=0.0, + xref="x", yref="paper", + text=test_name, + textangle=-90, + font=dict(size=9, color="rgba(0, 0, 0, 0.6)"), # Same size as model labels + showarrow=False, + xanchor="left", # Same as model labels (aligned at start) + yanchor="bottom", + xshift=2, # Same offset as model labels + ) + + # Add memory pressure backgrounds (1=normal/white, 2=warn/yellow, 4=critical/red) + pressure_shapes = [] + i = 0 + while i < len(memory_pressure): + pressure = memory_pressure[i] + + if pressure > 1: # 2=WARN or 4=CRITICAL + # Find end of this pressure region + start_min = elapsed_min[i] + j = i + while j < len(memory_pressure) and memory_pressure[j] == pressure: + j += 1 + end_min = elapsed_min[j - 1] if j > i else start_min + + # Color based on pressure level + if pressure == 2: + color = "rgba(255, 204, 0, 0.15)" # Yellow (WARN) + else: # pressure == 4 + color = "rgba(255, 59, 48, 0.15)" # Red (CRITICAL) + + pressure_shapes.append(dict( + type="rect", + xref="x", yref="y", # Changed from "paper" to "y" for rangeslider compatibility + x0=start_min, x1=end_min, + y0=0, y1=70, # Use actual y-axis values + fillcolor=color, + layer="below", + line=dict(width=0), + )) + i = j + else: + i += 1 + + # Combine all shapes (regions first, then pressure on top) + shapes = region_shapes + pressure_shapes + + # Debug output + print(f" Test shapes (gray): {len(region_shapes)}") + print(f" Pressure shapes (yellow/red): 
def main():
    """Command-line entry point: build the memory timeline chart and export it.

    Reads the memmon JSONL file, optionally correlates it with a benchmark
    JSONL file to obtain model/infrastructure test markers, renders the chart
    and writes it as HTML (plotly.js from CDN) or as a static image.
    """
    cli = argparse.ArgumentParser(
        description="Generate memory timeline visualization from benchmark data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument("memory_file", type=Path, help="Memory samples JSONL from memmon.py")
    cli.add_argument(
        "benchmark_file",
        type=Path,
        nargs="?",
        help="Benchmark results JSONL (optional, for model markers)",
    )
    cli.add_argument("-o", "--output", type=Path, help="Output file (default: memory_timeline.html)")
    cli.add_argument(
        "--format",
        choices=["html", "png", "svg"],
        default="html",
        help="Output format (default: html)",
    )
    cli.add_argument("--title", default="Memory Timeline", help="Chart title")

    args = cli.parse_args()

    # Fall back to a name derived from the chosen format when -o is absent.
    destination = args.output or Path(f"memory_timeline.{args.format}")

    print(f"Reading memory samples: {args.memory_file}")
    samples, summary = parse_memory_samples(args.memory_file)
    print(f" Found {len(samples)} samples")

    model_markers, infra_markers = [], []
    if args.benchmark_file:
        print(f"Reading benchmark results: {args.benchmark_file}")
        model_tests, infra_tests = parse_benchmark_results(args.benchmark_file)
        print(f" Found {len(model_tests)} test entries with models")
        print(f" Found {len(infra_tests)} infrastructure test entries")

        # The first memory sample anchors the timeline; without samples there
        # is nothing to correlate against.
        if samples:
            timeline_origin = samples[0]["ts"]
            model_markers = correlate_tests_with_timeline(samples, model_tests, timeline_origin)
            infra_markers = correlate_tests_with_timeline(samples, infra_tests, timeline_origin)
            print(f" Correlated {len(model_markers)} model test markers")
            print(f" Correlated {len(infra_markers)} infrastructure test markers")

    print(f"Generating {args.format.upper()} chart...")
    fig = create_timeline_chart(samples, summary, model_markers, infra_markers, title=args.title)

    if args.format != "html":
        # Static image export (png/svg).
        try:
            fig.write_image(destination, scale=2)
        except Exception as e:
            print("Error: PNG/SVG export requires kaleido: pip install kaleido")
            print(f"Details: {e}")
            sys.exit(1)
    else:
        fig.write_html(destination, include_plotlyjs="cdn", full_html=True)

    print(f"Output: {destination}")
Expect extreme slowness due to swapping.")` → visible via `--log-level warning` (default) and `--log-json` if enabled -## TODO +## Status -### Phase 1 (2.0.4-beta.1) ✅ COMPLETE -- [x] Add `system.memory_total_bytes` to JSON-API -- [x] Schema bump to 0.1.6 -- [x] Document in json-api-specification.md +**Phase 1+2:** ✅ Complete (2.0.4-beta.1) - See CHANGELOG.md -### Phase 2 (2.0.4-beta.1) ✅ COMPLETE -- [x] Implement pre-load memory check in `run.py` - - `_get_system_memory_bytes()` via `sysctl -n hw.memsize` - - `check_memory_before_load()` for CLI path - - `check_memory_for_server()` for server path -- [x] Vision: ERROR + abort if size > 70% total (empirically confirmed: crash at 73%) - - CLI: stderr error + exit 1 - - Server: HTTP 507 + JSON error (via `ErrorType.INSUFFICIENT_MEMORY`) -- [x] Text: Internal log only if size > 70% total (empirically confirmed: no crash at 95-97%) - - CLI: No user-facing action (backwards compatible) - - Server: `logger.warning()` only (uses existing `--log-level`/`--log-json` infrastructure) -- [x] Unit tests: 18 tests in `tests_2.0/test_memory_checks.py` - -### Phase 3 (Future) +**Phase 3 (Future):** Issue #46 - [ ] Configurable threshold (env var or CLI flag) - [ ] Vision overhead estimation based on model architecture - [ ] KV-Cache size estimation based on context length diff --git a/docs/ADR/README.md b/docs/ADR/README.md index 4c5497b..30ef4a5 100644 --- a/docs/ADR/README.md +++ b/docs/ADR/README.md @@ -15,16 +15,16 @@ This directory contains Architecture Decision Records (ADRs) that document signi | [ADR-005](ADR-005-Clone-Implementation-Beta3.md) | Clone Implementation Beta3 | Superseded by ADR-007 | 2025-09-18 | | [ADR-006](ADR-006-Clone-Implementation-Revised.md) | Clone Implementation Revised | Superseded by ADR-007 | 2025-09-18 | | [ADR-007](ADR-007-Clone-Implementation-Fixed.md) | Clone Implementation Fixed Strategy | Accepted | 2025-09-18 | -| [ADR-008](ADR-008-MLXModel-Package-Format.md) | MLXModel Package Format | 
Proposed | 2025-10-17 | +| ADR-008 | MLXModel Package Format | Proposed | (not committed) | | [ADR-009](ADR-009-Stop-Token-Detection-Fix.md) | Stop Token Detection Fix | Implemented | 2025-10-21 | -| [ADR-010](ADR-010-Reasoning-Content-API.md) | Reasoning Content API | Draft | 2025-10-21 | +| ADR-010 | Reasoning Content API | Draft | (not committed) | | [ADR-011](ADR-011-E2E-Live-Test-Architecture.md) | E2E Live Test Architecture | Implemented | 2025-10-21 | | [ADR-012](ADR-012-Vision-Support-Roadmap.md) | Vision Support Roadmap | Implemented (Phase 1-3) | 2025-11-12 | -| [ADR-013](ADR-013-Community-Model-Quality-Database.md) | Community Model Quality Database | Planned | 2025-11-13 | +| ADR-013 | Community Model Quality Database | Planned | (not committed) | | [ADR-014](ADR-014-Unix-Pipe-Integration.md) | Unix Pipe Integration | Implemented (Phase 1) | 2025-11-16 | -| [ADR-016](ADR-016-Memory-Aware-Model-Loading.md) | Memory-Aware Model Loading | Implemented | 2025-11-20 | -| ADR-015 | Embeddings API | Planned | (future) | -| ADR-017 | Image Metadata & RAG | Proposed | (future) | +| ADR-015 | Embeddings API | Planned | (not committed) | +| [ADR-016](ADR-016-Memory-Aware-Model-Loading.md) | Memory-Aware Model Loading | Implemented (Phase 1-2) | 2025-12-05 | +| ADR-017 | Image Metadata Extraction (EXIF) | Implemented | (not committed) | ## ADR Format diff --git a/docs/SERVER-HANDBOOK.md b/docs/SERVER-HANDBOOK.md index 78986f8..54c1559 100644 --- a/docs/SERVER-HANDBOOK.md +++ b/docs/SERVER-HANDBOOK.md @@ -26,7 +26,7 @@ mlxk serve --host 0.0.0.0 --port 8000 - Python 3.9+ (Text models) - Python 3.10+ (Vision models) - mlx-lm 0.28.4+ -- mlx-vlm 0.3.9+ (optional, for vision) +- mlx-vlm 0.3.9+ (optional, for vision; beta.3 recommends commit c4ea290e47e2155b67d94c708c662f8ab64e1b37) --- @@ -380,6 +380,9 @@ python -m mlxk2.core.server_base pyenv install 3.10 pyenv local 3.10 pip install mlx-lm mlx-vlm + +# Beta.3 (pre-0.3.10 fix) +pip install mlx-lm "mlx-vlm @ 
git+https://github.com/Blaizzy/mlx-vlm.git@c4ea290e47e2155b67d94c708c662f8ab64e1b37" ``` ### Memory Constraint Errors (HTTP 507) diff --git a/docs/json-api-specification.md b/docs/json-api-specification.md index f69ed01..8bc0592 100644 --- a/docs/json-api-specification.md +++ b/docs/json-api-specification.md @@ -1,8 +1,8 @@ # MLX-Knife 2.0 JSON API Specification **Specification Version:** 0.1.6 -**Status:** Alpha - Subject to change -**Target:** MLX-Knife 2.0.4-beta.1 +**Status:** Stable (backward-compatible) +**Released:** MLX-Knife 2.0.4-beta.1 > Based on [GitHub Issue #8](https://github.com/mzau/mlx-knife/issues/8) - Comprehensive JSON output support for all commands @@ -551,29 +551,31 @@ mlxk-json show "Phi-3-mini" --config --json # Include config.json content } ``` -## Changes in 0.1.6 (Alpha) +## Changes in 0.1.6 (Stable, 2.0.4-beta.1) -**ADR-016 Preparation: System Memory Information** +**System Memory Information** - Added `system` object to `version` command response - `system.memory_total_bytes`: Total physical RAM in bytes (from `sysctl hw.memsize`) - `system` is `null` on non-macOS platforms where sysctl is unavailable -- Enables ADR-016 Memory-Aware Model Loading (pre-load memory checks) +- Enables memory-aware model loading (ADR-016) -**ADR-012: Vision Support - Model Discovery** +**Model Discovery: Vision capability flag** - Vision models detected via `preprocessor_config.json` presence -- `vision` capability added to model discovery (backward-compatible enum extension) +- `vision` added to `capabilities` enum (backward-compatible extension) - Visible in `mlxk list --json`, `mlxk show --json`, `mlxk health --json` - Example: `"capabilities": ["text-generation", "chat", "vision"]` -**Note on `mlxk run --image` (CLI):** -- `mlxk run --image` command exists for vision models (ADR-012 Phase 1b) -- Current output: Text mode only (Markdown table with filename mapping) -- JSON output: Deferred to ADR-017 Phase 2 (requires formal schema extension) -- Server 
OpenAI Vision API documented in `docs/SERVER-HANDBOOK.md` +**Note:** Vision runtime support (`mlxk run --image`, Server API) is documented in README.md "Multi-Modal Support" and `docs/SERVER-HANDBOOK.md`. -## Changes in 0.1.5 (Alpha) +## Changes in 0.1.5 (Stable, 2.0.0) + +**Foundation: Model Object Schema** + +- Standardized `modelObject` across all commands +- Machine-readable fields: `size_bytes`, `last_modified` (ISO-8601 UTC with `Z`) +- No human-readable `size` or `modified` fields (JSON consumers parse structured data) **Issue #36: Separate Integrity and Runtime Compatibility Checks** @@ -587,12 +589,6 @@ mlxk-json show "Phi-3-mini" --config --json # Include config.json content - Gate logic: Runtime check requires passing integrity check first - `reason` field describes first problem found (integrity > runtime priority) -## Changes in 0.1.2 (Alpha) - -- Introduced a common minimal Model Object for consistency across commands. -- Replaced human-readable `size` with machine-friendly `size_bytes`. -- Removed human-readable `modified`; `last_modified` (ISO-8601 UTC) is authoritative. - ## Operations ### `mlxk-json pull --json` diff --git a/mlxk2/__init__.py b/mlxk2/__init__.py index d68e38b..8b9039b 100644 --- a/mlxk2/__init__.py +++ b/mlxk2/__init__.py @@ -7,4 +7,4 @@ import warnings # Issue parity with 1.1.0 (Issue #22) warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+') -__version__ = "2.0.4b2" +__version__ = "2.0.4b3" diff --git a/mlxk2/core/capabilities.py b/mlxk2/core/capabilities.py index 09427a2..e8516fd 100644 --- a/mlxk2/core/capabilities.py +++ b/mlxk2/core/capabilities.py @@ -167,7 +167,11 @@ def _has_any(path: Path, patterns: Tuple[str, ...]) -> bool: def _detect_vision_from_config(config: Optional[Dict[str, Any]]) -> bool: - """Detect vision capability from config.json content.""" + """Detect vision capability from config.json content. 
+ + Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision. + mlx-vlm only supports image vision models (AutoImageProcessor). + """ if not isinstance(config, dict): return False @@ -181,15 +185,29 @@ def _detect_vision_from_config(config: Optional[Dict[str, Any]]) -> bool: return True # Check for embedded preprocessor_config - if isinstance(config.get("preprocessor_config"), dict): + preprocessor_cfg = config.get("preprocessor_config") + if isinstance(preprocessor_cfg, dict): + # Exclude video processors (requires PyTorch/Torchvision) + if preprocessor_cfg.get("processor_class") == "AutoVideoProcessor": + return False + if "temporal_patch_size" in preprocessor_cfg: + return False return True return False def _detect_vision_from_files(model_path: Path) -> bool: - """Detect vision capability from file presence.""" - return _has_any( + """Detect vision capability from file presence. + + Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision. + mlx-vlm only supports image vision models (AutoImageProcessor). 
+ """ + # Check if it's a video model (requires PyTorch/Torchvision) + if (model_path / "video_preprocessor_config.json").exists(): + return False + + if _has_any( model_path, ( "preprocessor_config.json", @@ -199,7 +217,25 @@ def _detect_vision_from_files(model_path: Path) -> bool: "**/processor_config.json", "**/image_processor_config.json", ), - ) + ): + # Found vision-related files, but check if it's a video processor + preprocessor_path = model_path / "preprocessor_config.json" + if preprocessor_path.exists(): + try: + import json + with open(preprocessor_path) as f: + preprocessor_data = json.load(f) + if isinstance(preprocessor_data, dict): + # Video model indicators + if preprocessor_data.get("processor_class") == "AutoVideoProcessor": + return False + if "temporal_patch_size" in preprocessor_data: + return False + except Exception: + pass + return True + + return False def _check_mlx_vlm_available() -> bool: diff --git a/mlxk2/core/server_base.py b/mlxk2/core/server_base.py index f9d6cfd..be7db39 100644 --- a/mlxk2/core/server_base.py +++ b/mlxk2/core/server_base.py @@ -118,8 +118,6 @@ def get_or_load_model(model_spec: str, verbose: bool = False) -> Any: raise HTTPException(status_code=503, detail="Server is shutting down") # Simple approach like run command - let MLXRunner handle everything if _current_model_path != model_spec: - logger.info(f"Switching to model: {model_spec}", model=model_spec) - # Clean up previous model if _model_cache: try: @@ -229,8 +227,7 @@ def get_or_load_model(model_spec: str, verbose: bool = False) -> Any: _model_cache[model_spec] = runner _current_model_path = model_spec - backend_name = "vision" if policy.backend == Backend.MLX_VLM else "text" - logger.info(f"Model loaded successfully ({backend_name}): {model_spec}", model=model_spec) + logger.info(f"Switched to model: {model_spec}", model=model_spec) except HTTPException: # Re-raise HTTP exceptions (501, 507, etc.) 
from vision/memory checks @@ -767,11 +764,10 @@ async def list_models(): """List available MLX models in the cache. Returns models sorted with preloaded model first (if set), then alphabetically. - Filters to healthy MLX models (runtime compatibility deferred to P2 refactoring). + Filters to healthy + runtime_compatible models. """ from .cache import cache_dir_to_hf - from ..operations.common import detect_framework, read_front_matter - from ..operations.health import is_model_healthy + from ..operations.common import build_model_object model_list = [] model_cache = get_current_model_cache() @@ -783,8 +779,7 @@ async def list_models(): model_name = cache_dir_to_hf(model_dir.name) try: - # Check if it's a healthy MLX model - # Get the latest snapshot for detection + # Get snapshot path snapshots_dir = model_dir / "snapshots" selected_path = None if snapshots_dir.exists(): @@ -792,27 +787,21 @@ async def list_models(): if snapshots: selected_path = snapshots[0] - # Read front-matter for framework detection (align with CLI behavior) - probe = selected_path if selected_path is not None else model_dir - fm = read_front_matter(probe) + # Use shared build_model_object (single source of truth) + model_obj = build_model_object(model_name, model_dir, selected_path) - framework = detect_framework(model_name, model_dir, selected_path, fm) - healthy, _ = is_model_healthy(model_name) - - # Filter: Only MLX + healthy models - # TODO P2: Add runtime_compatible check (needs refactoring to avoid duplication) - if framework != "MLX" or not healthy: + # Filter: healthy AND runtime_compatible + if model_obj.get("health") != "healthy": + continue + if not model_obj.get("runtime_compatible"): continue # Get model context length (best effort) context_length = None try: - snapshots_dir = model_dir / "snapshots" - if snapshots_dir.exists(): - snapshots = [d for d in snapshots_dir.iterdir() if d.is_dir()] - if snapshots: - from .runner import get_model_context_length - context_length = 
get_model_context_length(str(snapshots[0])) + if selected_path: + from .runner import get_model_context_length + context_length = get_model_context_length(str(selected_path)) except Exception: pass diff --git a/mlxk2/core/vision_runner.py b/mlxk2/core/vision_runner.py index bce9071..e41409d 100644 --- a/mlxk2/core/vision_runner.py +++ b/mlxk2/core/vision_runner.py @@ -97,9 +97,8 @@ class VisionRunner: raise RuntimeError("mlx-vlm is missing load()/generate() API") # mlx-vlm expects HF repo_id, not local path - # fix_mistral_regex=True: Suppress tokenizer regex warning for Mistral-based models # local_files_only=True: Use mlx-knife's cache only, never download (pull's responsibility) - loaded = self._load(self.model_name, fix_mistral_regex=True, local_files_only=True) + loaded = self._load(self.model_name, local_files_only=True) if isinstance(loaded, tuple): # Common pattern: (model, processor) self.model = loaded[0] if len(loaded) > 0 else None @@ -256,9 +255,9 @@ class VisionRunner: lat = convert_to_degrees(gps_dict.get("GPSLatitude")) lon = convert_to_degrees(gps_dict.get("GPSLongitude")) - if lat and gps_dict.get("GPSLatitudeRef") == "S": + if lat is not None and gps_dict.get("GPSLatitudeRef") == "S": lat = -lat - if lon and gps_dict.get("GPSLongitudeRef") == "W": + if lon is not None and gps_dict.get("GPSLongitudeRef") == "W": lon = -lon exif.gps_lat = lat @@ -280,7 +279,7 @@ class VisionRunner: exif.camera = str(camera).strip() # Return None if no useful EXIF found - if not any([exif.gps_lat, exif.gps_lon, exif.datetime, exif.camera]): + if all(x is None for x in [exif.gps_lat, exif.gps_lon, exif.datetime, exif.camera]): return None return exif diff --git a/mlxk2/operations/common.py b/mlxk2/operations/common.py index 3e96519..1b16d5a 100644 --- a/mlxk2/operations/common.py +++ b/mlxk2/operations/common.py @@ -144,7 +144,8 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat MLX if: - org is mlx-community/*, or - README 
front-matter tags include 'mlx', or - - README front-matter library_name == 'mlx'. + - README front-matter library_name == 'mlx', or + - config.json contains 'quantization' key (MLX-specific). Else GGUF if any *.gguf present under selected_path or snapshots. Else PyTorch if any *.safetensors or pytorch_model.bin present under snapshots. @@ -154,6 +155,13 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat if "mlx-community/" in hf_name: return "MLX" + # Search location preference: selected snapshot, else model root + root = selected_path if selected_path is not None else model_root + + # Read front-matter if not provided (Issue #48: self-contained detection) + if fm is None: + fm = read_front_matter(root) + # Front-matter signals if fm is not None: tags = [t.lower() for t in (fm.tags or [])] @@ -161,8 +169,10 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat if "mlx" in tags or lib == "mlx": return "MLX" - # Search location preference: selected snapshot, else model root - root = selected_path if selected_path is not None else model_root + # Config-based detection: 'quantization' key is MLX-specific (Issue #48) + config = _load_config_json(root) + if config and "quantization" in config: + return "MLX" if _has_any(root, ("**/*.gguf",)): return "GGUF" @@ -176,7 +186,7 @@ def detect_framework(hf_name: str, model_root: Path, selected_path: Optional[Pat return "Unknown" -def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints: Dict[str, Any]) -> str: +def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints: Dict[str, Any], probe: Optional[Path] = None) -> str: name = hf_name.lower() if "embed" in name: return "embedding" @@ -190,13 +200,20 @@ def detect_model_type(hf_name: str, config: Optional[Dict[str, Any]], tok_hints: ct = tok_hints.get("chat_template") if isinstance(ct, str) and ct.strip(): return "chat" + # Check for chat_template.json file (Issue #48: 
reliable indicator) + if probe is not None and (probe / "chat_template.json").exists(): + return "chat" if "instruct" in name or "chat" in name: return "chat" return "base" def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> bool: - """Detect whether the model snapshot supports vision inputs.""" + """Detect whether the model snapshot supports vision inputs. + + Video models (AutoVideoProcessor) are excluded as they require PyTorch/Torchvision. + mlx-vlm only supports image vision models (AutoImageProcessor). + """ try: if isinstance(config, dict): mt = config.get("model_type") @@ -208,6 +225,9 @@ def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> b preprocessor_cfg = config.get("preprocessor_config") if isinstance(preprocessor_cfg, dict): + # Exclude video processors (requires PyTorch/Torchvision) + if preprocessor_cfg.get("processor_class") == "AutoVideoProcessor": + return False return True if _has_any( @@ -221,6 +241,25 @@ def detect_vision_capability(probe: Path, config: Optional[Dict[str, Any]]) -> b "**/image_processor_config.json", ), ): + # Check if it's a video processor (requires PyTorch/Torchvision) + # Video models have video_preprocessor_config.json or temporal_patch_size + if (probe / "video_preprocessor_config.json").exists(): + return False + + preprocessor_path = probe / "preprocessor_config.json" + if preprocessor_path.exists(): + try: + import json + with open(preprocessor_path) as f: + preprocessor_data = json.load(f) + if isinstance(preprocessor_data, dict): + # Video model indicators + if preprocessor_data.get("processor_class") == "AutoVideoProcessor": + return False + if "temporal_patch_size" in preprocessor_data: + return False + except Exception: + pass return True except Exception: return False @@ -308,7 +347,7 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P config = _load_config_json(probe) framework = detect_framework(hf_name, model_root, 
selected_path=selected_path, fm=fm) - model_type = detect_model_type(hf_name, config, tok) + model_type = detect_model_type(hf_name, config, tok, probe) capabilities = detect_capabilities(model_type, hf_name, tok, config, probe) has_vision = "vision" in capabilities @@ -316,17 +355,21 @@ def build_model_object(hf_name: str, model_root: Path, selected_path: Optional[P healthy, health_reason = is_model_healthy(hf_name) # Runtime compatibility: ALWAYS computed (gate logic applies) - # Gate: Only check runtime if file integrity is healthy + # Gate 1: File integrity must be healthy + # Gate 2: Framework must be MLX (only backend supported) runtime_reason: Optional[str] = None - if healthy: - if has_vision: - runtime_compatible, runtime_reason = vision_runtime_compatibility() - else: - runtime_compatible, runtime_reason = check_runtime_compatibility(probe, framework) - else: + if not healthy: # File integrity failed → skip runtime check runtime_compatible = False runtime_reason = None # health_reason takes precedence + elif framework != "MLX": + # Non-MLX frameworks not supported (PyTorch, GGUF, etc.) 
+ runtime_compatible = False + runtime_reason = f"Incompatible framework: {framework}" + elif has_vision: + runtime_compatible, runtime_reason = vision_runtime_compatibility() + else: + runtime_compatible, runtime_reason = check_runtime_compatibility(probe, framework) # Reason field: First problem encountered (health → runtime) reason = health_reason if not healthy else runtime_reason diff --git a/mlxk2/output/human.py b/mlxk2/output/human.py index 7c56dc3..ef59c9b 100644 --- a/mlxk2/output/human.py +++ b/mlxk2/output/human.py @@ -134,25 +134,20 @@ def render_list(data: Dict[str, Any], show_health: bool, show_all: bool, verbose headers.append("Health") # Human filter: - # - --all: show everything - # - default: show only MLX chat models (safer for run/server selection) - # - --verbose (without --all): show all MLX models (chat + base) + # - --all: show everything (no filter) + # - default/verbose: only healthy + runtime_compatible (runnable models) + # Same filter as Server /v1/models - single source of truth via build_model_object filtered: List[Dict[str, Any]] = [] for m in models: - fw = str(m.get("framework", "")).upper() - typ = str(m.get("model_type", "")).lower() if show_all: filtered.append(m) else: - if fw != "MLX": + # Filter: healthy AND runtime_compatible + if m.get("health") != "healthy": continue - if verbose: - # In verbose mode, show all MLX models - filtered.append(m) - else: - # Default compact mode: only MLX chat - if typ == "chat": - filtered.append(m) + if not m.get("runtime_compatible"): + continue + filtered.append(m) rows: List[List[str]] = [] for m in filtered: diff --git a/pyproject.toml b/pyproject.toml index f397ce9..7dea8f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", ] dependencies = [ - "huggingface-hub>=0.34.0,<1.0", + "huggingface-hub>=0.34.0", "requests>=2.32.0", "mlx-lm>=0.28.4", "mlx>=0.29.0", @@ -66,7 +66,7 @@ dev = [ "mypy>=1.5.0", ] 
vision = [ - "mlx-vlm>=0.3.9", # Vision Language Models support (ADR-012, requires Python 3.10+) + "mlx-vlm>=0.3.9", # Vision Language Models support (ADR-012, requires Python 3.10+; beta.3 recommends mlx-vlm commit c4ea290e47e2155b67d94c708c662f8ab64e1b37) ] [tool.setuptools] diff --git a/tests_2.0/live/conftest.py b/tests_2.0/live/conftest.py index dda3ac2..b7ff59e 100644 --- a/tests_2.0/live/conftest.py +++ b/tests_2.0/live/conftest.py @@ -21,6 +21,7 @@ from .test_utils import ( discover_mlx_models_in_user_cache, discover_text_models, discover_vision_models, + parse_vm_stat_page_size, TEST_MODELS, ) @@ -499,9 +500,176 @@ def report_benchmark(request): # ============================================================================ -# Benchmark Reporting (ADR-013 Phase 0) +# Benchmark Reporting (ADR-013 Phase 0 + 0.5) # ============================================================================ +def _get_macos_system_health() -> Dict[str, Any]: + """Collect macOS system health metrics (ADR-013 Phase 0.5 - v0.2.0). + + Uses macOS-native tools (sysctl, vm_stat, ps) - ZERO new dependencies. + Enables automatic regression quality assessment via quality_flags. 
+ + Returns: + dict: System health metrics with keys: + - swap_used_mb: Current swap usage in MB + - ram_free_gb: Available RAM in GB + - zombie_processes: Count of zombie processes + - quality_flags: List of quality indicators + ["clean"] = healthy system + ["degraded_swap"] = swap usage detected (memory pressure) + ["degraded_zombies"] = zombie processes detected + + Quality Thresholds (empirically derived from Session 43 analysis): + - Swap: >100 MB indicates memory pressure (beta2→beta3: 1.8 GB swap = +3.4% slowdown) + - Zombies: >0 indicates stuck processes (REGRESSION-2025-12-08: 14 zombies = +90% slowdown) + """ + import subprocess + + health = { + "swap_used_mb": 0, + "ram_free_gb": 0.0, + "zombie_processes": 0, + "quality_flags": [] + } + + try: + # Get swap usage via sysctl (macOS native) + # sysctl vm.swapusage returns: "vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)" + result = subprocess.run( + ["sysctl", "vm.swapusage"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + # Parse: "total = X.XXM used = Y.YYM free = Z.ZZM" + for part in result.stdout.split(): + if part.endswith("M") and "used" in result.stdout: + # Extract used value (appears after "used = ") + parts = result.stdout.split("used = ") + if len(parts) > 1: + used_str = parts[1].split()[0] + # Parse size (can be M or G suffix) + if used_str.endswith("G"): + health["swap_used_mb"] = int(float(used_str[:-1]) * 1024) + elif used_str.endswith("M"): + health["swap_used_mb"] = int(float(used_str[:-1])) + break + except Exception: + pass # Swap metric is optional (not critical if it fails) + + try: + # Get free RAM via vm_stat (macOS native) + # vm_stat reports page size in the header (Apple Silicon uses 16KB pages). + result = subprocess.run( + ["vm_stat"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + page_size = parse_vm_stat_page_size(result.stdout) + # Parse "Pages free: 12345." 
+ for line in result.stdout.splitlines(): + if "Pages free:" in line: + pages_free = int(line.split(":")[1].strip().rstrip(".")) + health["ram_free_gb"] = round(pages_free * page_size / (1024**3), 2) + break + except Exception: + pass # RAM metric is optional + + try: + # Get zombie process count via ps aux (macOS native) + # Zombies show as "<defunct>" in ps output + result = subprocess.run( + ["ps", "aux"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + # Count lines containing "<defunct>" + health["zombie_processes"] = result.stdout.count("<defunct>") + except Exception: + pass # Zombie count is optional + + # Determine quality flags (empirical thresholds from regression analysis) + flags = [] + if health["swap_used_mb"] > 100: + flags.append("degraded_swap") + if health["zombie_processes"] > 0: + flags.append("degraded_zombies") + + # If no degradation detected, mark as clean + if not flags: + flags.append("clean") + + health["quality_flags"] = flags + return health + + +def _get_macos_hardware_profile() -> Dict[str, Any]: + """Collect macOS hardware profile (ADR-013 Phase 0.5 - v0.2.0). + + Uses macOS-native sysctl - ZERO new dependencies. + Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4). 
+ + Returns: + dict: Hardware profile with keys: + - model: Mac model identifier (e.g., "Mac14,9" = M3 Max) + - cores_physical: Physical CPU cores (P-cores only) + - cores_logical: Logical CPU cores (P+E cores with hyperthreading) + """ + import subprocess + + profile = { + "model": "unknown", + "cores_physical": 0, + "cores_logical": 0, + } + + try: + # Get Mac model identifier + result = subprocess.run( + ["sysctl", "-n", "hw.model"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + profile["model"] = result.stdout.strip() + except Exception: + pass + + try: + # Get physical cores (P-cores) + result = subprocess.run( + ["sysctl", "-n", "hw.physicalcpu"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + profile["cores_physical"] = int(result.stdout.strip()) + except Exception: + pass + + try: + # Get logical cores (P+E cores with hyperthreading) + result = subprocess.run( + ["sysctl", "-n", "hw.logicalcpu"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + profile["cores_logical"] = int(result.stdout.strip()) + except Exception: + pass + + return profile + + def pytest_addoption(parser): """Add --report-output option for benchmark reporting.""" parser.addoption( @@ -509,7 +677,7 @@ def pytest_addoption(parser): action="store", default=None, metavar="PATH", - help="Generate benchmark reports to JSONL file (ADR-013 Phase 0)" + help="Generate benchmark reports to JSONL file (ADR-013 Phase 0.5)" ) @@ -534,11 +702,16 @@ def pytest_runtest_makereport(item, call): Reports are written as JSONL (one JSON object per line) to allow streaming and easy appending across test runs. 
- Schema version: 0.1.0 (Phase 0 - Experimental) - See: benchmarks/schemas/report-v0.1.schema.json + Schema version: 0.2.0 (Phase 0.5 - System Health + Hardware Profile) + See: ADR-013 Phase 0.5 implementation + + Changelog from 0.1.0 → 0.2.0: + - Added: system.hardware_profile (Mac model, cores) + - Added: system_health (swap, RAM, zombies, quality_flags) + - Backward compatible: All 0.1.0 fields preserved """ import json - from datetime import datetime + from datetime import datetime, timezone outcome = yield report = outcome.get_result() @@ -553,8 +726,8 @@ def pytest_runtest_makereport(item, call): # Build report data (required fields) data = { - "schema_version": "0.1.0", - "timestamp": datetime.utcnow().isoformat() + "Z", + "schema_version": "0.2.0", + "timestamp": datetime.now(timezone.utc).isoformat(), "mlx_knife_version": __version__, "test": item.nodeid, "outcome": report.outcome, @@ -581,6 +754,20 @@ def pytest_runtest_makereport(item, call): # Everything else goes to metadata data.setdefault("metadata", {})[key] = value + # ADR-013 Phase 0.5: Collect system health metrics (v0.2.0) + # Enables automatic regression quality assessment + system_health = _get_macos_system_health() + data["system_health"] = system_health + + # ADR-013 Phase 0.5: Collect hardware profile (v0.2.0) + # Enables hardware-specific performance analysis (M1 vs M2 vs M3 vs M4) + hardware_profile = _get_macos_hardware_profile() + + # Add hardware_profile to system section (create if not exists) + if "system" not in data: + data["system"] = {} + data["system"]["hardware_profile"] = hardware_profile + # Write JSONL (one line per report) try: item.config.report_file.write(json.dumps(data) + "\n") @@ -588,4 +775,3 @@ def pytest_runtest_makereport(item, call): except Exception as e: # Don't fail tests if reporting fails print(f"\n⚠️ Benchmark report write failed: {e}") - diff --git a/tests_2.0/live/test_server_e2e.py b/tests_2.0/live/test_server_e2e.py index a877fa7..5803ee6 100644 --- 
a/tests_2.0/live/test_server_e2e.py +++ b/tests_2.0/live/test_server_e2e.py @@ -42,6 +42,8 @@ from .test_utils import ( # Server request timeout (increased from 30s to 45s in Session 22) # Accounts for: baseline (15s) + probe/policy overhead (2.7s) + generation + safety margin SERVER_REQUEST_TIMEOUT = 45.0 +# /v1/models can be slower due to cache scans + runtime checks +MODEL_LIST_TIMEOUT = 20.0 # Opt-in markers pytestmark = [ @@ -100,7 +102,7 @@ class TestServerHealthEndpoints: pytest.skip("No text models available within RAM budget") with LocalServer(test_model) as server_url: - response = httpx.get(f"{server_url}/v1/models") + response = httpx.get(f"{server_url}/v1/models", timeout=MODEL_LIST_TIMEOUT) assert response.status_code == 200 data = response.json() diff --git a/tests_2.0/live/test_utils.py b/tests_2.0/live/test_utils.py index 3022f23..c6f2141 100644 --- a/tests_2.0/live/test_utils.py +++ b/tests_2.0/live/test_utils.py @@ -8,6 +8,7 @@ Provides: from __future__ import annotations +import re import sys from pathlib import Path from typing import Dict, Any, Tuple @@ -108,6 +109,14 @@ def get_system_memory_bytes() -> int: return 0 +def parse_vm_stat_page_size(output: str) -> int: + """Extract vm_stat page size in bytes, falling back to 4096.""" + match = re.search(r"page size of (\d+) bytes", output) + if match: + return int(match.group(1)) + return 4096 + + def discover_text_models() -> list[Dict[str, Any]]: """Discover text-only models (filter out Vision models). diff --git a/tests_2.0/live/test_vm_stat_parsing.py b/tests_2.0/live/test_vm_stat_parsing.py new file mode 100644 index 0000000..500a6ab --- /dev/null +++ b/tests_2.0/live/test_vm_stat_parsing.py @@ -0,0 +1,13 @@ +"""Unit tests for vm_stat parsing helpers.""" + +from .test_utils import parse_vm_stat_page_size + + +def test_parse_vm_stat_page_size_apple_silicon(): + output = "Mach Virtual Memory Statistics: (page size of 16384 bytes)\nPages free: 12345." 
+ assert parse_vm_stat_page_size(output) == 16384 + + +def test_parse_vm_stat_page_size_fallback(): + output = "Pages free: 12345." + assert parse_vm_stat_page_size(output) == 4096 diff --git a/tests_2.0/test_human_output.py b/tests_2.0/test_human_output.py index bcd3d21..fec5e06 100644 --- a/tests_2.0/test_human_output.py +++ b/tests_2.0/test_human_output.py @@ -18,6 +18,7 @@ def sample_list_data(): "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", + "runtime_compatible": True, "cached": True, }, { @@ -29,6 +30,7 @@ def sample_list_data(): "model_type": "base", "capabilities": ["text-generation"], "health": "unhealthy", + "runtime_compatible": False, "cached": True, }, ], @@ -98,6 +100,7 @@ def test_list_human_filters_mlx_base_default(): "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", + "runtime_compatible": True, "cached": True, }, { @@ -109,26 +112,42 @@ def test_list_human_filters_mlx_base_default(): "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", + "runtime_compatible": True, + "cached": True, + }, + { + "name": "org/Unhealthy", + "hash": None, + "size_bytes": 500, + "last_modified": "2025-08-30T12:00:00Z", + "framework": "MLX", + "model_type": "chat", + "capabilities": ["text-generation"], + "health": "unhealthy", + "runtime_compatible": False, "cached": True, }, ], - "count": 2, + "count": 3, }, "error": None, } - # Default (compact) should hide MLX base + # Default: shows healthy + runtime_compatible models (both MLXChat and MLXBase) out_default = render_list(data, show_health=False, show_all=False, verbose=False) assert "MLXChat" in out_default - assert "MLXBase" not in out_default + assert "MLXBase" in out_default + assert "Unhealthy" not in out_default - # Verbose (without --all) shows all MLX (chat + base) + # Verbose: same filter, more columns out_verbose = render_list(data, show_health=False, show_all=False, verbose=True) assert "MLXChat" in 
out_verbose assert "MLXBase" in out_verbose + assert "Unhealthy" not in out_verbose -def test_list_human_verbose_shows_all_mlx_only(): +def test_list_human_filters_by_healthy_and_runtime_compatible(): + """Test that default/verbose filters by healthy + runtime_compatible.""" from mlxk2.output.human import render_list data = { @@ -136,9 +155,9 @@ def test_list_human_verbose_shows_all_mlx_only(): "command": "list", "data": { "models": [ - {"name": "org/MLXChat", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", "cached": True}, - {"name": "org/MLXBase", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "cached": True}, - {"name": "org/OtherPT", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "PyTorch", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "cached": True}, + {"name": "org/Runnable", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "chat", "capabilities": ["text-generation", "chat"], "health": "healthy", "runtime_compatible": True, "cached": True}, + {"name": "org/Unhealthy", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "MLX", "model_type": "base", "capabilities": ["text-generation"], "health": "unhealthy", "runtime_compatible": True, "cached": True}, + {"name": "org/NotCompatible", "hash": None, "size_bytes": 1, "last_modified": "2025-08-30T12:00:00Z", "framework": "PyTorch", "model_type": "base", "capabilities": ["text-generation"], "health": "healthy", "runtime_compatible": False, "cached": True}, ], "count": 3, }, @@ -146,11 +165,12 @@ def test_list_human_verbose_shows_all_mlx_only(): } out_verbose = render_list(data, show_health=False, 
show_all=False, verbose=True) - # Shows both MLX models (chat+base) - assert "MLXChat" in out_verbose - assert "MLXBase" in out_verbose - # Hides non-MLX - assert "OtherPT" not in out_verbose + # Shows only healthy + runtime_compatible + assert "Runnable" in out_verbose + # Hides unhealthy + assert "Unhealthy" not in out_verbose + # Hides not runtime_compatible + assert "NotCompatible" not in out_verbose def test_list_human_all_shows_all_frameworks(): diff --git a/tests_2.0/test_issue_30_preflight.py b/tests_2.0/test_issue_30_preflight.py index 722a57e..5496bed 100644 --- a/tests_2.0/test_issue_30_preflight.py +++ b/tests_2.0/test_issue_30_preflight.py @@ -4,6 +4,21 @@ import pytest from mlxk2.operations.pull import preflight_repo_access, pull_operation +def _create_mock_response(status_code=403): + """Create a mock httpx.Response for huggingface-hub 1.x exceptions. + + Hub 1.x requires response parameter to be a real httpx.Response object. + """ + try: + import httpx + # Create minimal mock response + request = httpx.Request("GET", "https://huggingface.co/api/models/test") + return httpx.Response(status_code=status_code, request=request) + except ImportError: + # Fallback for older hub versions that don't need it + return None + + def test_preflight_private_model_without_token(monkeypatch): """Test preflight check with a known private model without token. 
@@ -29,7 +44,8 @@ def test_preflight_private_model_without_token(monkeypatch): from huggingface_hub import errors as _hub_errors GatedRepoError = _hub_errors.GatedRepoError def _fake_model_info(self, repo_id, token=None): - raise GatedRepoError("Gated/private repository") + response = _create_mock_response(status_code=403) + raise GatedRepoError("Gated/private repository", response=response) monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True) success, error = preflight_repo_access("org/private-model") @@ -53,7 +69,8 @@ def test_preflight_nonexistent_model(monkeypatch): from huggingface_hub import errors as _hub_errors RepositoryNotFoundError = _hub_errors.RepositoryNotFoundError def _fake_model_info(self, repo_id, token=None): - raise RepositoryNotFoundError("Not found") + response = _create_mock_response(status_code=404) + raise RepositoryNotFoundError("Not found", response=response) monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True) success, error = preflight_repo_access("definitely-not-existing-model-12345-xyz") @@ -78,7 +95,8 @@ def test_preflight_integration_in_pull(isolated_cache, monkeypatch): from huggingface_hub import errors as _hub_errors RepositoryNotFoundError = _hub_errors.RepositoryNotFoundError def _fake_model_info(self, repo_id, token=None): - raise RepositoryNotFoundError("Not found") + response = _create_mock_response(status_code=404) + raise RepositoryNotFoundError("Not found", response=response) monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True) # Test with a non-existent model - should fail at preflight stage @@ -145,7 +163,8 @@ def test_preflight_prevents_cache_pollution(isolated_cache, monkeypatch): from huggingface_hub import errors as _hub_errors GatedRepoError = _hub_errors.GatedRepoError def _fake_model_info(self, repo_id, token=None): - raise GatedRepoError("Gated/private repository") + response = _create_mock_response(status_code=403) + raise GatedRepoError("Gated/private 
repository", response=response) monkeypatch.setattr(HfApi, "model_info", _fake_model_info, raising=True) # Attempt to pull a gated/private model diff --git a/tests_2.0/test_server_models_and_errors.py b/tests_2.0/test_server_models_and_errors.py index 5f96191..a5d3733 100644 --- a/tests_2.0/test_server_models_and_errors.py +++ b/tests_2.0/test_server_models_and_errors.py @@ -63,45 +63,57 @@ def test_unknown_model_maps_to_404(): assert resp.status_code == 404 -def test_models_endpoint_filters_non_mlx_and_unhealthy(): - """Ensure /v1/models excludes non-MLX and unhealthy entries.""" +def test_models_endpoint_filters_unhealthy_and_not_runtime_compatible(): + """Ensure /v1/models excludes unhealthy and non-runtime-compatible entries. + + Filter logic: healthy == True AND runtime_compatible == True + Uses shared build_model_object from common.py (single source of truth). + """ client = TestClient(app) with patch('mlxk2.core.server_base.get_current_model_cache') as mock_cache, \ patch('mlxk2.core.cache.cache_dir_to_hf') as mock_cache_to_hf, \ - patch('mlxk2.operations.common.detect_framework') as mock_framework, \ - patch('mlxk2.operations.health.is_model_healthy') as mock_healthy: + patch('mlxk2.operations.common.build_model_object') as mock_build: - # Two cached dirs - d1 = MagicMock(); d1.name = "models--org--mlx" - d2 = MagicMock(); d2.name = "models--org--pt" - mock_cache.return_value.iterdir.return_value = [d1, d2] + # Three cached dirs with proper snapshot structure + d1 = MagicMock(); d1.name = "models--org--healthy-compatible" + d2 = MagicMock(); d2.name = "models--org--unhealthy" + d3 = MagicMock(); d3.name = "models--org--not-compatible" + + # Setup snapshot paths for each model dir + for d in [d1, d2, d3]: + snapshot_dir = MagicMock() + snapshot_path = MagicMock() + snapshot_dir.exists.return_value = True + snapshot_dir.iterdir.return_value = [snapshot_path] + snapshot_path.is_dir.return_value = True + d.__truediv__ = lambda self, x, snap=snapshot_dir, 
spath=snapshot_path: snap if x == "snapshots" else spath + + mock_cache.return_value.iterdir.return_value = [d1, d2, d3] # Map names def map_name(n): - if n == "models--org--mlx": - return "org/mlx" - return "org/pt" - + return n.replace("models--", "").replace("--", "/") mock_cache_to_hf.side_effect = map_name - # Framework detection: d1 is MLX, d2 is not - def detect_fw(model_name, *_args, **_kwargs): - return "MLX" if model_name.endswith("/mlx") else "PyTorch" - - mock_framework.side_effect = detect_fw - - # Health: return False for the MLX one to ensure it is filtered, too - def health(model_name): - return (False, None) if model_name.endswith("/mlx") else (True, None) - - mock_healthy.side_effect = health + # build_model_object returns different health/runtime_compatible + def build(model_name, model_dir, selected_path): + if "unhealthy" in model_name: + return {"health": "unhealthy", "runtime_compatible": True} + elif "not-compatible" in model_name: + return {"health": "healthy", "runtime_compatible": False} + else: + return {"health": "healthy", "runtime_compatible": True} + mock_build.side_effect = build resp = client.get("/v1/models") assert resp.status_code == 200 data = resp.json() - # Both should be filtered: one not MLX, one unhealthy - assert data.get("data") == [] + # Only d1 (healthy + runtime_compatible) should pass + model_ids = [m["id"] for m in data.get("data", [])] + assert "org/healthy-compatible" in model_ids + assert "org/unhealthy" not in model_ids + assert "org/not-compatible" not in model_ids def test_chat_unknown_model_maps_to_404():