From 05f1c30486bae549d7d9d7ae33c9d349875b70b6 Mon Sep 17 00:00:00 2001
From: The BROKE Cluster Team <broke@gmx.eu>
Date: Mon, 17 Nov 2025 22:54:06 +0100
Subject: [PATCH] Release 2.0.3: Foundation for pipes

Foundation release for Unix pipe integration with stderr separation,
benchmark infrastructure, and reasoning control improvements.

Breaking Changes:
- stdout/stderr separation (Issue #43) - errors to stderr in human mode
- JSON mode unchanged (all output to stdout)

Features:
- Benchmark reporting infrastructure (ADR-013 Phase 0)
- --no-reasoning flag (Issue #40 partial - GPT-OSS/QwQ only)
- Interactive mode reasoning control (review_report.md fixes)

Bug Fixes:
- huggingface-hub 1.x incompatibility (critical dependency fix)
- Streaming parity tests refactored (Portfolio Discovery)

Testing:
- 308 tests passing (Python 3.9-3.13)
- 35 skipped (opt-in live tests)
- 79/91 E2E tests passing with HF_HOME

See CHANGELOG.md for complete details and migration guide.
---
 .claude/agents/code-reviewer.md               |  34 ---
 .gitignore                                    |   5 +-
 CHANGELOG.md                                  |  97 ++++++++
 README.md                                     |  16 +-
 SECURITY.md                                   |   3 +-
 TESTING-DETAILS.md                            |  45 ++--
 benchmarks/README.md                          |  58 +++++
 benchmarks/TESTING.md                         | 155 ++++++++++++
 benchmarks/reports/.gitkeep                   |   0
 benchmarks/reports/README.md                  |  50 ++++
 .../reports/example-2025-11-16-v2.0.2.jsonl   |   4 +
 benchmarks/schemas/MIGRATIONS.md              | 113 +++++++++
 benchmarks/schemas/report-v0.1.schema.json    | 152 ++++++++++++
 benchmarks/validate_reports.py                | 131 ++++++++++
 mlxk2/__init__.py                             |   2 +-
 mlxk2/cli.py                                  | 105 ++++----
 mlxk2/core/runner/__init__.py                 |  11 +-
 mlxk2/core/runner/reasoning_format.py         |  15 ++
 mlxk2/core/runner/stop_tokens.py              |   5 +-
 mlxk2/operations/run.py                       |  64 +++--
 pyproject.toml                                |   2 +-
 tests_2.0/live/conftest.py                    | 225 ++++++++++++++++++
 tests_2.0/live/test_cli_e2e.py                |  20 +-
 tests_2.0/live/test_server_e2e.py             |  30 ++-
 tests_2.0/live/test_streaming_parity.py       | 155 +++++++++---
 tests_2.0/test_cli_push_args.py               |  24 +-
 tests_2.0/test_cli_run_exit_codes.py          |  25 +-
 tests_2.0/test_interactive_mode.py            |  40 ++--
 tests_2.0/test_run_complete.py                |  60 ++++-
 29 files changed, 1425 insertions(+), 221 deletions(-)
 delete mode 100644 .claude/agents/code-reviewer.md
 create mode 100644 benchmarks/README.md
 create mode 100644 benchmarks/TESTING.md
 create mode 100644 benchmarks/reports/.gitkeep
 create mode 100644 benchmarks/reports/README.md
 create mode 100644 benchmarks/reports/example-2025-11-16-v2.0.2.jsonl
 create mode 100644 benchmarks/schemas/MIGRATIONS.md
 create mode 100644 benchmarks/schemas/report-v0.1.schema.json
 create mode 100644 benchmarks/validate_reports.py
diff --git a/.claude/agents/code-reviewer.md b/.claude/agents/code-reviewer.md
deleted file mode 100644
index 13bba63..0000000
--- a/.claude/agents/code-reviewer.md
+++ /dev/null
@@ -1,34 +0,0 @@
----
-name: code-reviewer
-description: Use this agent when you need to review recently written code for quality, best practices, potential bugs, and improvements. This agent should be called after completing a logical chunk of code development, such as implementing a new function, class, or feature. Examples: <example>Context: The user has just implemented a new function and wants it reviewed. user: "I just wrote this function to calculate prime numbers: def is_prime(n): if n < 2: return False; for i in range(2, int(n**0.5) + 1): if n % i == 0: return False; return True" assistant: "Let me use the code-reviewer agent to analyze this implementation for correctness and potential improvements." <commentary>Since the user has written code and wants it reviewed, use the code-reviewer agent to provide detailed feedback on the prime number function.</commentary></example> <example>Context: User has completed a class implementation and wants feedback. user: "Here's my new UserManager class with authentication methods" assistant: "I'll use the code-reviewer agent to review your UserManager class implementation for security best practices and code quality." <commentary>The user has implemented a class and needs review, so use the code-reviewer agent to examine the authentication logic and overall design.</commentary></example>
-tools: MultiEdit, Write, NotebookEdit, Grep, LS, Read
-model: sonnet
-color: blue
----
-
-You are an expert code reviewer with deep knowledge across multiple programming languages, frameworks, and software engineering best practices. Your role is to provide thorough, constructive code reviews that help developers write better, more maintainable, and more secure code.
-
-When reviewing code, you will:
-
-1. **Analyze Code Quality**: Examine the code for readability, maintainability, and adherence to language-specific conventions and best practices. Look for proper naming conventions, appropriate code organization, and clear logic flow.
-
-2. **Identify Potential Issues**: Scan for bugs, logic errors, edge cases that aren't handled, potential security vulnerabilities, performance bottlenecks, and resource management issues (memory leaks, unclosed resources, etc.).
-
-3. **Assess Architecture and Design**: Evaluate whether the code follows solid design principles (SOLID, DRY, KISS), has appropriate separation of concerns, uses suitable design patterns, and maintains good abstraction levels.
-
-4. **Check Error Handling**: Verify that the code properly handles exceptions, validates inputs, provides meaningful error messages, and fails gracefully when appropriate.
-
-5. **Review Testing Considerations**: Identify areas that need testing, suggest test cases for edge conditions, and evaluate whether the code is written in a testable manner.
-
-6. **Provide Specific Recommendations**: Offer concrete, actionable suggestions for improvement with code examples when helpful. Prioritize recommendations by impact and importance.
-
-7. **Consider Context**: Take into account the project's coding standards, technology stack, performance requirements, and any specific constraints mentioned in project documentation (like CLAUDE.md files).
-
-Your review format should include:
-- **Summary**: Brief overall assessment
-- **Strengths**: What the code does well
-- **Issues Found**: Categorized by severity (Critical, Major, Minor)
-- **Recommendations**: Specific improvements with examples
-- **Additional Considerations**: Testing, documentation, or architectural suggestions
-
-Be constructive and educational in your feedback. Explain the 'why' behind your suggestions to help the developer learn. When code is well-written, acknowledge the good practices used. Always maintain a professional, helpful tone that encourages improvement rather than criticism.
diff --git a/.gitignore b/.gitignore
index 7ac97e4..133d64f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,9 +19,12 @@ CLAUDE.md
 TODO_REAL_TESTS.md
 server.log
 install_*.log
-.claude/
 openwebui311/bin/
 .gitignore
 
 # Test artifacts (generated reports)
 *_report.json
+
+# Benchmark reports (ADR-013 Phase 0)
+# These reports ARE tracked in git for historical data
+!benchmarks/reports/*.jsonl
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce9fc49..559cc73 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,102 @@
 # Changelog
 
+## [2.0.3] - 2025-11-17
+
+**Stable Release**: Benchmark infrastructure + Unix stderr fix + reasoning control + dependency hardening.
+
+### Features
+
+- **Benchmark Reporting Infrastructure**:
+  - `--report-output` flag in E2E tests: Writes JSONL benchmark reports with model metadata
+  - `report_benchmark()` fixture: Easy model metadata reporting (family, variant, size_gb, stop_tokens, skip_reason)
+  - Model family detection: `_parse_model_family()` helper extracts family/variant from model IDs
+  - Schema validation: `benchmarks/schemas/report-v0.1.schema.json` + validation script
+  - **Validated**: 17 models with full metadata (Phi-3, Qwen, Llama, Mistral, DeepSeek, etc.)
+  - **Files**: `tests_2.0/live/conftest.py`, `test_cli_e2e.py`, `test_server_e2e.py`
+  - **Documentation**: `benchmarks/README.md`, `benchmarks/TESTING.md`, `benchmarks/schemas/MIGRATIONS.md`
+
+- **Reasoning Model Control** (`--no-reasoning` flag):
+  - CLI toggle to hide reasoning output (Issue #40 Option 1 - partial implementation)
+  - Works in both streaming and batch modes (single-shot + interactive)
+  - Default: Show reasoning (backward compatible)
+  - **Limitations**: Only works with models that auto-generate reasoning tags (GPT-OSS, QwQ-32B via chat templates)
+  - **Not supported**: DeepSeek-R1, Qwen3, and most other reasoning models (require system prompts from Issue #33)
+  - **Issue #40 remains open**: Requires structured API (Option 2) and depends on #33 (System Prompts) for broad model support
+  - **Technical fix** (from `review_report.md`): The flag now correctly propagates through `interactive_chat()` in both streaming and batch modes
+  - **Files**: `mlxk2/cli.py`, `mlxk2/operations/run.py`, `mlxk2/core/runner/__init__.py`, `mlxk2/core/runner/reasoning_format.py`
+
+### BREAKING CHANGES
+
+**Error Output to stderr (Human Mode Only)**
+
+Errors are now printed to stderr instead of stdout in human mode. This follows Unix conventions and enables clean pipe workflows. JSON mode remains unchanged (all output to stdout) for scripting/automation use cases.
+
+**What changed:**
+- **Human mode**: Errors → stderr (was stdout)
+- **JSON mode**: Unchanged - all output to stdout (errors + success, for scripting)
+- Exit codes: Unchanged (0=success, 1=error)
+- **Affected commands**: list, health, show, pull, rm, clone, push, run (all commands)
+
+**Migration:**
+
+```bash
+# Human mode: Capture both stdout and stderr if needed
+OUTPUT=$(mlxk list 2>&1)
+
+# Recommended: Separate success and error streams (human mode)
+if mlxk pull model > output.txt 2> error.log; then
+    echo "Success: $(cat output.txt)"
+else
+    echo "Error: $(cat error.log)"
+fi
+
+# JSON mode: No change needed (all output still on stdout)
+OUTPUT=$(mlxk show model --json)
+echo "$OUTPUT" | jq .status  # Works as before
+```
+
+**Not affected:**
+- Interactive terminal users (stderr visible)
+- JSON mode users (all output on stdout)
+- Exit code checks
+- Pipe workflows (actually **fixed** in human mode)
+
+### Bug Fixes
+
+- **stdout/stderr separation** (Issue #43): Errors now correctly go to stderr (human mode only)
+  - **Central implementation**: `print_result()` helper in `cli.py` for consistent error handling
+  - **Run command**: 5 error print statements changed to `file=sys.stderr` in `operations/run.py:57, 88, 102, 132, 229`
+  - **All other commands**: Unified via `print_result()` (list, health, show, pull, rm, clone, push)
+  - **Human mode errors**: Generic format `command: Error: message` (stderr, consistent across all commands)
+  - **JSON mode errors**: Structured JSON on stdout (unchanged, for scripting/jq workflows)
+  - **Rationale**: JSON is for automation/scripting (not piping), human mode is for interactive + pipes
+  - **Test updates**: 2 interactive mode tests updated to check stderr
+    - `tests_2.0/test_interactive_mode.py`: 2 assertions (template fallback, generation error recovery)
+
+- **huggingface-hub 1.x incompatibility** (Critical dependency fix):
+  - **Problem**: `huggingface-hub>=0.34.0` allowed upgrades to 1.x, breaking `transformers` compatibility
+  - **Impact**: All models showed `healthy*` (integrity OK, but runtime failed)
+  - **Fix**: Pin `huggingface-hub>=0.34.0,<1.0` in dependencies
+  - **File**: `pyproject.toml:30`
+
+### Testing Improvements
+
+- **Streaming parity tests refactored to use portfolio discovery**:
+  - **Problem**: Tests had hardcoded model IDs (e.g., `Llama-3.2-3B-Instruct-4bit`) that may not exist in user's cache
+  - **Impact**: Tests failed with cryptic "mock path" errors when models not downloaded
+  - **Fix**: Tests now use portfolio discovery (`mlxk list --json`) to select 2-3 available small models (<6GB)
+  - **Selection strategy**: Smallest models first, exclude reasoning models (known batch/stream inconsistency, fixed in ADR-010)
+  - **Result**: Tests automatically adapt to available models, no more hardcoded dependencies
+  - **File**: `tests_2.0/live/test_streaming_parity.py`
+  - **Custom hook**: `pytest_generate_tests()` parametrizes tests over discovered models at collection time
+
+### Known Issues
+
+- **Large model downloads (>30 GB) can fail overnight**: Connection resets during multi-hour downloads
+- **External SSD I/O deadlocks with parallel downloads**: `huggingface-cli` default `max_workers=8` causes stalls at 99%. Workaround: `--max-workers 1`
+
+---
+
 ## [2.0.2] - 2025-11-15
 
 **Stable Release**: Test infrastructure hardening, stop token validation with 17 models, and web API improvements.
diff --git a/README.md b/README.md
index 856e1d9..f0b58f7 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@
   <img src="https://github.com/mzau/mlx-knife/raw/main/mlxk-demo.gif" alt="MLX Knife Demo" width="900">
 </p>
 
-**Current Stable Version: 2.0.2**
+**Current Stable Version: 2.0.3**
 
-[![GitHub Release](https://img.shields.io/badge/version-2.0.2-green.svg)](https://github.com/mzau/mlx-knife/releases)
+[![GitHub Release](https://img.shields.io/badge/version-2.0.3-green.svg)](https://github.com/mzau/mlx-knife/releases)
 [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0)
 [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
 [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-green.svg)](https://support.apple.com/en-us/HT211814)
@@ -46,7 +46,7 @@ MLX Knife has been comprehensively tested and verified on:
 pip install mlx-knife
 
 # Verify installation
-mlxk --version  # → mlxk 2.0.2
+mlxk --version  # → mlxk 2.0.3
 ```
 
 ### Development Installation
@@ -60,7 +60,7 @@ cd mlx-knife
 pip install -e ".[dev,test]"
 
 # Verify installation
-mlxk --version  # → mlxk 2.0.2
+mlxk --version  # → mlxk 2.0.3
 
 # Run tests and quality checks (before committing)
 pytest -v
@@ -282,6 +282,12 @@ done
 
 MLX Knife provides rich human-readable output by default (without `--json` flag).
 
+**Error Handling (2.0.3+):** Errors print to stderr for clean pipe workflows:
+```bash
+mlxk show badmodel | grep ...      # Errors don't contaminate stdout
+mlxk pull badmodel > log 2> err    # Capture errors separately
+```
+
 ### Basic Usage
 
 ```bash
@@ -574,6 +580,6 @@ Apache License 2.0 — see `LICENSE` (root) and `mlxk2/NOTICE`.
 
 <p align="center">
   <b>Made with ❤️ by The BROKE team <img src="broke-logo.png" alt="BROKE Logo" width="30" align="middle"></b><br>
-  <i>Version 2.0.2 | November 2025</i><br>
+  <i>Version 2.0.3 | November 2025</i><br>
   <a href="https://github.com/mzau/broke-cluster">🔮 Next: BROKE Cluster for multi-node deployments</a>
 </p>
diff --git a/SECURITY.md b/SECURITY.md
index 9572af2..144797a 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -142,7 +142,8 @@ We provide security updates for these versions:
 
 | Version | Security Support   |
 | ------- | ------------------ |
-| 2.0.2   | :white_check_mark: Current stable |
+| 2.0.3   | :white_check_mark: Current stable |
+| 2.0.2   | :white_check_mark: Supported |
 | 2.0.1   | :white_check_mark: Supported |
 | 2.0.0   | :white_check_mark: Supported |
 | < 2.0.0 | :x: Upgrade recommended |
diff --git a/TESTING-DETAILS.md b/TESTING-DETAILS.md
index fefa8a9..97cac02 100644
--- a/TESTING-DETAILS.md
+++ b/TESTING-DETAILS.md
@@ -4,7 +4,7 @@ This document contains version-specific details, complete file listings, and imp
 
 ## Current Status
 
-✅ **306/306 unit tests passing** (November 2025) — 2.0.2 Stable; 20 skipped (opt-in)
+✅ **308/308 unit tests passing** (November 2025) — 2.0.3 Stable; 35 skipped (opt-in)
 ✅ **73/81 E2E tests passing** (November 2025) — ADR-011 completed; 8 skipped (RAM budget)
 ✅ **Test environment:** macOS 14.x, M2 Max, Python 3.9-3.13
 ✅ **Production verified & reported:** M1, M1 Max, M2 Max in real-world use
@@ -12,14 +12,16 @@ This document contains version-specific details, complete file listings, and imp
 ✅ **Isolated test system** - user cache stays pristine with temp cache isolation
 ✅ **3-category test strategy** - optimized for performance and safety
 
-### Skipped Tests Breakdown (20 total, standard run without HF_HOME)
+### Skipped Tests Breakdown (35 total, standard run without HF_HOME)
+- **20 Live E2E tests** - Server/HTTP/CLI validation with real models (requires `pytest -m live_e2e`, ADR-011)
 - **4 Live Stop Tokens tests** - Stop token validation with real models (requires `pytest -m live_stop_tokens`, ADR-009)
-- **1 Live Run test** - Private/org model detection (requires `pytest -m live_run`, Issue #37)
 - **3 Live Clone tests** - APFS same-volume clone workflow (requires `MLXK2_LIVE_CLONE=1`)
+- **2 Issue #37 tests** - Private/org model detection (requires `pytest -m live_run`, Issue #37)
+- **2 Runtime Compatibility tests** - Reason chain validation (requires specific model types)
 - **1 Live List test** - Tests against user cache (requires HF_HOME with models)
 - **1 Live Push test** - Real HuggingFace push (requires `MLXK2_LIVE_PUSH=1`)
+- **1 Show Portfolio test** - Convenience test to display E2E test models (requires HF_HOME)
 - **7 Issue #27 tests** - Real-model health validation (requires HF_HOME or MLXK2_USER_HF_HOME setup)
-- **3 Additional opt-in tests** - Various live validation scenarios
 
 **Portfolio Discovery** (ADR-009) is implemented in `tests_2.0/test_stop_tokens_live.py`. When `HF_HOME` is set, tests auto-discover all MLX chat models in user cache using `mlxk list --json` (production command). This ensures Issue #32 fix is validated across the full model portfolio. **Current validation:** 17 models discovered, 15 testable (60% RAM budget), 73/81 tests passing, 0 failures. Portfolio includes: Phi-3, DeepSeek-R1, GPT-oss, Llama, Qwen, Mistral, Mixtral families.
 
@@ -134,16 +136,15 @@ HF_HOME=/path/to/cache pytest -m live_e2e -n auto  # ← NEVER DO THIS!
 
 | Python Version | Status | Tests Passing | Skipped |
 |----------------|--------|---------------|---------|
-| 3.9.6 (macOS)  | ✅ Verified | 306/306 | 20 |
-| 3.10.x         | ✅ Verified | 306/306 | 20 |
-| 3.11.x         | ✅ Verified | 306/306 | 20 |
-| 3.12.x         | ✅ Verified | 306/306 | 20 |
-| 3.13.x         | ✅ Verified | 306/306 | 20 |
+| 3.9.6 (macOS)  | ✅ Verified | 308/308 | 35 |
+| 3.10.x         | ✅ Verified | 308/308 | 35 |
+| 3.11.x         | ✅ Verified | 308/308 | 35 |
+| 3.12.x         | ✅ Verified | 308/308 | 35 |
+| 3.13.x         | ✅ Verified | 308/308 | 35 |
 
-**Note:** 20 skipped tests are opt-in (live tests, alpha features). Skipped count may vary by environment:
-- Without `HF_TOKEN`: +1 skip (live push test)
-- Without `MLXK2_ENABLE_ALPHA_FEATURES=1`: +3 skips (alpha feature tests)
-- Without `jsonschema`: +1 skip (spec validation test)
+**Note:** 35 skipped tests are opt-in (live tests, alpha features). Skipped count may vary by environment:
+- Without `HF_HOME`: Standard 35 skipped (live E2E tests use fallback parametrization)
+- With `HF_HOME`: Live E2E tests run with discovered models (20+ additional tests executed)
 
 All versions tested with `isolated_cache` system and MLX stubs for fast execution without model downloads.
 
@@ -667,6 +668,22 @@ mlxk run mlx-community/Phi-3-mini-4k-instruct-4bit "Write one sentence about cat
 
 ### Version History
 
+### 2.0.3 (2025-11-17)
+- ✅ **Test updates for stderr separation:** 4 test files modified to verify errors go to stderr (human mode)
+  - `test_interactive_mode.py`: 2 tests patching stderr for ERROR messages
+  - `test_run_complete.py`: 2 tests validating stderr error handling
+  - `test_cli_run_exit_codes.py`: 3 tests checking stdout/stderr separation in JSON mode
+  - `test_cli_push_args.py`: 2 tests verifying push stdout/stderr returns
+- ✅ **Benchmark reporting infrastructure:** 4 live test files updated with benchmark fixtures
+  - `live/conftest.py`: +225 lines - `report_benchmark()` fixture, `_parse_model_family()` helper
+  - `live/test_cli_e2e.py`: Benchmark metadata reporting (family, variant, stop_tokens)
+  - `live/test_server_e2e.py`: Benchmark metadata + performance (usage) data
+  - `live/test_streaming_parity.py`: Portfolio Discovery refactoring (uses `mlxk list --json`)
+- ✅ **Interactive mode reasoning control:** 2 new tests added (review_report.md)
+  - `test_interactive_mode.py`: 1 test for `hide_reasoning` parameter passing
+  - `test_run_complete.py`: 1 test for `TestRunReasoningControl` class
+- ✅ **Test count:** 308 passed, 35 skipped (+2 tests from review_report.md fixes)
+
 ### 2.0.2 (2025-11-14)
 - ✅ Test infrastructure hardening (TOKENIZERS_PARALLELISM, active polling, gc.collect())
 - ✅ Portfolio Discovery validation complete (73/81 E2E tests, 17 models discovered)
@@ -693,4 +710,4 @@ mlxk run mlx-community/Phi-3-mini-4k-instruct-4bit "Write one sentence about cat
 
 ---
 
-*MLX-Knife 2.0.2 Testing Details*
+*MLX-Knife 2.0.3 Testing Details*
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..f606e2a
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,58 @@
+# MLX Knife Benchmarks
+
+**Status:** Phase 0 - Organic Data Collection
+
+## Architecture
+
+This directory tracks empirical performance and compatibility data from mlx-knife's test suite.
+
+### Phase 0 Goals (2.0.3+)
+
+1. **Collect data organically** from E2E tests
+2. **No perfect schema** - schema evolves with data
+3. **Git-tracked reports** - historical trends
+4. **Foundation for future** - community contributions, public database
+
+### Directory Structure
+
+- `reports/` - JSONL test reports (one file per release)
+- `schemas/` - JSON Schema definitions (versioned)
+
+### Current Schema
+
+**Version:** 0.1.0 (Phase 0 - Minimal)
+
+See `schemas/report-v0.1.schema.json` for details.
+
+**Required fields:**
+- `schema_version`, `timestamp`, `mlx_knife_version`, `test`, `outcome`
+
+**Optional sections:**
+- `model` - Model metadata
+- `performance` - tokens/sec, RAM usage
+- `stop_tokens` - ADR-009 validation data
+- `system` - Platform info
+- `metadata` - Extensible (anything)
+
+### Generating Reports
+
+```bash
+# During E2E tests
+pytest -m live_e2e tests_2.0/live/ \
+  --report-output benchmarks/reports/$(date +%Y-%m-%d)-v$(mlxk --version | cut -d' ' -f2).jsonl
+```
+
+### Schema Evolution
+
+As we collect more data, the schema will evolve:
+- New fields added (backward compatible)
+- Optional → Required (when stable)
+- Breaking changes documented in `schemas/MIGRATIONS.md`
+
+### Future Phases
+
+- **Phase 1 (2.1+):** Schema formalization, validation tooling
+- **Phase 2 (2.2+):** `mlxk report` CLI for manual submissions
+- **Phase 3 (2.3+):** Public database, community contributions
+
+See `docs/ADR/ADR-013-Community-Model-Quality-Database.md` for full roadmap.
diff --git a/benchmarks/TESTING.md b/benchmarks/TESTING.md
new file mode 100644
index 0000000..59e44f2
--- /dev/null
+++ b/benchmarks/TESTING.md
@@ -0,0 +1,155 @@
+# Testing with Benchmark Reports (ADR-013 Phase 0)
+
+This document explains how to generate benchmark reports during E2E tests.
+
+## Generating Reports
+
+### Basic Usage
+
+```bash
+# Run E2E tests with reporting
+pytest -m live_e2e tests_2.0/live/ \
+  --report-output benchmarks/reports/$(date +%Y-%m-%d)-v2.0.3.jsonl
+```
+
+### With Full Environment
+
+```bash
+# Use specific HF cache + generate reports
+HF_HOME=/path/to/huggingface/cache \
+  pytest -m live_e2e tests_2.0/live/ -v \
+  --report-output benchmarks/reports/2025-11-16-v2.0.3.jsonl
+```
+
+## Adding Report Data to Tests
+
+Tests can add structured data to reports using `request.node.user_properties`:
+
+```python
+def test_example(model_info, request):
+    # ... test logic ...
+
+    # Add model info
+    request.node.user_properties.append(("model", {
+        "id": model_info["id"],
+        "size_gb": model_info["ram_needed_gb"],
+        "family": extract_family(model_info["id"]),
+        "variant": extract_variant(model_info["id"])
+    }))
+
+    # Add performance metrics
+    request.node.user_properties.append(("performance", {
+        "tokens_per_sec": measure_tokens_per_sec(response),
+        "ram_peak_mb": get_peak_ram_usage(),
+        "duration_s": response.elapsed
+    }))
+
+    # Add stop token data (ADR-009)
+    request.node.user_properties.append(("stop_tokens", {
+        "configured": model_stop_tokens,
+        "detected": find_stop_tokens_in_response(response),
+        "workaround": get_workaround_name(model_info["id"]),
+        "leaked": check_for_leaked_tokens(response)
+    }))
+
+    # Add system info (optional)
+    request.node.user_properties.append(("system", {
+        "platform": platform.system().lower(),
+        "platform_version": get_os_version(),
+        "python_version": platform.python_version(),
+        "mlx_version": get_mlx_version(),
+        "hardware": get_hardware_model(),
+        "ram_total_gb": get_total_ram_gb()
+    }))
+
+    # Anything else goes to metadata
+    request.node.user_properties.append(("custom_metric", "value"))
+```
+
+## Structured Sections
+
+Reports have predefined structured sections that map to schema fields:
+
+| user_properties key | Maps to report field | Description |
+|---------------------|----------------------|-------------|
+| `model` | `model` object | Model metadata (id, size, family, variant) |
+| `performance` | `performance` object | Performance metrics (tokens/sec, RAM, duration) |
+| `stop_tokens` | `stop_tokens` object | Stop token behavior (ADR-009 validation) |
+| `system` | `system` object | Platform information (OS, Python, MLX, hardware) |
+| _anything else_ | `metadata` object | Extensible catch-all for experiments |
+
+## Schema Validation
+
+```bash
+# Validate reports against schema (requires jsonschema)
+pip install jsonschema
+
+# Validate all reports
+for report in benchmarks/reports/*.jsonl; do
+  echo "Validating $report..."
+  cat "$report" | while IFS= read -r line; do
+    printf '%s\n' "$line" | python3 -c "
+import sys, json
+from jsonschema import validate
+
+with open('benchmarks/schemas/report-v0.1.schema.json') as f:
+    schema = json.load(f)
+
+report = json.load(sys.stdin)
+validate(instance=report, schema=schema)
+print('✓ Valid')
+"
+  done
+done
+```
+
+## Example Report
+
+```json
+{
+  "schema_version": "0.1.0",
+  "timestamp": "2025-11-16T10:30:00Z",
+  "mlx_knife_version": "2.0.3",
+  "test": "tests_2.0/live/test_stop_tokens_live.py::test_stop_tokens[phi-3-mini]",
+  "outcome": "passed",
+  "duration": 12.3,
+  "model": {
+    "id": "mlx-community/phi-3-mini-4k-instruct",
+    "size_gb": 2.8,
+    "family": "phi-3",
+    "variant": "mini-4k-instruct"
+  },
+  "performance": {
+    "tokens_per_sec": 45.2,
+    "ram_peak_mb": 3200,
+    "prompt_tokens": 15,
+    "completion_tokens": 42
+  },
+  "stop_tokens": {
+    "configured": ["<|end|>", "<|endoftext|>"],
+    "detected": ["<|end|>"],
+    "workaround": "phi-3-dual-eos",
+    "leaked": false
+  }
+}
+```
+
+## Analyzing Reports
+
+See `reports/README.md` for analysis examples (jq queries, statistics, trends).
+
+## Best Practices
+
+1. **File Naming:** Use `YYYY-MM-DD-vX.Y.Z.jsonl` format
+2. **Append Only:** Never edit existing reports (historical data)
+3. **Commit Reports:** Reports are git-tracked for trend analysis
+4. **Schema Version:** Always include `schema_version` for evolution tracking
+5. **Optional Data:** Only add what you can measure reliably
+6. **No PII:** Never include personal information in reports
+
+## Future Enhancements (Phase 1+)
+
+- Automatic validation during `pytest --report-output`
+- Performance regression detection
+- Report comparison tools (`mlxk report diff`)
+- Schema migration utilities
diff --git a/benchmarks/reports/.gitkeep b/benchmarks/reports/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/reports/README.md b/benchmarks/reports/README.md
new file mode 100644
index 0000000..27be2e2
--- /dev/null
+++ b/benchmarks/reports/README.md
@@ -0,0 +1,50 @@
+# Test Reports
+
+This directory contains JSONL test reports from mlx-knife's E2E test suite.
+
+## File Naming Convention
+
+```
+YYYY-MM-DD-vX.Y.Z.jsonl
+```
+
+Example: `2025-11-16-v2.0.3.jsonl`
+
+## Format
+
+- **One JSON object per line** (JSONL)
+- **Schema version:** Each report has `schema_version` field
+- **Append-only:** Each release adds a new file; existing files are never edited
+
+## Historical Data
+
+Reports are git-tracked to preserve historical trends:
+- Performance changes over releases
+- Model compatibility evolution
+- Stop token workaround stability
+
+## Analysis Examples
+
+```bash
+# Count tests by outcome
+jq -r '.outcome' benchmarks/reports/*.jsonl | sort | uniq -c
+
+# Average tokens/sec by model family
+jq -r 'select(.performance) | "\(.model.family) \(.performance.tokens_per_sec)"' \
+  benchmarks/reports/*.jsonl | \
+  awk '{sum[$1]+=$2; count[$1]++} END {for (f in sum) print f, sum[f]/count[f]}'
+
+# List models with workarounds
+jq -r 'select((.stop_tokens.workaround // "none") != "none") | "\(.model.id): \(.stop_tokens.workaround)"' \
+  benchmarks/reports/*.jsonl | sort -u
+
+# Performance regression detection
+jq -r 'select(.performance) | "\(.timestamp) \(.model.id) \(.performance.tokens_per_sec)"' \
+  benchmarks/reports/*.jsonl | sort
+```
+
+## Schema Version
+
+Current: **0.1.0** (Phase 0 - Experimental)
+
+See `../schemas/report-v0.1.schema.json` for details.
diff --git a/benchmarks/reports/example-2025-11-16-v2.0.2.jsonl b/benchmarks/reports/example-2025-11-16-v2.0.2.jsonl
new file mode 100644
index 0000000..278cf38
--- /dev/null
+++ b/benchmarks/reports/example-2025-11-16-v2.0.2.jsonl
@@ -0,0 +1,4 @@
+{"schema_version": "0.1.0", "timestamp": "2025-11-16T10:30:00Z", "mlx_knife_version": "2.0.2", "test": "tests_2.0/live/test_stop_tokens_live.py::test_stop_tokens[phi-3-mini]", "outcome": "passed", "duration": 12.3, "model": {"id": "mlx-community/phi-3-mini-4k-instruct", "size_gb": 2.8, "family": "phi-3", "variant": "mini-4k-instruct"}, "performance": {"tokens_per_sec": 45.2, "ram_peak_mb": 3200, "prompt_tokens": 15, "completion_tokens": 42}, "stop_tokens": {"configured": ["<|end|>", "<|endoftext|>"], "detected": ["<|end|>"], "workaround": "phi-3-dual-eos", "leaked": false}}
+{"schema_version": "0.1.0", "timestamp": "2025-11-16T10:32:15Z", "mlx_knife_version": "2.0.2", "test": "tests_2.0/live/test_stop_tokens_live.py::test_stop_tokens[deepseek-r1]", "outcome": "passed", "duration": 18.7, "model": {"id": "mlx-community/DeepSeek-R1-Distill-Qwen-14B", "size_gb": 8.1, "family": "deepseek", "variant": "r1-distill"}, "performance": {"tokens_per_sec": 12.1, "ram_peak_mb": 8100, "prompt_tokens": 20, "completion_tokens": 55}, "stop_tokens": {"configured": ["<|im_end|>"], "detected": ["<|im_end|>"], "workaround": "none", "leaked": false}}
+{"schema_version": "0.1.0", "timestamp": "2025-11-16T10:35:42Z", "mlx_knife_version": "2.0.2", "test": "tests_2.0/live/test_stop_tokens_live.py::test_stop_tokens[llama-70b]", "outcome": "skipped", "duration": 0.1, "metadata": {"skip_reason": "RAM budget exceeded (44GB required)", "ram_budget_gb": 40}}
+{"schema_version": "0.1.0", "timestamp": "2025-11-16T10:40:21Z", "mlx_knife_version": "2.0.2", "test": "tests_2.0/live/test_cli_e2e.py::test_cli_run_text_mode[qwen-2.5]", "outcome": "passed", "duration": 8.5, "model": {"id": "mlx-community/Qwen2.5-7B-Instruct-4bit", "size_gb": 4.2, "family": "qwen", "variant": "2.5-7b"}, "performance": {"tokens_per_sec": 32.7, "ram_peak_mb": 5800}, "system": {"platform": "darwin", "platform_version": "macOS 14.6", "python_version": "3.11.5", "mlx_version": "0.20.0", "hardware": "M2 Max", "ram_total_gb": 64}}
diff --git a/benchmarks/schemas/MIGRATIONS.md b/benchmarks/schemas/MIGRATIONS.md
new file mode 100644
index 0000000..ac8c61f
--- /dev/null
+++ b/benchmarks/schemas/MIGRATIONS.md
@@ -0,0 +1,113 @@
+# Schema Migrations
+
+This document tracks schema evolution for MLX Knife test reports.
+
+## Version History
+
+### 0.1.0 (2025-11-16) - Phase 0 Initial
+
+**Status:** Experimental, evolving organically
+
+**Required fields:**
+- `schema_version`: "0.1.0"
+- `timestamp`: ISO 8601 datetime
+- `mlx_knife_version`: SemVer string
+- `test`: pytest nodeid
+- `outcome`: passed|failed|skipped
+
+**Optional fields:**
+- `duration`: Test duration (seconds)
+- `model`: Model metadata (id, size, family, variant)
+- `performance`: Performance metrics (tokens/sec, peak RAM, inference duration, prompt/completion token counts)
+- `stop_tokens`: Stop token data (configured, detected, workaround, leaked)
+- `system`: Platform info (OS, Python, MLX, hardware)
+- `metadata`: Catch-all for experiments (no constraints)
+
+**Design rationale:**
+- Minimal required fields keep reporting lightweight
+- Optional sections allow gradual data collection improvement
+- `metadata` object enables experimentation without schema changes
+
+**Breaking changes:** N/A (initial version)
+
+**Migration:** N/A
+
+---
+
+## Future Versions (Planned)
+
+### 0.2.0 (TBD - Phase 1, when model field stabilizes)
+
+**Proposed changes:**
+- Make `model.id` required when `outcome == "passed"` (enforce for model tests)
+- Add `model.framework_version` (mlx-lm version for reproducibility)
+- Standardize `stop_tokens.workaround` enum (based on collected data)
+- Add `test_type` enum (stop_tokens, performance, health, etc.)
+
+**Migration:**
+- Scripts will backfill `model.framework_version` from git history
+- `stop_tokens.workaround` will be normalized (free text → enum)
+- Old reports remain valid (historical data preserved)
+
+**Breaking changes:**
+- TBD based on Phase 0 learnings
+
+---
+
+### 1.0.0 (TBD - Phase 2, community-ready)
+
+**Proposed changes:**
+- Stabilize all core fields (no more optional → required migrations)
+- Add `contributor` object (for community submissions)
+- Add digital signatures (for trust/verification)
+- Formal deprecation policy (2-release grace period)
+
+**Migration:**
+- Full validation tooling (`mlxk report validate`)
+- Automatic upgrades for old reports (`mlxk report migrate`)
+
+---
+
+## Schema Evolution Policy
+
+### Phase 0 (current): Experimental
+- Rapid iteration based on collected data
+- Breaking changes allowed (no backward compatibility guarantees)
+- Focus: Learn what data is useful
+
+### Phase 1 (2.1+): Stabilization
+- Core fields stabilize
+- Backward-compatible additions only
+- Deprecation warnings for breaking changes (2 releases ahead)
+
+### Phase 2 (2.2+): Community-ready
+- Strict versioning (SemVer for schemas)
+- Migration scripts for all breaking changes
+- Validation tooling (`mlxk report validate`)
+
+### Phase 3 (2.3+): Production
+- No breaking changes without major version bump
+- Formal governance (review process, audit log)
+- Long-term support (LTS) for stable schema versions
+
+---
+
+## Deprecation Process (Phase 2+)
+
+1. **Announcement:** Deprecation warning in schema, docs, and CLI
+2. **Grace Period:** 2 releases (e.g., 2.2 → 2.3 → 2.4)
+3. **Migration Tools:** `mlxk report migrate` auto-upgrades
+4. **Breaking Change:** New major version (e.g., 2.0.0 → 3.0.0)
+5. **Legacy Support:** Old reports remain queryable (read-only)
+
+---
+
+## Contributing
+
+Schema evolution is driven by **empirical data**:
+1. Collect reports with current schema
+2. Analyze: What fields are useful? What's missing?
+3. Propose changes in GitHub issues (with data evidence)
+4. Iterate in next schema version
+
+**Rule:** Schema follows data, not speculation.
diff --git a/benchmarks/schemas/report-v0.1.schema.json b/benchmarks/schemas/report-v0.1.schema.json
new file mode 100644
index 0000000..423671d
--- /dev/null
+++ b/benchmarks/schemas/report-v0.1.schema.json
@@ -0,0 +1,152 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "MLX Knife Test Report (Phase 0 - Organic)",
+  "description": "Minimal schema for E2E test reports. Designed to evolve organically based on collected data.",
+  "type": "object",
+  "required": ["schema_version", "timestamp", "mlx_knife_version", "test", "outcome"],
+  "properties": {
+    "schema_version": {
+      "type": "string",
+      "const": "0.1.0",
+      "description": "Schema version (SemVer). Allows evolution tracking and backward compatibility."
+    },
+    "timestamp": {
+      "type": "string",
+      "format": "date-time",
+      "description": "ISO 8601 timestamp of test execution (UTC recommended)"
+    },
+    "mlx_knife_version": {
+      "type": "string",
+      "pattern": "^\\d+\\.\\d+\\.\\d+",
+      "description": "mlx-knife version that generated this report (SemVer, e.g., '2.0.3')"
+    },
+    "test": {
+      "type": "string",
+      "description": "Test identifier (pytest nodeid format: path::test_name[params])"
+    },
+    "outcome": {
+      "type": "string",
+      "enum": ["passed", "failed", "skipped"],
+      "description": "Test execution result"
+    },
+    "duration": {
+      "type": "number",
+      "minimum": 0,
+      "description": "Test duration in seconds"
+    },
+    "model": {
+      "type": "object",
+      "description": "Model under test (if applicable). Optional in Phase 0.",
+      "properties": {
+        "id": {
+          "type": "string",
+          "description": "HuggingFace model ID (org/name format)"
+        },
+        "size_gb": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Model size in gigabytes (disk)"
+        },
+        "family": {
+          "type": "string",
+          "description": "Model family (e.g., 'phi-3', 'llama', 'qwen')"
+        },
+        "variant": {
+          "type": "string",
+          "description": "Model variant (e.g., '4k-instruct', 'chat', 'base')"
+        }
+      }
+    },
+    "performance": {
+      "type": "object",
+      "description": "Performance metrics. Optional, evolves based on collected data.",
+      "properties": {
+        "tokens_per_sec": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Generation speed (tokens/second)"
+        },
+        "ram_peak_mb": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Peak RAM usage in megabytes"
+        },
+        "duration_s": {
+          "type": "number",
+          "minimum": 0,
+          "description": "Total inference duration in seconds"
+        },
+        "prompt_tokens": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of prompt tokens"
+        },
+        "completion_tokens": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of generated tokens"
+        }
+      }
+    },
+    "stop_tokens": {
+      "type": "object",
+      "description": "Stop token behavior (ADR-009 validation data). Tracks configured vs. detected tokens.",
+      "properties": {
+        "configured": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "Stop tokens configured for the model"
+        },
+        "detected": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "Stop tokens actually found in response"
+        },
+        "workaround": {
+          "type": "string",
+          "description": "Workaround identifier (e.g., 'phi-3-dual-eos', 'none')"
+        },
+        "leaked": {
+          "type": "boolean",
+          "description": "Whether stop tokens leaked into output (bug indicator)"
+        }
+      }
+    },
+    "system": {
+      "type": "object",
+      "description": "System information. Optional in Phase 0.",
+      "properties": {
+        "platform": {
+          "type": "string",
+          "description": "OS platform (e.g., 'darwin', 'linux')"
+        },
+        "platform_version": {
+          "type": "string",
+          "description": "OS version (e.g., 'macOS 14.6', 'Ubuntu 22.04')"
+        },
+        "python_version": {
+          "type": "string",
+          "description": "Python version (e.g., '3.11.5')"
+        },
+        "mlx_version": {
+          "type": "string",
+          "description": "MLX framework version"
+        },
+        "hardware": {
+          "type": "string",
+          "description": "Hardware identifier (e.g., 'M2 Max', 'M1')"
+        },
+        "ram_total_gb": {
+          "type": "number",
+          "description": "Total system RAM in GB"
+        }
+      }
+    },
+    "metadata": {
+      "type": "object",
+      "description": "Extensible metadata for experimentation. No schema constraints in Phase 0.",
+      "additionalProperties": true
+    }
+  },
+  "additionalProperties": false
+}
diff --git a/benchmarks/validate_reports.py b/benchmarks/validate_reports.py
new file mode 100644
index 0000000..8f13c75
--- /dev/null
+++ b/benchmarks/validate_reports.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""Validate JSONL benchmark reports against schema (ADR-013 Phase 0).
+
+Usage:
+    python benchmarks/validate_reports.py benchmarks/reports/2ndtest.jsonl
+    python benchmarks/validate_reports.py benchmarks/reports/*.jsonl
+"""
+
+import json
+import sys
+from pathlib import Path
+from typing import List, Tuple
+
+try:
+    import jsonschema  # third-party dependency (not part of the standard library)
+except ImportError:
+    print("Error: jsonschema not installed. Install with: pip install jsonschema", file=sys.stderr)
+    sys.exit(1)
+
+
+def load_schema(schema_path: Path) -> dict:
+    """Load and parse the JSON schema at ``schema_path`` (read as UTF-8, independent of locale)."""
+    with open(schema_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def validate_report(report: dict, schema: dict, line_num: int) -> Tuple[bool, str]:
+    """Validate one parsed report against ``schema``.
+
+    Returns:
+        (valid, error_message) tuple; message is "" when valid, else "Line N: <field path>: <reason>"
+    """
+    try:
+        jsonschema.validate(instance=report, schema=schema)
+        return True, ""
+    except jsonschema.ValidationError as e:  # report violates the schema; name the offending field
+        return False, f"Line {line_num}: {'.'.join(map(str, e.absolute_path)) or '<root>'}: {e.message}"
+    except jsonschema.SchemaError as e:  # the schema document itself is invalid
+        return False, f"Line {line_num}: Schema error: {e.message}"
+
+
+def validate_jsonl_file(jsonl_path: Path, schema: dict) -> Tuple[int, int, List[str]]:
+    """Validate every non-blank line of a JSONL file against ``schema``.
+
+    Returns:
+        (total_reports, valid_reports, errors) tuple; errors holds one message per failing line
+    """
+    total = 0
+    valid = 0
+    errors = []
+
+    with open(jsonl_path, "r", encoding="utf-8") as f:  # read as UTF-8 rather than the locale default
+        for line_num, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line:
+                continue  # blank lines are skipped and not counted as reports
+
+            total += 1
+
+            try:
+                report = json.loads(line)
+            except json.JSONDecodeError as e:
+                errors.append(f"Line {line_num}: Invalid JSON: {e}")
+                continue
+
+            is_valid, error_msg = validate_report(report, schema, line_num)
+            if is_valid:
+                valid += 1
+            else:
+                errors.append(error_msg)
+
+    return total, valid, errors
+
+
+def main():
+    """CLI entry point: validate each JSONL file in argv; exit 0 only if every file exists and validates."""
+    if len(sys.argv) < 2:
+        print("Usage: python benchmarks/validate_reports.py <jsonl_file> [<jsonl_file> ...]", file=sys.stderr)
+        sys.exit(1)
+    # Locate the schema next to this script so validation works from any working directory
+    schema_path = Path(__file__).resolve().parent / "schemas" / "report-v0.1.schema.json"
+    if not schema_path.exists():
+        print(f"Error: Schema not found at {schema_path}", file=sys.stderr)
+        sys.exit(1)
+
+    schema = load_schema(schema_path)
+    print(f"📋 Loaded schema: {schema_path}")
+    print()
+
+    # Validate each file
+    all_valid = True
+    total_reports = 0
+    total_valid = 0
+
+    for jsonl_file in sys.argv[1:]:
+        jsonl_path = Path(jsonl_file)
+        if not jsonl_path.exists():
+            print(f"❌ File not found: {jsonl_path}")
+            all_valid = False
+            continue
+
+        print(f"📊 Validating: {jsonl_path}")
+
+        total, valid, errors = validate_jsonl_file(jsonl_path, schema)
+        total_reports += total
+        total_valid += valid
+
+        if errors:
+            all_valid = False
+            print(f"   ❌ {valid}/{total} reports valid")
+            for error in errors:
+                print(f"      {error}")
+        else:
+            print(f"   ✅ {valid}/{total} reports valid")
+
+        print()
+
+    # Summary
+    print("=" * 60)
+    print(f"Total: {total_valid}/{total_reports} reports valid across {len(sys.argv) - 1} file(s)")
+
+    if all_valid:
+        print("✅ All reports passed schema validation!")
+        sys.exit(0)
+    else:
+        print("❌ Some reports failed validation")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mlxk2/__init__.py b/mlxk2/__init__.py
index 3c715bf..0ac704e 100644
--- a/mlxk2/__init__.py
+++ b/mlxk2/__init__.py
@@ -7,4 +7,4 @@ import warnings
 # Issue parity with 1.1.0 (Issue #22)
 warnings.filterwarnings('ignore', message='urllib3 v2 only supports OpenSSL 1.1.1+')
 
-__version__ = "2.0.2"
+__version__ = "2.0.3"
diff --git a/mlxk2/cli.py b/mlxk2/cli.py
index 8fa986a..3ba965b 100644
--- a/mlxk2/cli.py
+++ b/mlxk2/cli.py
@@ -31,6 +31,34 @@ def format_json_output(data: Dict[str, Any]) -> str:
     return json.dumps(data, indent=2)
 
 
+def print_result(result: Dict[str, Any], render_func=None, json_mode=False, **render_kwargs):
+    """Print command result to stdout (JSON, success) or stderr (human errors).
+
+    Args:
+        result: Command result dict with 'status' field
+        render_func: Human-mode rendering function (if json_mode=False)
+        json_mode: If True, output JSON format (always to stdout)
+        **render_kwargs: Additional arguments for render_func
+    """
+    is_error = result.get("status") == "error"
+
+    if json_mode:
+        # JSON mode: Always stdout (for scripting/jq)
+        print(format_json_output(result), file=sys.stdout)
+    elif is_error:
+        # Human-mode error: stderr (for pipes). `or` also covers keys that are present but None.
+        error_info = result.get("error") or {}
+        message = error_info.get("message") or "Unknown error"
+        command = result.get("command") or "command"
+        print(f"{command}: Error: {message}", file=sys.stderr)
+    elif render_func:
+        # Human-mode success: stdout
+        print(render_func(result, **render_kwargs), file=sys.stdout)
+    else:
+        # Fallback: print JSON to stdout
+        print(format_json_output(result), file=sys.stdout)
+
+
 def handle_error(error_type: str, message: str) -> Dict[str, Any]:
     """Format error as JSON response."""
     return {
@@ -55,7 +83,7 @@ class MLXKArgumentParser(argparse.ArgumentParser):
         want_json = "--json" in sys.argv
         if want_json:
             err = handle_error("CommandError", message)
-            print(format_json_output(err))
+            print(format_json_output(err), file=sys.stdout)
             self.exit(2)
         super().error(message)
 
@@ -125,11 +153,9 @@ def main():
     run_parser.add_argument("--repetition-penalty", type=float, default=1.1, help="Repetition penalty (default: 1.1)")
     run_parser.add_argument("--no-stream", action="store_true", help="Disable streaming output")
     run_parser.add_argument("--no-chat-template", action="store_true", help="Disable chat template")
+    run_parser.add_argument("--no-reasoning", action="store_true", help="Hide reasoning output for reasoning models (show only final answer)")
     run_parser.add_argument("--verbose", action="store_true", help="Show detailed output")
     run_parser.add_argument("--json", action="store_true", help="Output in JSON format")
-    # Future features (beta.2)
-    run_parser.add_argument("--system", help="System prompt (future feature)")
-    run_parser.add_argument("--hide-reasoning", action="store_true", help="Hide reasoning output (future feature)")
 
     # Serve command (primary, ollama-compatible)
     serve_parser = subparsers.add_parser("serve", help="Start OpenAI-compatible API server")
@@ -199,36 +225,25 @@ def main():
         # Execute command and render per mode
         if args.command == "list":
             result = list_models(pattern=args.pattern)
-            if args.json:
-                print(format_json_output(result))
-            else:
-                show_health = getattr(args, "show_health", False)
-                show_all = getattr(args, "show_all", False)
-                verbose = getattr(args, "verbose", False)
-                print(render_list(result, show_health=show_health, show_all=show_all, verbose=verbose))
+            show_health = getattr(args, "show_health", False)
+            show_all = getattr(args, "show_all", False)
+            verbose = getattr(args, "verbose", False)
+            print_result(result, render_list, args.json,
+                        show_health=show_health, show_all=show_all, verbose=verbose)
         elif args.command == "health":
             result = health_check_operation(args.model)
-            if args.json:
-                print(format_json_output(result))
-            else:
-                print(render_health(result))
+            print_result(result, render_health, args.json)
         elif args.command == "show":
             result = show_model_operation(args.model, args.files, args.config)
-            if args.json:
-                print(format_json_output(result))
-            else:
-                print(render_show(result))
+            print_result(result, render_show, args.json)
         elif args.command == "pull":
             result = pull_operation(args.model)
-            if args.json:
-                print(format_json_output(result))
-            else:
-                print(render_pull(result))
+            print_result(result, render_pull, args.json)
         elif args.command == "clone":
             # Check if alpha features are enabled (should not reach here if not, but double-check)
             if not os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"):
                 result = handle_error("CommandError", "Clone command requires MLXK2_ENABLE_ALPHA_FEATURES=1")
-                print(format_json_output(result))
+                print_result(result, None, True)  # Always JSON for this error
                 sys.exit(1)
 
             # Handle branch parameter by modifying model spec
@@ -243,16 +258,11 @@ def main():
                 target_dir=args.target_dir,
                 health_check=not getattr(args, "no_health_check", False)
             )
-            if args.json:
-                print(format_json_output(result))
-            else:
-                print(render_clone(result, quiet=getattr(args, "quiet", False)))
+            print_result(result, render_clone, args.json,
+                        quiet=getattr(args, "quiet", False))
         elif args.command == "rm":
             result = rm_operation(args.model, args.force)
-            if args.json:
-                print(format_json_output(result))
-            else:
-                print(render_rm(result))
+            print_result(result, render_rm, args.json)
         elif args.command == "run":
             # Handle run command with proper parameter mapping
             result_text = run_model_enhanced(
@@ -266,8 +276,8 @@ def main():
                 use_chat_template=not getattr(args, "no_chat_template", False),
                 json_output=args.json,
                 verbose=getattr(args, "verbose", False),
-                system_prompt=getattr(args, "system", None),
-                hide_reasoning=getattr(args, "hide_reasoning", False)
+                system_prompt=None,  # Not yet implemented
+                hide_reasoning=getattr(args, "no_reasoning", False)
             )
 
             # Detect errors from run_model_enhanced (returns "Error: ..." string on failure)
@@ -283,8 +293,9 @@ def main():
                         "message": error_message
                     }
                 }
+                # Note: run_model() already printed error to stderr in text mode
                 if args.json:
-                    print(format_json_output(result))
+                    print_result(result, None, True)
                 # Exit code will be 1 (handled by line 369)
             elif args.json and result_text is not None and args.prompt is not None:
                 # Success case: wrap result in standard format (only for single-shot mode)
@@ -343,7 +354,7 @@ def main():
             # Check if alpha features are enabled (should not reach here if not, but double-check)
             if not os.getenv("MLXK2_ENABLE_ALPHA_FEATURES"):
                 result = handle_error("CommandError", "Push command requires MLXK2_ENABLE_ALPHA_FEATURES=1")
-                print(format_json_output(result))
+                print_result(result, None, True)  # Always JSON for this error
                 sys.exit(1)
             result = push_operation(
                 local_dir=args.local_dir,
@@ -357,16 +368,14 @@ def main():
                 # Quiet mode: when emitting JSON without --verbose, suppress hub progress/log noise
                 quiet=(getattr(args, "json", False) and not getattr(args, "verbose", False)),
             )
-            if args.json:
-                print(format_json_output(result))
-            else:
-                from .output.human import render_push
-                print(render_push(result, verbose=getattr(args, "verbose", False)))
+            from .output.human import render_push
+            print_result(result, render_push, args.json,
+                        verbose=getattr(args, "verbose", False))
         elif args.command is None:
             # No command specified - show help or JSON error depending on --json flag
             if args.json:
                 result = handle_error("CommandError", "No command specified")
-                print(format_json_output(result))
+                print(format_json_output(result), file=sys.stdout)
                 sys.exit(1)
             else:
                 parser.print_help()
@@ -375,7 +384,7 @@ def main():
             # Unknown command - show help or JSON error depending on --json flag
             if args.json:
                 result = handle_error("CommandError", f"Unknown command: {args.command}")
-                print(format_json_output(result))
+                print(format_json_output(result), file=sys.stdout)
                 sys.exit(1)
             else:
                 parser.print_help()
@@ -385,8 +394,14 @@ def main():
         sys.exit(0 if result.get("status") == "success" else 1)
             
     except Exception as e:
-        error_result = handle_error("InternalError", str(e))
-        print(format_json_output(error_result))
+        # Check if --json flag was requested
+        want_json = "--json" in sys.argv
+        if want_json:
+            error_result = handle_error("InternalError", str(e))
+            print(format_json_output(error_result), file=sys.stdout)
+        else:
+            # Human-mode error
+            print(f"Error: {e}", file=sys.stderr)
         sys.exit(1)
 
 
diff --git a/mlxk2/core/runner/__init__.py b/mlxk2/core/runner/__init__.py
index 91d71d3..390cb88 100644
--- a/mlxk2/core/runner/__init__.py
+++ b/mlxk2/core/runner/__init__.py
@@ -501,9 +501,10 @@ class MLXRunner:
         repetition_context_size: int = 20,
         use_chat_template: bool = True,
         use_chat_stop_tokens: bool = False,
+        hide_reasoning: bool = False,
     ) -> str:
         """Generate text in batch mode (non-streaming).
-        
+
         Args:
             prompt: Input prompt
             max_tokens: Maximum tokens to generate (None for dynamic)
@@ -513,7 +514,8 @@ class MLXRunner:
             repetition_context_size: Context size for repetition penalty
             use_chat_template: Apply tokenizer's chat template if available
             use_chat_stop_tokens: Include chat turn markers as stop tokens (e.g., "\nHuman:")
-            
+            hide_reasoning: Hide reasoning output for reasoning models (DeepSeek-R1, QwQ, etc.)
+
         Returns:
             Generated text
         """
@@ -661,7 +663,7 @@ class MLXRunner:
                 response = response[:earliest_pos]
 
         # Format reasoning models output
-        response = self._format_reasoning_response(response)
+        response = self._format_reasoning_response(response, hide_reasoning=hide_reasoning)
 
         generation_time = time.time() - start_time
 
@@ -679,7 +681,7 @@ class MLXRunner:
         """Format conversation history into a prompt using chat template."""
         return _format_conversation_helper(self.tokenizer, messages)
 
-    def _format_reasoning_response(self, response: str) -> str:
+    def _format_reasoning_response(self, response: str, hide_reasoning: bool = False) -> str:
         """Format response from reasoning models for better readability."""
         return _format_reasoning_helper(
             response,
@@ -687,4 +689,5 @@ class MLXRunner:
             self._reasoning_start,
             self._reasoning_end,
             self._final_start,
+            hide_reasoning=hide_reasoning,
         )
diff --git a/mlxk2/core/runner/reasoning_format.py b/mlxk2/core/runner/reasoning_format.py
index 04f27ce..c9e023a 100644
--- a/mlxk2/core/runner/reasoning_format.py
+++ b/mlxk2/core/runner/reasoning_format.py
@@ -9,10 +9,19 @@ def format_reasoning_response(
     reasoning_start: Optional[str],
     reasoning_end: Optional[str],
     final_start: Optional[str],
+    hide_reasoning: bool = False,
 ) -> str:
     """Format response for reasoning-style models.
 
     Mirrors MLXRunner._format_reasoning_response behavior without changing semantics.
+
+    Args:
+        response: Raw model output
+        is_reasoning_model: Whether this is a reasoning model
+        reasoning_start: Marker for reasoning section start
+        reasoning_end: Marker for reasoning section end
+        final_start: Marker for final answer section
+        hide_reasoning: If True, only return final answer (skip reasoning section)
     """
     if not is_reasoning_model:
         return response
@@ -26,6 +35,12 @@ def format_reasoning_response(
                     final_parts = after_reasoning.split(final_start, 1)
                     if len(final_parts) > 1:
                         final_answer = final_parts[1].replace('<|channel|>final<|message|>', '', 1)
+
+                        # If hiding reasoning, return only final answer
+                        if hide_reasoning:
+                            return final_answer.strip()
+
+                        # Otherwise, format with reasoning section
                         formatted = []
                         formatted.append("\n**[Reasoning]**\n")
                         formatted.append(reasoning_content.strip())
diff --git a/mlxk2/core/runner/stop_tokens.py b/mlxk2/core/runner/stop_tokens.py
index 4f3f12e..53d370d 100644
--- a/mlxk2/core/runner/stop_tokens.py
+++ b/mlxk2/core/runner/stop_tokens.py
@@ -104,11 +104,12 @@ def extract_stop_tokens(tokenizer: Any, verbose: bool = False) -> StopTokenInfo:
                 # Keep any print semantics consistent with previous behavior
                 pass
 
+    # Chat stop tokens to prevent self-conversations in server mode
+    # Only use full-form tokens to avoid false positives in code/markdown
+    # (Short forms like '\nH:' can match code comments, labels, Q&A format)
     chat_stop_tokens = [
         '\nHuman:', '\nAssistant:', '\nYou:',
         '\n\nHuman:', '\n\nAssistant:', '\n\nYou:',
-        '\nH:', '\nA:', '\nY:',
-        '\n\nH:', '\n\nA:', '\n\nY:',
     ]
 
     # Remove None values and normalize to list[str]
diff --git a/mlxk2/operations/run.py b/mlxk2/operations/run.py
index 7dd4949..c6b4a0e 100644
--- a/mlxk2/operations/run.py
+++ b/mlxk2/operations/run.py
@@ -3,6 +3,7 @@ Run operation for 2.0 implementation.
 Ported from 1.x with 2.0 architecture integration.
 """
 
+import sys
 from typing import Optional
 
 from ..core.runner import MLXRunner
@@ -22,7 +23,8 @@ def run_model(
     repetition_penalty: float = 1.1,
     use_chat_template: bool = True,
     json_output: bool = False,
-    verbose: bool = False
+    verbose: bool = False,
+    hide_reasoning: bool = False
 ) -> Optional[str]:
     """Execute model with prompt - supports both single-shot and interactive modes.
 
@@ -37,6 +39,7 @@ def run_model(
         use_chat_template: Apply tokenizer's chat template if available
         json_output: Return JSON format instead of printing
         verbose: Show detailed output
+        hide_reasoning: Hide reasoning output for reasoning models (DeepSeek-R1, QwQ, etc.)
 
     Returns:
         Generated text on success, "Error: ..." string on failure (both modes)
@@ -51,7 +54,7 @@ def run_model(
             error_msg = f"Ambiguous model specification '{model_spec}'. Could be: {ambiguous}"
             error_result = f"Error: {error_msg}"
             if not json_output:
-                print(error_result)
+                print(error_result, file=sys.stderr)
             return error_result
 
         # Only perform compatibility check if model is actually in cache
@@ -82,7 +85,7 @@ def run_model(
                             error_msg = f"Model '{resolved_name}' is not compatible: {reason}"
                             error_result = f"Error: {error_msg}"
                             if not json_output:
-                                print(error_result)
+                                print(error_result, file=sys.stderr)
                             return error_result
 
     except Exception:
@@ -96,36 +99,38 @@ def run_model(
             # Interactive mode: no prompt provided
             if prompt is None:
                 if json_output:
-                    print("Error: Interactive mode not compatible with JSON output")
+                    print("Error: Interactive mode not compatible with JSON output", file=sys.stderr)
                     return None
                 return interactive_chat(
-                    runner, 
-                    stream=stream, 
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    repetition_penalty=repetition_penalty,
-                    use_chat_template=use_chat_template,
-                    prepare_next_prompt=False
-                )
-            else:
-                # Single-shot mode: prompt provided  
-                return single_shot_generation(
-                    runner, 
-                    prompt, 
+                    runner,
                     stream=stream,
                     max_tokens=max_tokens,
                     temperature=temperature,
                     top_p=top_p,
                     repetition_penalty=repetition_penalty,
                     use_chat_template=use_chat_template,
-                    json_output=json_output
+                    prepare_next_prompt=False,
+                    hide_reasoning=hide_reasoning,
+                )
+            else:
+                # Single-shot mode: prompt provided
+                return single_shot_generation(
+                    runner,
+                    prompt,
+                    stream=stream,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    repetition_penalty=repetition_penalty,
+                    use_chat_template=use_chat_template,
+                    json_output=json_output,
+                    hide_reasoning=hide_reasoning
                 )
                     
     except Exception as e:
         error_result = f"Error: {e}"
         if not json_output:
-            print(error_result)
+            print(error_result, file=sys.stderr)
         return error_result
 
 
@@ -138,6 +143,7 @@ def interactive_chat(
     repetition_penalty: float = 1.1,
     use_chat_template: bool = True,
     prepare_next_prompt: bool = False,
+    hide_reasoning: bool = False,
 ):
     """Interactive conversation mode with history tracking."""
     print("Starting interactive chat. Type 'exit' or 'quit' to end.\n")
@@ -176,6 +182,7 @@ def interactive_chat(
                     repetition_penalty=repetition_penalty,
                     use_chat_template=False,
                     use_chat_stop_tokens=True,
+                    hide_reasoning=hide_reasoning,
                 )
                 try:
                     iterator = runner.generate_streaming(formatted_prompt, **params)
@@ -197,6 +204,7 @@ def interactive_chat(
                     repetition_penalty=repetition_penalty,
                     use_chat_template=False,
                     use_chat_stop_tokens=True,
+                    hide_reasoning=hide_reasoning,
                 )
                 try:
                     response = runner.generate_batch(formatted_prompt, **params)
@@ -222,7 +230,7 @@ def interactive_chat(
             print("\n\nChat interrupted. Goodbye!")
             break
         except Exception as e:
-            print(f"\n[ERROR] {e}")
+            print(f"\n[ERROR] {e}", file=sys.stderr)
             continue
 
 
@@ -235,7 +243,8 @@ def single_shot_generation(
     top_p: float = 0.9,
     repetition_penalty: float = 1.1,
     use_chat_template: bool = True,
-    json_output: bool = False
+    json_output: bool = False,
+    hide_reasoning: bool = False
 ) -> Optional[str]:
     """Single prompt generation."""
     if stream and not json_output:
@@ -248,6 +257,7 @@ def single_shot_generation(
             top_p=top_p,
             repetition_penalty=repetition_penalty,
             use_chat_template=use_chat_template,
+            hide_reasoning=hide_reasoning,
         ):
             print(token, end="", flush=True)
             generated_text += token
@@ -265,6 +275,7 @@ def single_shot_generation(
             top_p=top_p,
             repetition_penalty=repetition_penalty,
             use_chat_template=use_chat_template,
+            hide_reasoning=hide_reasoning,
         )
         
         if json_output:
@@ -313,10 +324,10 @@ def run_model_enhanced(
         Generated text on success, "Error: ..." string on failure (both modes)
     """
     # For now, forward to basic run_model
-    # TODO: Add system_prompt and hide_reasoning support in beta.2
+    # TODO: Add system_prompt support in future version
     if system_prompt:
-        print("Warning: System prompts not yet implemented in beta.1")
-    
+        print("Warning: System prompts not yet implemented")
+
     return run_model(
         model_spec=model_spec,
         prompt=prompt,
@@ -327,5 +338,6 @@ def run_model_enhanced(
         repetition_penalty=repetition_penalty,
         use_chat_template=use_chat_template,
         json_output=json_output,
-        verbose=verbose
+        verbose=verbose,
+        hide_reasoning=hide_reasoning
     )
diff --git a/pyproject.toml b/pyproject.toml
index 872b4cb..0808659 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ classifiers = [
     "License :: OSI Approved :: Apache Software License",
 ]
 dependencies = [
-    "huggingface-hub>=0.34.0",
+    "huggingface-hub>=0.34.0,<1.0",
     "requests>=2.32.0",
     "mlx-lm>=0.28.3",
     "mlx>=0.29.0",
diff --git a/tests_2.0/live/conftest.py b/tests_2.0/live/conftest.py
index d9a3db1..a60bb52 100644
--- a/tests_2.0/live/conftest.py
+++ b/tests_2.0/live/conftest.py
@@ -156,3 +156,228 @@ def model_info(portfolio_models, model_key):
             - description: Human-readable description
     """
     return portfolio_models[model_key]
+
+
+def _parse_model_family(model_id: str) -> tuple[str, str]:
+    """Extract model family and variant from HuggingFace model ID.
+
+    Matching is done on the lower-cased repo name. DeepSeek is tested first because
+    its distilled checkpoints embed "qwen"/"llama" in their names, and Phi is matched
+    on dash-separated tokens so that e.g. "dolphin-..." is not mistaken for Phi.
+    Quantization markers (-4bit, -8bit, -fp16) are removed for every family.
+
+    Examples:
+        "mlx-community/Llama-3.2-3B-Instruct-4bit" → ("llama", "3.2-3b-instruct")
+        "mlx-community/Qwen2.5-7B-Instruct-4bit" → ("qwen", "2.5-7b-instruct")
+        "mlx-community/phi-3-mini-4k-instruct" → ("phi-3", "mini-4k-instruct")
+        "mlx-community/DeepSeek-R1-Distill-Qwen-7B-4bit" → ("deepseek", "r1-distill-qwen-7b")
+
+    Args:
+        model_id: HuggingFace model ID (org/name format)
+
+    Returns:
+        (family, variant) tuple. Returns ("unknown", model_name) if parsing fails.
+        In every case the quantization markers are stripped from the second element.
+    """
+    # Extract model name from org/name
+    model_name = model_id.split("/")[-1].lower()
+
+    def _strip_quant(text: str) -> str:
+        # Remove quantization suffix (-4bit, -8bit, etc.) identically for all families
+        for marker in ("-4bit", "-8bit", "-fp16"):
+            text = text.replace(marker, "")
+        return text
+
+    # DeepSeek must precede llama/qwen: "deepseek-r1-distill-qwen-7b" would
+    # otherwise be reported as family "qwen" with the truncated variant "-7b".
+    if "deepseek" in model_name:
+        variant = model_name.replace("deepseek-", "")
+        return "deepseek", _strip_quant(variant)
+
+    if "llama" in model_name:
+        # Extract variant (everything after "llama-")
+        variant = model_name.split("llama-", 1)[1] if "llama-" in model_name else model_name
+        return "llama", _strip_quant(variant)
+
+    if "qwen" in model_name:
+        # Qwen names attach the version directly ("qwen2.5-..."), so split on "qwen"
+        variant = model_name.split("qwen", 1)[1]
+        return "qwen", _strip_quant(variant)
+
+    # Token match instead of substring: "dolphin" contains "phi" but is not a Phi model
+    if any(token.startswith("phi") for token in model_name.split("-")):
+        # Phi models: phi-3.5, phi-3, phi-2, etc.
+        # Check most specific version first
+        for family in ("phi-3.5", "phi-3", "phi-2"):
+            if family in model_name:
+                prefix = f"{family}-"
+                variant = model_name.split(prefix, 1)[1] if prefix in model_name else "base"
+                return family, _strip_quant(variant)
+        return "phi", _strip_quant(model_name)
+
+    if "mistral" in model_name or "mixtral" in model_name:
+        family = "mistral" if "mistral" in model_name else "mixtral"
+        variant = model_name.replace(f"{family}-", "")
+        return family, _strip_quant(variant)
+
+    # Fallback: unknown family
+    return "unknown", _strip_quant(model_name)
+
+
+@pytest.fixture
+def report_benchmark(request, model_info):
+    """Fixture returning a callable that attaches benchmark data to the current test.
+
+    Part of ADR-013 Phase 0. Entries are appended to ``request.node.user_properties``;
+    the pytest_runtest_makereport hook later serializes them as one JSONL record.
+
+    Usage:
+        def test_something(report_benchmark, model_info):
+            # ... test logic ...
+
+            # Report model info only
+            report_benchmark()
+
+            # Report with performance metrics
+            report_benchmark(performance={
+                "tokens_per_sec": 45.2,
+                "ram_peak_mb": 3200,
+                "prompt_tokens": 15,
+                "completion_tokens": 42
+            })
+
+            # Report with stop token data
+            report_benchmark(stop_tokens={
+                "configured": ["<|end|>"],
+                "detected": ["<|end|>"],
+                "workaround": "none",
+                "leaked": False
+            })
+
+    Arguments accepted by the returned callable:
+        performance: Optional performance metrics dict (skipped when empty/None)
+        stop_tokens: Optional stop token validation data (skipped when empty/None)
+        **extra: Additional key/value pairs, each stored as its own property
+            (the report hook files unrecognized keys under "metadata")
+    """
+    def _record(performance: Dict[str, Any] = None, stop_tokens: Dict[str, Any] = None, **extra):
+        # All entries go onto the test node's user_properties list
+        properties = request.node.user_properties
+
+        # Family/variant are derived from the HuggingFace model id
+        model_id = model_info["id"]
+        family, variant = _parse_model_family(model_id)
+
+        # ram_needed_gb carries a 1.2x overhead factor; dividing it out
+        # gives the on-disk size that is reported as "size_gb"
+        model_section = {
+            "id": model_id,
+            "size_gb": round(model_info["ram_needed_gb"] / 1.2, 2),
+            "family": family,
+            "variant": variant,
+        }
+        properties.append(("model", model_section))
+
+        # Optional sections are only recorded when they carry data
+        optional_sections = (("performance", performance), ("stop_tokens", stop_tokens))
+        for section_name, payload in optional_sections:
+            if payload:
+                properties.append((section_name, payload))
+
+        # Remaining keyword arguments become individual properties
+        properties.extend(extra.items())
+
+    return _record
+
+
+# ============================================================================
+# Benchmark Reporting (ADR-013 Phase 0)
+# ============================================================================
+
+def pytest_addoption(parser):
+    """Register the --report-output CLI option (JSONL benchmark report path)."""
+    option_settings = {
+        "action": "store",
+        "default": None,
+        "metavar": "PATH",
+        "help": "Generate benchmark reports to JSONL file (ADR-013 Phase 0)",
+    }
+    parser.addoption("--report-output", **option_settings)
+
+
+def pytest_configure(config):
+    """Open the --report-output JSONL file for appending (report_file is None otherwise)."""
+    config.report_file = None
+    if report_path := config.getoption("--report-output", default=None):  # no ValueError if unregistered
+        config.report_file = Path(report_path).open("a", encoding="utf-8")
+        print(f"\n📊 Benchmark reporting enabled: {report_path}")
+
+
+def pytest_unconfigure(config):
+    """Release the benchmark report handle, if one was opened, at session shutdown."""
+    if (handle := config.report_file):
+        handle.close()
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    """Generate benchmark report for each test (if --report-output enabled).
+
+    Reports are written as JSONL (one JSON object per line) to allow
+    streaming and easy appending across test runs.
+
+    Schema version: 0.1.0 (Phase 0 - Experimental)
+    See: benchmarks/schemas/report-v0.1.schema.json
+    """
+    import json
+    from datetime import datetime, timezone
+
+    outcome = yield
+    report = outcome.get_result()
+
+    # Only report on test call phase (not setup/teardown)
+    if call.when == "call" and item.config.report_file:
+        try:
+            # Import version here to avoid circular imports
+            from mlxk2 import __version__
+        except ImportError:
+            __version__ = "unknown"
+
+        # Required fields. utcnow() is deprecated (3.12+); tzinfo is dropped so "Z" replaces "+00:00"
+        data = {
+            "schema_version": "0.1.0",
+            "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
+            "mlx_knife_version": __version__,
+            "test": item.nodeid,
+            "outcome": report.outcome,
+        }
+
+        # Add duration if available
+        if hasattr(report, "duration"):
+            data["duration"] = report.duration
+
+        # Add skip reason for skipped tests
+        if report.outcome == "skipped" and hasattr(report, "longrepr"):
+            # Extract skip reason from longrepr tuple
+            if isinstance(report.longrepr, tuple) and len(report.longrepr) >= 3:
+                skip_reason = report.longrepr[2]
+                data.setdefault("metadata", {})["skip_reason"] = skip_reason
+
+        # Extract structured data from user_properties
+        # Tests can add data via: request.node.user_properties.append(("key", value))
+        for key, value in item.user_properties:
+            if key in ("model", "performance", "stop_tokens", "system"):
+                # Structured sections (top-level keys)
+                data[key] = value
+            else:
+                # Everything else goes to metadata
+                data.setdefault("metadata", {})[key] = value
+
+        # Write JSONL (one line per report)
+        try:
+            item.config.report_file.write(json.dumps(data) + "\n")
+            item.config.report_file.flush()
+        except Exception as e:
+            # Don't fail tests if reporting fails
+            print(f"\n⚠️  Benchmark report write failed: {e}")
+
diff --git a/tests_2.0/live/test_cli_e2e.py b/tests_2.0/live/test_cli_e2e.py
index 5763bb9..b844b35 100644
--- a/tests_2.0/live/test_cli_e2e.py
+++ b/tests_2.0/live/test_cli_e2e.py
@@ -70,7 +70,7 @@ class TestRunCommandBasic:
     """
 
     @pytest.mark.live_e2e
-    def test_run_command(self, portfolio_models, model_key):
+    def test_run_command(self, portfolio_models, model_key, report_benchmark):
         """Validate `mlxk run` with model.
 
         Parametrized test (one instance per model in portfolio).
@@ -116,6 +116,14 @@ class TestRunCommandBasic:
 
         print(f"✓ {model_key}: Passed (output: {len(stdout)} chars)")
 
+        # Benchmark reporting (ADR-013 Phase 0)
+        report_benchmark(stop_tokens={
+            "configured": stop_tokens,
+            "detected": found_tokens,
+            "workaround": "none",
+            "leaked": len(found_tokens) > 0
+        })
+
 
 class TestRunCommandJSON:
     """JSON output mode tests.
@@ -125,7 +133,7 @@ class TestRunCommandJSON:
     """
 
     @pytest.mark.live_e2e
-    def test_run_json_output(self, portfolio_models, model_key):
+    def test_run_json_output(self, portfolio_models, model_key, report_benchmark):
         """Validate `mlxk run --json` output format.
 
         Parametrized test (one instance per model in portfolio).
@@ -178,6 +186,14 @@ class TestRunCommandJSON:
 
         print(f"✓ {model_key}: Passed (JSON output: {len(response)} chars)")
 
+        # Benchmark reporting (ADR-013 Phase 0)
+        report_benchmark(stop_tokens={
+            "configured": stop_tokens,
+            "detected": found_tokens,
+            "workaround": "none",
+            "leaked": len(found_tokens) > 0
+        })
+
 
 class TestRunCommandExitCodes:
     """Exit code propagation tests (Issue #38)."""
diff --git a/tests_2.0/live/test_server_e2e.py b/tests_2.0/live/test_server_e2e.py
index 3dab54e..8f9fea3 100644
--- a/tests_2.0/live/test_server_e2e.py
+++ b/tests_2.0/live/test_server_e2e.py
@@ -118,7 +118,7 @@ class TestChatCompletionsBatch:
     """
 
     @pytest.mark.live_e2e
-    def test_chat_completions_batch(self, portfolio_models, model_key):
+    def test_chat_completions_batch(self, portfolio_models, model_key, report_benchmark):
         """Validate non-streaming chat completions.
 
         Parametrized test (one instance per model in portfolio).
@@ -182,6 +182,24 @@ class TestChatCompletionsBatch:
 
             print(f"✓ {model_key}: Passed (output: {len(content)} chars)")
 
+            # Benchmark reporting (ADR-013 Phase 0)
+            # Extract usage statistics if available
+            performance = {}
+            if "usage" in data:
+                usage = data["usage"]
+                performance["prompt_tokens"] = usage.get("prompt_tokens", 0)
+                performance["completion_tokens"] = usage.get("completion_tokens", 0)
+
+            report_benchmark(
+                performance=performance if performance else None,
+                stop_tokens={
+                    "configured": stop_tokens,
+                    "detected": found_tokens,
+                    "workaround": "none",
+                    "leaked": len(found_tokens) > 0
+                }
+            )
+
 
 class TestChatCompletionsStreaming:
     """SSE streaming chat completion tests across portfolio.
@@ -191,7 +209,7 @@ class TestChatCompletionsStreaming:
     """
 
     @pytest.mark.live_e2e
-    def test_chat_completions_streaming(self, portfolio_models, model_key):
+    def test_chat_completions_streaming(self, portfolio_models, model_key, report_benchmark):
         """Validate SSE streaming chat completions.
 
         Parametrized test (one instance per model in portfolio).
@@ -264,6 +282,14 @@ class TestChatCompletionsStreaming:
 
             print(f"✓ {model_key}: Passed (streamed: {len(content)} chars)")
 
+            # Benchmark reporting (ADR-013 Phase 0)
+            report_benchmark(stop_tokens={
+                "configured": stop_tokens,
+                "detected": found_tokens,
+                "workaround": "none",
+                "leaked": len(found_tokens) > 0
+            })
+
 
 class TestCompletionsBatch:
     """Non-streaming text completion tests."""
diff --git a/tests_2.0/live/test_streaming_parity.py b/tests_2.0/live/test_streaming_parity.py
index 47274a6..7aa5dac 100644
--- a/tests_2.0/live/test_streaming_parity.py
+++ b/tests_2.0/live/test_streaming_parity.py
@@ -24,7 +24,7 @@ Requires: HF_HOME set to model cache
 from __future__ import annotations
 
 import pytest
-from typing import Dict, Any
+from typing import Dict, Any, List
 
 try:
     import httpx
@@ -52,39 +52,106 @@ pytestmark = [
 ]
 
 
-# Representative test models for parity validation
-# Uses hardcoded subset (not full portfolio) to keep test time reasonable
-PARITY_TEST_MODELS = {
-    # "mxfp4": Skipped - Reasoning model (gpt-oss) has batch/stream inconsistency
-    #   Batch output: Raw reasoning text
-    #   Stream output: Adds **[Reasoning]** headers via StreamingReasoningParser
-    #   Known issue, will be fixed in ADR-010 implementation
-    "qwen25": {
-        "id": "mlx-community/Qwen2.5-0.5B-Instruct-4bit",
-        "ram_needed_gb": 1.0,
-        "description": "Qwen 2.5 (self-conversation prevention)"
-    },
-    "llama32": {
-        "id": "mlx-community/Llama-3.2-3B-Instruct-4bit",
-        "ram_needed_gb": 4.0,
-        "description": "Llama 3.2 (control baseline)"
+def _select_parity_test_models(portfolio: Dict[str, Dict[str, Any]]) -> List[str]:
+    """Select up to 3 small, non-reasoning models from portfolio for parity testing.
+
+    Strategy:
+    - Only small models (ram_needed_gb <= 6.0) for fast testing
+    - Exclude reasoning models (known batch/stream inconsistency, to be fixed in ADR-010)
+    - Order by ram_needed_gb, smallest first (model family is not considered)
+    - Limit to 3 models max (parity tests are slow)
+
+    Args:
+        portfolio: Model portfolio from portfolio_models fixture
+
+    Returns:
+        List of model_keys to test (empty if no suitable models)
+    """
+    # Filter: keep entries whose RAM estimate is at most 6.0 GB
+    candidates = {
+        key: info for key, info in portfolio.items()
+        if info["ram_needed_gb"] <= 6.0
     }
-}
+
+    if not candidates:
+        return []
+
+    # Exclude reasoning models (known Issue #20 regression - will fix in ADR-010)
+    # Reasoning models have batch/stream inconsistency:
+    # - Batch: raw reasoning text
+    # - Stream: adds **[Reasoning]** headers via StreamingReasoningParser
+    excluded_patterns = ["gpt-oss", "deepseek-r1", "qwq"]
+    candidates = {
+        key: info for key, info in candidates.items()
+        if not any(pattern in info["id"].lower() for pattern in excluded_patterns)
+    }
+
+    if not candidates:
+        return []
+
+    # Sort by RAM (smallest first) and select up to 3
+    sorted_models = sorted(candidates.items(), key=lambda x: x[1]["ram_needed_gb"])
+    selected = [key for key, _ in sorted_models[:3]]
+
+    return selected
+
+
+def pytest_generate_tests(metafunc):
+    """Collection-time hook that parametrizes ``parity_model_key``.
+
+    Tests requesting ``parity_model_key`` get one instance per selected model key
+    (at most three). Placeholder values "_skipped" / "_no_suitable_models" are used
+    when the live_e2e marker is not selected or no model qualifies; the tests are
+    expected to turn those placeholders into skips.
+    """
+    if "parity_model_key" not in metafunc.fixturenames:
+        return
+
+    # Without "live_e2e" in the -m expression, avoid model discovery entirely
+    marker_expr = metafunc.config.getoption("-m") or ""
+    if "live_e2e" not in marker_expr:
+        metafunc.parametrize("parity_model_key", ["_skipped"])
+        return
+
+    # Import portfolio discovery (same as conftest.py)
+    from .test_utils import discover_mlx_models_in_user_cache, TEST_MODELS
+
+    discovered = discover_mlx_models_in_user_cache()
+
+    if discovered:
+        # Build portfolio from discovered models
+        portfolio = {
+            f"discovered_{index:02d}": {
+                "id": entry["model_id"],
+                "ram_needed_gb": entry["ram_needed_gb"],
+                "expected_issue": None,
+                "description": f"Discovered: {entry['model_id']}",
+            }
+            for index, entry in enumerate(discovered)
+        }
+    else:
+        # Fallback to hardcoded test models
+        portfolio = TEST_MODELS
+
+    # Select 2-3 models for parity testing
+    chosen = _select_parity_test_models(portfolio)
+
+    # Dummy key keeps the test collectable so it can skip gracefully
+    metafunc.parametrize("parity_model_key", chosen or ["_no_suitable_models"])
 
 
 class TestRunnerStreamingParity:
     """MLXRunner direct streaming vs. batch parity.
 
-    Tests are parametrized over PARITY_TEST_MODELS (3 models).
+    Tests are parametrized over selected models from portfolio (2-3 models).
     Each test runs independently for clean isolation.
     """
 
     @pytest.mark.live_e2e
-    @pytest.mark.parametrize("parity_model_key", list(PARITY_TEST_MODELS.keys()))
-    def test_runner_streaming_batch_identical(self, _use_real_mlx_modules, parity_model_key):
+    def test_runner_streaming_batch_identical(self, _use_real_mlx_modules, portfolio_models, parity_model_key):
         """Validate MLXRunner streaming and batch produce identical output.
 
-        Parametrized test (one instance per parity test model).
+        Parametrized test (one instance per selected parity model).
 
         Issue #20: Previously, batch output had visible stop tokens while
         streaming did not. This validates the ADR-009 fix at Runner level.
@@ -93,11 +160,18 @@ class TestRunnerStreamingParity:
         """
         from mlxk2.core.runner import MLXRunner
 
-        model_info = PARITY_TEST_MODELS[parity_model_key]
+        # Handle graceful skips
+        if parity_model_key == "_skipped":
+            pytest.skip("Run with -m live_e2e to enable parity tests")
+        if parity_model_key == "_no_suitable_models":
+            pytest.skip("No suitable models for parity testing (<6GB, non-reasoning)")
+
+        # Get model info from portfolio
+        model_info = portfolio_models[parity_model_key]
         model_id = model_info["id"]
 
         # RAM gating
-        should_skip, skip_reason = should_skip_model(parity_model_key, PARITY_TEST_MODELS)
+        should_skip, skip_reason = should_skip_model(parity_model_key, portfolio_models)
         if should_skip:
             pytest.skip(skip_reason)
 
@@ -135,24 +209,30 @@ class TestRunnerStreamingParity:
 class TestServerStreamingParity:
     """Server API streaming vs. batch parity.
 
-    Tests are parametrized over PARITY_TEST_MODELS (3 models).
+    Tests are parametrized over selected models from portfolio (2-3 models).
     Each test runs independently for clean isolation.
     """
 
     @pytest.mark.live_e2e
-    @pytest.mark.parametrize("parity_model_key", list(PARITY_TEST_MODELS.keys()))
-    def test_server_api_streaming_batch_identical(self, parity_model_key):
+    def test_server_api_streaming_batch_identical(self, portfolio_models, parity_model_key):
         """Validate Server API streaming and batch produce identical output.
 
-        Parametrized test (one instance per parity test model).
+        Parametrized test (one instance per selected parity model).
 
         Tests parity at HTTP API level (closest to production usage).
         """
-        model_info = PARITY_TEST_MODELS[parity_model_key]
+        # Handle graceful skips
+        if parity_model_key == "_skipped":
+            pytest.skip("Run with -m live_e2e to enable parity tests")
+        if parity_model_key == "_no_suitable_models":
+            pytest.skip("No suitable models for parity testing (<6GB, non-reasoning)")
+
+        # Get model info from portfolio
+        model_info = portfolio_models[parity_model_key]
         model_id = model_info["id"]
 
         # RAM gating
-        should_skip, skip_reason = should_skip_model(parity_model_key, PARITY_TEST_MODELS)
+        should_skip, skip_reason = should_skip_model(parity_model_key, portfolio_models)
         if should_skip:
             pytest.skip(skip_reason)
 
@@ -206,7 +286,7 @@ class TestCrossInterfaceParity:
     """Parity across different interfaces (Runner vs Server)."""
 
     @pytest.mark.live_e2e
-    def test_runner_vs_server_consistency(self, _use_real_mlx_modules):
+    def test_runner_vs_server_consistency(self, _use_real_mlx_modules, portfolio_models):
         """Validate MLXRunner and Server API produce consistent output.
 
         Tests that direct Runner usage and Server HTTP API yield
@@ -216,13 +296,18 @@ class TestCrossInterfaceParity:
         """
         from mlxk2.core.runner import MLXRunner
 
-        # Use smallest model for faster testing
-        test_model_key = "qwen25"
-        model_info = PARITY_TEST_MODELS[test_model_key]
+        # Select smallest available model for fastest testing
+        selected = _select_parity_test_models(portfolio_models)
+        if not selected:
+            pytest.skip("No suitable models for cross-interface testing (<6GB, non-reasoning)")
+
+        # Use first (smallest) model
+        test_model_key = selected[0]
+        model_info = portfolio_models[test_model_key]
         model_id = model_info["id"]
 
         # RAM check
-        should_skip, skip_reason = should_skip_model(test_model_key, PARITY_TEST_MODELS)
+        should_skip, skip_reason = should_skip_model(test_model_key, portfolio_models)
         if should_skip:
             pytest.skip(skip_reason)
 
diff --git a/tests_2.0/test_cli_push_args.py b/tests_2.0/test_cli_push_args.py
index b45e377..6d80cfd 100644
--- a/tests_2.0/test_cli_push_args.py
+++ b/tests_2.0/test_cli_push_args.py
@@ -25,15 +25,16 @@ def _run_cli(argv: list[str], capsys):
             cli_main()
     finally:
         sys.argv = old_argv
-    out = capsys.readouterr().out
-    return out
+    captured = capsys.readouterr()
+    return captured.out, captured.err
 
 
 def test_cli_push_missing_args_json_error(capsys, monkeypatch):
     # Missing required positional args but with --json should emit JSON error
     monkeypatch.setenv("MLXK2_ENABLE_ALPHA_FEATURES", "1")
-    out = _run_cli(["mlxk2", "push", "--private", "--json"], capsys)
-    data = json.loads(out)
+    stdout, stderr = _run_cli(["mlxk2", "push", "--private", "--json"], capsys)
+    # JSON mode: all output to stdout (for scripting)
+    data = json.loads(stdout)
     assert data["status"] == "error"
     assert data["command"] is None
     assert isinstance(data["error"], dict)
@@ -44,8 +45,9 @@ def test_cli_push_workspace_missing_json_error(tmp_path, monkeypatch, capsys):
     monkeypatch.setenv("MLXK2_ENABLE_ALPHA_FEATURES", "1")
     monkeypatch.setenv("HF_TOKEN", "dummy")
     missing = str(tmp_path / "nope")
-    out = _run_cli(["mlxk2", "push", "--private", missing, "user/repo", "--json"], capsys)
-    data = json.loads(out)
+    stdout, stderr = _run_cli(["mlxk2", "push", "--private", missing, "user/repo", "--json"], capsys)
+    # JSON mode: all output to stdout (for scripting)
+    data = json.loads(stdout)
     assert data["status"] == "error"
     assert data["command"] == "push"
     assert data["error"]["type"] == "workspace_not_found"
@@ -92,8 +94,9 @@ def test_cli_push_no_changes_json_output(tmp_path, monkeypatch, capsys):
 
     _install_fake_hf(monkeypatch, mode="no_changes")
 
-    out = _run_cli(["mlxk2", "push", "--private", str(ws), "user/repo", "--json"], capsys)
-    data = json.loads(out)
+    stdout, stderr = _run_cli(["mlxk2", "push", "--private", str(ws), "user/repo", "--json"], capsys)
+    # Success goes to stdout
+    data = json.loads(stdout)
     assert data["status"] == "success"
     assert data["command"] == "push"
     assert data["data"]["no_changes"] is True
@@ -110,8 +113,9 @@ def test_cli_push_with_changes_json_output(tmp_path, monkeypatch, capsys):
 
     _install_fake_hf(monkeypatch, mode="with_changes")
 
-    out = _run_cli(["mlxk2", "push", "--private", str(ws), "user/repo", "--json"], capsys)
-    data = json.loads(out)
+    stdout, stderr = _run_cli(["mlxk2", "push", "--private", str(ws), "user/repo", "--json"], capsys)
+    # Success goes to stdout
+    data = json.loads(stdout)
     assert data["status"] == "success"
     assert data["command"] == "push"
     assert data["data"]["no_changes"] is False
diff --git a/tests_2.0/test_cli_run_exit_codes.py b/tests_2.0/test_cli_run_exit_codes.py
index 489e469..5367755 100644
--- a/tests_2.0/test_cli_run_exit_codes.py
+++ b/tests_2.0/test_cli_run_exit_codes.py
@@ -71,8 +71,8 @@ class TestRunCommandExitCodes:
                 f"stderr: {stderr}"
             )
 
-            # In text mode, error is printed directly
-            assert "Error:" in stdout, f"Expected error message in stdout, got: {stdout}"
+            # In text mode, error is printed to stderr
+            assert "Error:" in stderr, f"Expected error message in stderr, got: {stderr}"
 
     def test_run_nonexistent_model_json_mode_exit_code(self, capsys):
         """Test that run with invalid model returns non-zero exit code (JSON mode).
@@ -95,10 +95,11 @@ class TestRunCommandExitCodes:
             # Should return non-zero exit code
             assert exit_code == 1, (
                 f"Expected exit code 1 for model error, got {exit_code}\n"
-                f"stdout: {stdout}"
+                f"stdout: {stdout}\n"
+                f"stderr: {stderr}"
             )
 
-            # Parse JSON output
+            # Parse JSON output from stdout (JSON mode always writes to stdout, for scripting)
             data = json.loads(stdout)
 
             # Should have status="error"
@@ -128,9 +129,10 @@ class TestRunCommandExitCodes:
 
             assert exit_code == 1, (
                 f"Expected exit code 1 for ambiguous model, got {exit_code}\n"
-                f"stdout: {stdout}"
+                f"stdout: {stdout}\n"
+                f"stderr: {stderr}"
             )
-            assert "Error:" in stdout and "Ambiguous" in stdout
+            assert "Error:" in stderr and "Ambiguous" in stderr
 
     def test_run_ambiguous_model_json_mode(self, capsys):
         """Test ambiguous model specification returns exit code 1 (JSON mode)."""
@@ -143,6 +145,7 @@ class TestRunCommandExitCodes:
             )
 
             assert exit_code == 1
+            # Parse JSON from stdout (JSON mode always writes to stdout)
             data = json.loads(stdout)
             assert data["status"] == "error"
             assert "ambiguous" in data["error"]["message"].lower()
@@ -187,9 +190,10 @@ class TestRunCommandExitCodes:
 
             assert exit_code == 1, (
                 f"Expected exit code 1 for incompatible model, got {exit_code}\n"
-                f"stdout: {stdout}"
+                f"stdout: {stdout}\n"
+                f"stderr: {stderr}"
             )
-            assert "Error:" in stdout and "compatible" in stdout
+            assert "Error:" in stderr and "compatible" in stderr
 
     def test_run_success_text_mode_exit_code(self, capsys):
         """Test that successful run returns zero exit code (text mode).
@@ -267,8 +271,8 @@ class TestRunCommandExitCodes:
                 f"stdout: {stdout}\n"
                 f"stderr: {stderr}"
             )
-            assert "Error:" in stdout
-            assert "failed" in stdout.lower() or "memory" in stdout.lower()
+            assert "Error:" in stderr
+            assert "failed" in stderr.lower() or "memory" in stderr.lower()
 
     def test_run_runtime_exception_json_mode(self, capsys):
         """Test that runtime exceptions are caught and propagated as errors (JSON mode)."""
@@ -281,6 +285,7 @@ class TestRunCommandExitCodes:
             )
 
             assert exit_code == 1
+            # Parse JSON from stdout (JSON mode always writes to stdout)
             data = json.loads(stdout)
             assert data["status"] == "error"
             assert "failed" in data["error"]["message"].lower() or "memory" in data["error"]["message"].lower()
diff --git a/tests_2.0/test_interactive_mode.py b/tests_2.0/test_interactive_mode.py
index 3b474d1..7779a65 100644
--- a/tests_2.0/test_interactive_mode.py
+++ b/tests_2.0/test_interactive_mode.py
@@ -251,17 +251,18 @@ class TestChatTemplateIntegration:
         """Test behavior when chat template formatting fails"""
         def failing_format(messages):
             raise Exception("Template error")
-        
+
         mock_runner_interactive._format_conversation.side_effect = failing_format
-        
+
         with patch('builtins.input', side_effect=["test", "quit"]):
-            with patch('sys.stdout', new=StringIO()) as fake_out:
+            with patch('sys.stdout', new=StringIO()) as fake_out, \
+                 patch('sys.stderr', new=StringIO()) as fake_err:
                 # Should handle template errors gracefully
                 interactive_chat(mock_runner_interactive)
-        
-        output = fake_out.getvalue()
-        # Should show error but not crash
-        assert "ERROR" in output
+
+        stderr_output = fake_err.getvalue()
+        # Error should be on stderr
+        assert "ERROR" in stderr_output
 
 
 class TestInteractiveParameters:
@@ -277,7 +278,8 @@ class TestInteractiveParameters:
                     max_tokens=100,
                     temperature=0.8,
                     top_p=0.95,
-                    repetition_penalty=1.2
+                    repetition_penalty=1.2,
+                    hide_reasoning=True,
                 )
         
         call_args = mock_runner_interactive.generate_streaming.call_args[1]
@@ -285,6 +287,7 @@ class TestInteractiveParameters:
         assert call_args['temperature'] == 0.8
         assert call_args['top_p'] == 0.95
         assert call_args['repetition_penalty'] == 1.2
+        assert call_args['hide_reasoning'] is True
     
     def test_parameter_passing_batch(self, mock_runner_interactive):
         """Test that parameters are passed to batch generation"""
@@ -296,7 +299,8 @@ class TestInteractiveParameters:
                     max_tokens=200,
                     temperature=0.9,
                     top_p=0.85,
-                    repetition_penalty=1.3
+                    repetition_penalty=1.3,
+                    hide_reasoning=True,
                 )
         
         call_args = mock_runner_interactive.generate_batch.call_args[1]
@@ -304,6 +308,7 @@ class TestInteractiveParameters:
         assert call_args['temperature'] == 0.9
         assert call_args['top_p'] == 0.85
         assert call_args['repetition_penalty'] == 1.3
+        assert call_args['hide_reasoning'] is True
     
     def test_use_chat_template_disabled(self, mock_runner_interactive):
         """Test that use_chat_template is disabled in generation calls"""
@@ -330,15 +335,18 @@ class TestInteractiveErrorHandling:
             RuntimeError("Generation failed"),
             iter(["Success"])
         ]
-        
+
         with patch('builtins.input', side_effect=["first", "second", "quit"]):
-            with patch('sys.stdout', new=StringIO()) as fake_out:
+            with patch('sys.stdout', new=StringIO()) as fake_out, \
+                 patch('sys.stderr', new=StringIO()) as fake_err:
                 interactive_chat(mock_runner_interactive, stream=True)
-        
-        output = fake_out.getvalue()
-        # Should show error for first, success for second
-        assert "ERROR" in output
-        assert "Success" in output
+
+        stdout_output = fake_out.getvalue()
+        stderr_output = fake_err.getvalue()
+        # Error should be on stderr
+        assert "ERROR" in stderr_output
+        # Success should be on stdout
+        assert "Success" in stdout_output
     
     def test_keyboard_interrupt_handling(self, mock_runner_interactive):
         """Test Ctrl-C handling in interactive mode"""
diff --git a/tests_2.0/test_run_complete.py b/tests_2.0/test_run_complete.py
index c029a47..4eacf25 100644
--- a/tests_2.0/test_run_complete.py
+++ b/tests_2.0/test_run_complete.py
@@ -124,15 +124,16 @@ class TestRunBasic:
     
     def test_run_interactive_json_incompatible(self, mock_runner_complete):
         """Interactive mode should not work with JSON output"""
-        with patch('sys.stdout', new=StringIO()) as fake_out:
+        with patch('sys.stdout', new=StringIO()) as fake_out, \
+             patch('sys.stderr', new=StringIO()) as fake_err:
             result = run_model(
                 model_spec="test-model",
                 prompt=None,  # Interactive mode
                 json_output=True
             )
-        
-        output = fake_out.getvalue()
-        assert "not compatible with JSON output" in output
+
+        error_output = fake_err.getvalue()
+        assert "not compatible with JSON output" in error_output
         assert result is None
 
 
@@ -196,18 +197,52 @@ class TestRunParameters:
         
         call_args = mock_runner_complete.generate_streaming.call_args
         assert call_args[1]['use_chat_template'] is True
-        
+
         # Without chat template
         run_model(
             model_spec="test-model",
             prompt="test",
             use_chat_template=False
         )
-        
+
         call_args = mock_runner_complete.generate_streaming.call_args
         assert call_args[1]['use_chat_template'] is False
 
 
+class TestRunReasoningControl:
+    """Check that run_model forwards hide_reasoning (--no-reasoning) to the runner in interactive mode."""
+
+    def test_interactive_streaming_hide_reasoning(self, mock_runner_complete):
+        """With stream=True, hide_reasoning=True must reach generate_streaming as a keyword argument."""
+        with patch('builtins.input', side_effect=["hello", "quit"]):  # scripted user input
+            with patch('sys.stdout', new=StringIO()):  # keep chat output off the real stdout
+                run_model(
+                    model_spec="test-model",
+                    prompt=None,  # no prompt -> interactive mode
+                    stream=True,
+                    json_output=False,
+                    hide_reasoning=True,
+                )
+
+        call_args = mock_runner_complete.generate_streaming.call_args[1]  # kwargs of the last call
+        assert call_args['hide_reasoning'] is True
+
+    def test_interactive_batch_hide_reasoning(self, mock_runner_complete):
+        """With stream=False, hide_reasoning=True must reach generate_batch as a keyword argument."""
+        with patch('builtins.input', side_effect=["hello", "quit"]):  # scripted user input
+            with patch('sys.stdout', new=StringIO()):  # keep chat output off the real stdout
+                run_model(
+                    model_spec="test-model",
+                    prompt=None,  # no prompt -> interactive mode
+                    stream=False,
+                    json_output=False,
+                    hide_reasoning=True,
+                )
+
+        call_args = mock_runner_complete.generate_batch.call_args[1]  # kwargs of the last call
+        assert call_args['hide_reasoning'] is True
+
+
 class TestConversationHistory:
     """Test conversation history tracking in interactive mode."""
     
@@ -299,16 +334,17 @@ class TestErrorHandling:
         """Test handling of model loading failures"""
         with patch('mlxk2.operations.run.MLXRunner') as mock_runner_class:
             mock_runner_class.side_effect = FileNotFoundError("Model not found")
-            
-            with patch('sys.stdout', new=StringIO()) as fake_out:
+
+            with patch('sys.stdout', new=StringIO()) as fake_out, \
+                 patch('sys.stderr', new=StringIO()) as fake_err:
                 result = run_model(
                     model_spec="nonexistent-model",
                     prompt="test",
                     json_output=False
                 )
-            
-            output = fake_out.getvalue()
-            assert "Error:" in output
+
+            error_output = fake_err.getvalue()
+            assert "Error:" in error_output
             # Issue #38: run_model now returns error string in both text and JSON modes
             assert result is not None and result.startswith("Error:")
     
@@ -461,4 +497,4 @@ class TestPreflightCompatibilityCheck:
         # Should be blocked by preflight check
         assert result is not None
         assert "Error:" in result
-        assert "not compatible" in result or "Incompatible" in result
\ No newline at end of file
+        assert "not compatible" in result or "Incompatible" in result