Files
mlx-knife/benchmarks/generate_benchmark_report.py
T
The BROKE Cluster Team bf7480d042 Release 2.0.4-beta.9: Audio transcription via mlx-audio
Major Features:
- Audio transcription via mlx-audio backend (Whisper, >10min duration)
- OpenAI /v1/audio/transcriptions endpoint
- Memory Gate System (Vision: 8GB, Audio: 4GB)
- Config-based backend routing (ADR-020)
- Benchmark toolchain (memmon/memplot, Schema v0.2.2)

Key Fixes:
- EuroLLM tokenizer decoding
- Vision-model text-only routing regression
- Multimodal model context length detection
- Memory cleanup bug (mx.metal.clear_cache)
- Orphan process bug

Test Results:
- Unit tests: 647 passed, 11 skipped (Python 3.10-3.12)
- wet-umbrella: 171 passed total

See CHANGELOG.md for complete details and known issues.
2026-02-04 03:10:30 +01:00

918 lines
38 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Generate benchmark analysis report from JSONL test data.
Reads JSONL benchmark reports and generates structured Markdown analysis.
Usage:
# Auto-detect latest JSONL
python benchmarks/generate_benchmark_report.py
# Explicit file
python benchmarks/generate_benchmark_report.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
# With comparison
python benchmarks/generate_benchmark_report.py new.jsonl --compare old.jsonl
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
try:
import jsonschema
except ImportError:
print("Error: jsonschema not installed. Install with: pip install jsonschema")
sys.exit(1)
# Template version: embedded in the generated report header and in the
# auto-generated output filename.
TEMPLATE_VERSION = "1.0"
# Directory scanned for the latest *.jsonl input and used as the default
# destination of the generated Markdown report. Relative path, so it is
# resolved against the current working directory.
REPORTS_DIR = Path("benchmarks/reports")
# JSON schema that benchmark JSONL entries are validated against
# (read by load_schema). Relative path as well.
SCHEMA_PATH = Path("benchmarks/schemas/report-current.schema.json")
def load_schema() -> dict:
    """Read and parse the JSON schema stored at ``SCHEMA_PATH``.

    Returns:
        The parsed schema document.

    When the schema file does not exist, an error message is printed and the
    process exits with status 1.
    """
    if SCHEMA_PATH.exists():
        with open(SCHEMA_PATH) as schema_file:
            return json.load(schema_file)

    # Nothing to validate against: report and abort.
    print(f"❌ Schema not found: {SCHEMA_PATH}")
    sys.exit(1)
def is_memmon_jsonl(data: List[dict]) -> bool:
    """Tell memmon output (memory samples) apart from benchmark results.

    Only the first entry is inspected:

    * memmon entries carry ``ram_free_gb`` and ``elapsed_s``;
    * benchmark entries carry ``schema_version`` or ``outcome``.

    Returns:
        True only when the first entry has both memmon fields and neither
        benchmark field. Empty input is never memmon data.
    """
    if not data:
        return False

    head = data[0]
    # Any benchmark marker disqualifies the file outright.
    if "schema_version" in head or "outcome" in head:
        return False
    return "ram_free_gb" in head and "elapsed_s" in head
def validate_jsonl(data: List[dict], schema: dict, filepath: Path) -> bool:
    """Check every entry of *data* against *schema*.

    memmon files (memory monitoring samples, as detected by
    ``is_memmon_jsonl``) are not validated and always pass.

    Args:
        data: Parsed JSONL entries.
        schema: JSON schema passed to ``jsonschema.validate``.
        filepath: Only used in the printed messages.

    Returns:
        True when all entries validate (or validation was skipped); False
        otherwise, after printing at most the first five error messages.
    """
    if is_memmon_jsonl(data):
        print(f"️ Skipping validation for memmon data: {filepath}")
        return True

    problems = []
    # Numbering is 1-based over the parsed entries.
    for entry_no, record in enumerate(data, start=1):
        try:
            jsonschema.validate(instance=record, schema=schema)
        except jsonschema.ValidationError as exc:
            problems.append(f"Line {entry_no}: {exc.message}")

    if not problems:
        return True

    print(f"❌ Validation failed for {filepath}")
    for problem in problems[:5]:  # Show first 5 errors
        print(f" {problem}")
    if len(problems) > 5:
        print(f" ... and {len(problems) - 5} more errors")
    return False
def load_jsonl(filepath: Path) -> List[dict]:
    """Load a JSONL file into a list of parsed entries.

    Each non-blank line is parsed as one JSON document; blank and
    whitespace-only lines are skipped.

    Args:
        filepath: Path of the JSONL file to read.

    Returns:
        The parsed entries in file order (empty list for an empty file).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    # JSON is UTF-8; decode explicitly instead of relying on the locale's
    # default encoding, which would corrupt or reject non-ASCII content on
    # non-UTF-8 platforms.
    with open(filepath, encoding="utf-8") as f:
        return [json.loads(line) for line in map(str.strip, f) if line]
def find_latest_jsonl() -> Optional[Path]:
    """Return the ``*.jsonl`` file in ``REPORTS_DIR`` with the newest mtime.

    Returns:
        The most recently modified JSONL file, or None when the directory
        does not exist or contains no JSONL files.
    """
    if not REPORTS_DIR.exists():
        return None

    def modified_at(candidate: Path) -> float:
        return candidate.stat().st_mtime

    # max() keeps the first of several equally-new files, exactly like
    # taking element 0 of a stable descending sort.
    return max(REPORTS_DIR.glob("*.jsonl"), key=modified_at, default=None)
def extract_version_from_filename(filepath: Path) -> Optional[str]:
    """Pull the version out of a name such as '2025-12-20-v2.0.4b3.jsonl'.

    The version is the text after the first ``-v`` in the file stem, cut at
    the next ``-``.

    Returns:
        The version string, or None when the stem contains no ``-v`` marker.
    """
    _, marker, remainder = filepath.stem.partition("-v")
    if not marker:
        return None
    return remainder.split("-")[0]
def calculate_statistics(data: List[dict]) -> Dict:
    """Calculate all benchmark statistics from JSONL data.

    Filters out memmon entries (memory samples) if mixed with benchmark data,
    so that they influence neither the test counts nor the quality figures.

    Args:
        data: Parsed JSONL entries (benchmark results, optionally mixed with
            memmon samples).

    Returns:
        A dict with these keys:

        * ``total_tests``, ``passed``, ``passed_with_model``,
          ``passed_infrastructure``, ``skipped``: entry counts.
        * ``total_duration``: summed duration of passed tests.
        * ``schema_version``, ``mlx_knife_version``: taken from the first
          benchmark entry ("unknown" if absent).
        * ``swap``, ``ram``, ``zombies``: min/max(/avg) over all entries that
          carry ``system_health``.
        * ``quality``: clean/degraded counts and ``clean_percent`` relative to
          ``total_tests``.
        * ``hardware``: first ``system.hardware_profile`` found in *data*.
        * ``models``: per-model stats keyed by model id, with per-modality
          breakdown (vision/text/audio/unknown).
        * ``tests``: per-test stats keyed by ``(test_name, modality)``.

    Raises:
        KeyError: If a passed entry lacks ``duration``, or an inference entry
            lacks ``test`` or ``model.id``.
    """
    import statistics

    # Filter out memmon entries (memory monitoring samples)
    benchmark_data = [
        e for e in data
        if not ("ram_free_gb" in e and "elapsed_s" in e and "outcome" not in e)
    ]

    # Separate by outcome
    passed_tests = [e for e in benchmark_data if e.get("outcome") == "passed"]
    skipped_tests = [e for e in benchmark_data if e.get("outcome") == "skipped"]
    passed_with_model = [e for e in passed_tests if "model" in e]
    passed_without_model = [e for e in passed_tests if "model" not in e]

    # System health metrics (optional for backward compatibility with older schemas)
    swap_values = []
    ram_values = []
    zombie_values = []
    clean_count = 0
    degraded_ram = 0
    degraded_zombies = 0
    for e in benchmark_data:
        health = e.get("system_health")
        if health is None:
            continue
        ram_gb = health.get("ram_free_gb", 0)
        zombies = health.get("zombie_processes", 0)
        swap_values.append(health.get("swap_used_mb", 0))
        ram_values.append(ram_gb)
        zombie_values.append(zombies)

        # Recalculate quality flags from raw values (ignore stored flags)
        # Rationale: Thresholds are experimental and OS-specific
        #
        # Session 61 Analysis (Sequoia vs Tahoe):
        # - Sequoia: RAM free varies 10-27 GB, swap=0
        # - Tahoe: RAM free drops to 0-0.1 GB during load, recovers to ~24 GB between tests
        #
        # Steady-State Baseline (DeepHermes post-load relaxation):
        # - Tahoe: ~24 GB free (1.2-1.4 min after first test)
        # - Sequoia: ~40 GB free (similar pattern)
        #
        # Degraded Threshold: ram_free < 5 GB (extreme memory pressure)
        # - Marks 0-0.1 GB minima as degraded
        # - Normal tests (10-20 GB free) stay clean
        ram_is_degraded = ram_gb < 5.0
        has_zombies = zombies > 0
        if ram_is_degraded:
            degraded_ram += 1
        if has_zombies:
            degraded_zombies += 1
        if not (ram_is_degraded or has_zombies):
            clean_count += 1

    # Per-model statistics (with inference modality breakdown)
    # Filter: Only count actual inference tests (duration >= 0.5s)
    # This excludes infrastructure tests like test_vision_model_info_fixture_works
    inference_tests = [e for e in passed_with_model if e["duration"] >= 0.5]

    model_stats = {}
    for entry in inference_tests:
        model = entry["model"]
        model_id = model["id"]
        stats = model_stats.get(model_id)
        if stats is None:
            # size_gb defaults to 0 if missing (e.g., pipe tests)
            stats = model_stats[model_id] = _new_model_stats(model_id, model.get("size_gb", 0))

        duration = entry["duration"]
        stats["count"] += 1
        stats["total_time"] += duration

        # Modality-specific counters; unrecognised values land in "unknown"
        bucket = _modality_bucket(entry.get("metadata", {}).get("inference_modality", "unknown"))
        stats[f"{bucket}_count"] += 1
        stats[f"{bucket}_time"] += duration

        # system_health is optional (backward compatibility)
        health = entry.get("system_health")
        if health is not None:
            ram_gb = health.get("ram_free_gb", 0)
            # Track RAM both for the modality and for the model as a whole.
            for prefix in (f"{bucket}_", ""):
                stats[f"{prefix}ram_min"] = min(stats[f"{prefix}ram_min"], ram_gb)
                stats[f"{prefix}ram_max"] = max(stats[f"{prefix}ram_max"], ram_gb)
            stats["swap_max"] = max(stats["swap_max"], health.get("swap_used_mb", 0))
            stats["zombies_max"] = max(stats["zombies_max"], health.get("zombie_processes", 0))

    # Per-test statistics (use inference_tests to filter infrastructure tests)
    # Group by (test_name, modality) to differentiate Vision/Text phases of same test
    test_stats = {}
    for entry in inference_tests:
        # Test function name without the parametrization suffix ([discovered_XX])
        test_name = entry["test"].split("::")[-1].split("[")[0]
        model_id = entry["model"]["id"]
        modality = entry.get("metadata", {}).get("inference_modality", "unknown")
        test_data = test_stats.setdefault(
            (test_name, modality),
            {"name": test_name, "modality": modality, "models": set(), "runs": []},
        )
        test_data["models"].add(model_id)
        test_data["runs"].append({
            "model": model_id,
            "model_short": model_id.replace("mlx-community/", "").split("-")[0],
            "duration": entry["duration"],
        })

    # Calculate aggregates per test
    for test_data in test_stats.values():
        runs_by_duration = sorted(test_data["runs"], key=lambda r: r["duration"])
        durations = [r["duration"] for r in runs_by_duration]
        test_data["model_count"] = len(test_data["models"])
        test_data["median_time"] = statistics.median(durations) if durations else 0
        test_data["fastest"] = runs_by_duration[0] if runs_by_duration else None
        test_data["slowest"] = runs_by_duration[-1] if runs_by_duration else None
        # Convert set to a sorted list: serializable and deterministic
        test_data["models"] = sorted(test_data["models"])

    # Hardware profile (scan for first entry with data, handles manual JSONL entries)
    hw_profile = {}
    for entry in data:
        if "system" in entry and "hardware_profile" in entry["system"]:
            hw_profile = entry["system"]["hardware_profile"]
            break

    first = benchmark_data[0] if benchmark_data else {}
    return {
        "total_tests": len(benchmark_data),
        "passed": len(passed_tests),
        "passed_with_model": len(passed_with_model),
        "passed_infrastructure": len(passed_without_model),
        "skipped": len(skipped_tests),
        "total_duration": sum(e["duration"] for e in passed_tests),
        "schema_version": first.get("schema_version", "unknown"),
        "mlx_knife_version": first.get("mlx_knife_version", "unknown"),
        "swap": {
            "min": min(swap_values) if swap_values else 0,
            "max": max(swap_values) if swap_values else 0,
            "avg": sum(swap_values) / len(swap_values) if swap_values else 0,
        },
        "ram": {
            "min": min(ram_values) if ram_values else 0,
            "max": max(ram_values) if ram_values else 0,
            "avg": sum(ram_values) / len(ram_values) if ram_values else 0,
        },
        "zombies": {
            "min": min(zombie_values) if zombie_values else 0,
            "max": max(zombie_values) if zombie_values else 0,
        },
        "quality": {
            "clean": clean_count,
            "degraded_ram": degraded_ram,
            "degraded_zombies": degraded_zombies,
            # Same denominator as the "clean/total_tests" ratio shown in the
            # report: memmon samples must not dilute the percentage.
            "clean_percent": 100 * clean_count / len(benchmark_data) if benchmark_data else 0,
        },
        "hardware": hw_profile,
        "models": model_stats,
        "tests": test_stats,
    }


def _modality_bucket(modality) -> str:
    """Map an inference modality to its stats-key prefix ("unknown" if unrecognised)."""
    return modality if modality in ("vision", "text", "audio") else "unknown"


def _new_model_stats(model_id, size_gb) -> dict:
    """Build the zeroed per-model stats record (RAM minima start at +inf as 'no sample')."""
    # Total stats (legacy, always populated)
    record = {"id": model_id, "size_gb": size_gb, "count": 0, "total_time": 0}
    # Per-modality breakdown (v0.2.1, Audio in v0.2.2)
    for bucket in ("vision", "text", "audio", "unknown"):
        record[f"{bucket}_count"] = 0
        record[f"{bucket}_time"] = 0.0
        record[f"{bucket}_ram_min"] = float("inf")
        record[f"{bucket}_ram_max"] = 0
    # System health (global, for backward compat)
    record["ram_min"] = float("inf")
    record["ram_max"] = 0
    record["swap_max"] = 0
    record["zombies_max"] = 0
    return record
def generate_markdown(stats: Dict, input_file: Path, compare_file: Optional[Path] = None, compare_stats: Optional[Dict] = None) -> str:
    """Generate Markdown report from statistics.

    Args:
        stats: Statistics of the primary run, as produced by
            ``calculate_statistics``.
        input_file: Path of the primary JSONL file; the part of its stem
            before ``-v`` is shown as the report date.
        compare_file: Path of the comparison JSONL file (display only).
        compare_stats: Statistics of the comparison run. When given, the
            report gains a comparison summary and Old/Δ/Change columns, and
            models are ordered by relative change (largest regression first)
            instead of by total time.

    Returns:
        The complete report as one Markdown string.
    """
    version = stats["mlx_knife_version"]
    date = input_file.stem.split("-v")[0]  # Extract date from filename
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S UTC")
    hardware = stats["hardware"]
    quality = stats["quality"]
    total_duration = stats["total_duration"]

    # Header. Horizontal rules are always surrounded by blank lines: a "---"
    # directly below a text line would be parsed as a setext heading.
    parts = [
        f"# Benchmark Report v{TEMPLATE_VERSION}: {version}\n\n",
        f"**Date:** {date}\n",
        f"**Generated:** {now}\n",
        f"**Generator:** generate_benchmark_report.py v{TEMPLATE_VERSION}\n",
        f"**Hardware:** {hardware.get('model', 'unknown')}, {hardware.get('cores_physical', '?')} cores\n",
        "\n---\n\n",
        "## Input Files\n\n",
        f"- **Primary:** `{input_file}`\n",
        f"- **Schema:** v{stats['schema_version']}\n",
    ]
    if compare_file:
        parts.append(f"- **Comparison:** `{compare_file}`\n")
    parts.append("\n---\n\n")

    # Executive Summary
    parts += [
        "## Executive Summary\n\n",
        f"**Tests:** {stats['total_tests']} total ({stats['passed']} passed, {stats['skipped']} skipped)\n",
        f"**Duration:** {total_duration:.1f}s ({total_duration/60:.1f} min)\n",
        f"**Quality:** {quality['clean_percent']:.1f}% clean ({quality['clean']}/{stats['total_tests']})\n",
        f"**Models:** {len(stats['models'])} tested\n\n",
    ]

    # Comparison Summary
    compare_models = {}
    if compare_stats:
        compare_models = {m["id"]: m for m in compare_stats["models"].values()}
        old_duration = compare_stats["total_duration"]
        duration_delta = total_duration - old_duration
        duration_pct = (duration_delta / old_duration * 100) if old_duration > 0 else 0

        # Count models by change direction
        slower_count = 0
        faster_count = 0
        for model in stats["models"].values():
            old_model = compare_models.get(model["id"])
            if old_model:
                if model["total_time"] > old_model["total_time"]:
                    slower_count += 1
                elif model["total_time"] < old_model["total_time"]:
                    faster_count += 1
        total_compared = slower_count + faster_count

        # NOTE(review): the "faster" marker is an empty string in this source;
        # confirm whether an icon was intended.
        change_icon = "⚠️" if duration_pct > 3 else "" if duration_pct < -1 else "➡️"
        # compare_stats may be supplied without a file path.
        compare_name = compare_file.name if compare_file else "unknown"
        parts.append("### Comparison\n\n")
        parts.append(f"**vs:** `{compare_name}`\n")
        parts.append(f"**Duration:** {old_duration/60:.1f} min → {total_duration/60:.1f} min ({duration_pct:+.1f}%) {change_icon}\n")
        if total_compared > 0:
            parts.append(f"**Models:** {slower_count}/{total_compared} slower ({100*slower_count/total_compared:.0f}%), {faster_count}/{total_compared} faster ({100*faster_count/total_compared:.0f}%)\n")
        parts.append("\n")

    # Validation Status
    all_clean = quality["clean_percent"] == 100
    # NOTE(review): the "all clean" marker is an empty string in this source;
    # confirm whether an icon was intended.
    quality_icon = "" if all_clean else "⚠️"
    parts.append(f"{quality_icon} **System Health:** ")
    if all_clean:
        parts.append("All tests clean (RAM >5 GB free, 0 zombies)\n")
    else:
        parts.append(f"{quality['degraded_ram']} degraded (RAM <5 GB free), {quality['degraded_zombies']} degraded (zombies)\n")
    parts.append("\n---\n\n")

    # Test Summary
    parts += [
        "## Test Summary\n\n",
        "```\n",
        f"Total tests: {stats['total_tests']}\n",
        f"Passed: {stats['passed']}\n",
        f"With model: {stats['passed_with_model']}\n",
        f"Infrastructure: {stats['passed_infrastructure']}\n",
        f"Skipped: {stats['skipped']}\n",
        f"Duration: {total_duration:.1f}s ({total_duration/60:.1f} min)\n",
        "```\n",
        "\n---\n\n",
    ]

    # System Health
    parts += [
        "## System Health\n\n",
        "```\n",
        f"Swap (MB): min={stats['swap']['min']}, max={stats['swap']['max']}, avg={stats['swap']['avg']:.1f}\n",
        f"RAM free (GB): min={stats['ram']['min']:.1f}, max={stats['ram']['max']:.1f}, avg={stats['ram']['avg']:.1f}\n",
        f"Zombies: min={stats['zombies']['min']}, max={stats['zombies']['max']}\n",
        "Quality Flags (Thresholds: RAM <5 GB free, zombies >0):\n",
        f"Clean: {quality['clean']}/{stats['total_tests']} ({quality['clean_percent']:.1f}%)\n",
        f"Degraded (RAM): {quality['degraded_ram']}\n",
        f"Degraded (zombies): {quality['degraded_zombies']}\n",
        "```\n",
        "\n---\n\n",
    ]

    # Per-Model Statistics
    parts.append("## Per-Model Statistics\n\n")
    if compare_stats:
        # Sort by change percentage (biggest regression first)
        def get_change_pct(model):
            old = compare_models.get(model["id"])
            if old and old["total_time"] > 0:
                return (model["total_time"] - old["total_time"]) / old["total_time"] * 100
            return 0

        sorted_models = sorted(stats["models"].values(), key=get_change_pct, reverse=True)
        parts.append("```\n")
        parts.append(f"{'Model':<40} {'Size':<7} {'Mode':<6} {'Tests':<5} {'Time':<8} {'Old':<8} {'Δ':<8} {'Change':<10} {'RAM (GB)':<12}\n")
        parts.append(f"{'='*40} {'='*7} {'='*6} {'='*5} {'='*8} {'='*8} {'='*8} {'='*10} {'='*12}\n")
    else:
        # Sort models by total time (descending)
        sorted_models = sorted(stats["models"].values(), key=lambda m: m["total_time"], reverse=True)
        parts.append("```\n")
        parts.append(f"{'Model':<50} {'Size':<8} {'Mode':<6} {'Tests':<6} {'Time':<10} {'RAM (GB)':<20}\n")
        parts.append(f"{'='*50} {'='*8} {'='*6} {'='*6} {'='*10} {'='*20}\n")

    max_len = 38 if compare_stats else 48
    for model in sorted_models:
        # Shorten model ID (remove mlx-community/ prefix)
        model_short = model["id"].replace("mlx-community/", "")
        if len(model_short) > max_len:
            model_short = model_short[:max_len-3] + "..."
        size_gb = model["size_gb"]
        old_model = compare_models.get(model["id"])

        # One row per modality that actually ran (no "Mixed" ambiguity)
        rows = []
        for key, label in _REPORT_MODALITIES:
            count = model[f"{key}_count"]
            if not count > 0:
                continue
            ram_range = _format_ram_range(model[f"{key}_ram_min"], model[f"{key}_ram_max"])
            new_time = model[f"{key}_time"]
            if compare_stats:
                has_old = bool(old_model) and old_model.get(f"{key}_count", 0) > 0
                old_time = old_model[f"{key}_time"] if has_old else None
                rows.append(_comparison_row(model_short, size_gb, label, count, new_time, old_time, ram_range))
            else:
                rows.append(_plain_row(model_short, size_gb, label, count, new_time, ram_range))

        # Fallback for legacy data (no modality info)
        if not rows:
            # +inf is the "no system_health sample" sentinel; never print it.
            if model["ram_min"] == float("inf"):
                ram_range = "-"
            else:
                ram_range = f"{model['ram_min']:.1f}-{model['ram_max']:.1f}"
            if compare_stats:
                old_time = old_model["total_time"] if old_model else None
                rows.append(_comparison_row(model_short, size_gb, "-", model["count"], model["total_time"], old_time, ram_range))
            else:
                rows.append(_plain_row(model_short, size_gb, "-", model["count"], model["total_time"], ram_range))
        parts.extend(rows)
    parts.append("```\n\n")

    # Model Categories (with modality differentiation)
    large_models = [m for m in sorted_models if m["size_gb"] >= 20]
    medium_models = [m for m in sorted_models if 10 <= m["size_gb"] < 20]
    small_models = [m for m in sorted_models if m["size_gb"] < 10]

    parts.append("### Model Categories\n\n")
    if large_models or medium_models or small_models:
        parts.append("```\n")
        if large_models:
            parts.append(_format_category_stats(large_models, "LARGE MODELS (≥20 GB)"))
            parts.append("\n")
        if medium_models:
            parts.append(_format_category_stats(medium_models, "MEDIUM MODELS (10-20 GB)"))
            parts.append("\n")
        if small_models:
            parts.append(_format_category_stats(small_models, "SMALL MODELS (<10 GB)"))
        parts.append("```\n")
    parts.append("\n---\n\n")

    # Per-Test Statistics
    parts.append("## Per-Test Statistics\n\n")
    parts.append("Shows performance range across models for each test.\n\n")

    # Sort tests by model count (descending) - most representative tests first
    sorted_tests = sorted(stats["tests"].values(), key=lambda t: t["model_count"], reverse=True)

    # Build comparison lookup for tests (key: (name, modality))
    compare_tests = {}
    if compare_stats:
        compare_tests = {(t["name"], t.get("modality", "unknown")): t for t in compare_stats["tests"].values()}
        parts.append("```\n")
        parts.append(f"{'Test Name':<38} {'Mode':<6} {'Models':<7} {'Fastest':<18} {'Slowest':<18} {'Med':<6} {'Old':<6} {'Δ Med':<8}\n")
        parts.append(f"{'='*38} {'='*6} {'='*7} {'='*18} {'='*18} {'='*6} {'='*6} {'='*8}\n")
    else:
        parts.append("```\n")
        parts.append(f"{'Test Name':<44} {'Mode':<6} {'Models':<7} {'Fastest':<22} {'Slowest':<22} {'Med Time'}\n")
        parts.append(f"{'='*44} {'='*6} {'='*7} {'='*22} {'='*22} {'='*8}\n")

    mode_labels = dict(_REPORT_MODALITIES)
    max_test_len = 36 if compare_stats else 42
    max_model_len = 16 if compare_stats else 20
    for test in sorted_tests:
        # Shorten test name if needed
        test_short = test["name"]
        if len(test_short) > max_test_len:
            test_short = test_short[:max_test_len-3] + "..."

        # Format modality (Vision/Text/Audio/- for unknown)
        modality = test.get("modality", "unknown")
        mode_str = mode_labels.get(modality, "-")

        # Format fastest/slowest; "-" when a test carries no runs
        fastest_str = slowest_str = "-"
        fastest = test["fastest"]
        slowest = test["slowest"]
        if fastest and slowest:
            fastest_str = f"{fastest['model_short']} ({fastest['duration']:.1f}s)"
            slowest_str = f"{slowest['model_short']} ({slowest['duration']:.1f}s)"
            if len(fastest_str) > max_model_len:
                fastest_str = fastest_str[:max_model_len-3] + "..."
            if len(slowest_str) > max_model_len:
                slowest_str = slowest_str[:max_model_len-3] + "..."

        med_time = test["median_time"]
        if compare_stats:
            old_test = compare_tests.get((test["name"], modality))
            if old_test:
                old_med = old_test["median_time"]
                delta_pct = ((med_time - old_med) / old_med * 100) if old_med > 0 else 0
                delta_str = f"{delta_pct:+.1f}%"
                parts.append(f"{test_short:<38} {mode_str:<6} {test['model_count']:<7} {fastest_str:<18} {slowest_str:<18} {med_time:<5.1f}s {old_med:<5.1f}s {delta_str:<8}\n")
            else:
                parts.append(f"{test_short:<38} {mode_str:<6} {test['model_count']:<7} {fastest_str:<18} {slowest_str:<18} {med_time:<5.1f}s {'N/A':<6} {'NEW':<8}\n")
        else:
            parts.append(f"{test_short:<44} {mode_str:<6} {test['model_count']:<7} {fastest_str:<22} {slowest_str:<22} {med_time:.1f}s\n")
    parts.append("```\n\n")

    parts.append("\n---\n\n")
    parts.append("## Files\n\n")
    parts.append(f"- **Benchmark report:** `{input_file}`\n")
    parts.append(f"- **Schema:** `benchmarks/schemas/report-v{stats['schema_version']}.schema.json`\n")
    return "".join(parts)


# (stats-key prefix, column label) of the modalities that get their own rows.
_REPORT_MODALITIES = (("vision", "Vision"), ("text", "Text"), ("audio", "Audio"))


def _format_ram_range(ram_min, ram_max) -> str:
    """Render a free-RAM range: "-" without samples, a single value when min == max."""
    if ram_min == float("inf"):
        return "-"
    if ram_min == ram_max:
        return f"{ram_min:.1f}"
    return f"{ram_min:.1f}-{ram_max:.1f}"


def _change_marker(change_pct) -> str:
    """Return the status marker appended to a per-model change percentage."""
    if change_pct > 5:
        return "⚠️"
    # NOTE(review): the "faster" (< -1%) and "unchanged" markers are both
    # empty strings in this source; confirm whether icons were intended.
    if change_pct < -1:
        return ""
    return ""


def _comparison_row(name, size_gb, mode, count, new_time, old_time, ram_range) -> str:
    """Format one per-model table row in comparison mode (old_time=None means NEW)."""
    head = f"{name:<40} {size_gb:>5.1f}GB {mode:<6} {count:<5} {new_time:>6.1f}s "
    if old_time is None:
        return head + f"{'N/A':<8} {'N/A':<8} {'NEW':<10} {ram_range:<12}\n"
    delta = new_time - old_time
    change_pct = (delta / old_time * 100) if old_time > 0 else 0
    change_str = f"{change_pct:+.1f}% {_change_marker(change_pct)}"
    return head + f"{old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {ram_range:<12}\n"


def _plain_row(name, size_gb, mode, count, time_s, ram_range) -> str:
    """Format one per-model table row when no comparison run is given."""
    return f"{name:<50} {size_gb:>6.1f}GB {mode:<6} {count:<6} {time_s:>8.1f}s {ram_range:<20}\n"


def _format_category_stats(models_list, category_name) -> str:
    """Format size-category statistics with a Vision/Text/Audio breakdown."""
    if not models_list:
        return ""

    output = f"{category_name}: {len(models_list)} models\n"
    output += f" Avg size: {sum(m['size_gb'] for m in models_list) / len(models_list):.1f} GB\n"

    any_modality = False
    for key, label in _REPORT_MODALITIES:
        tested = [m for m in models_list if m.get(f"{key}_count", 0) > 0]
        if not tested:
            continue
        any_modality = True
        avg_time = sum(m[f"{key}_time"] / m[f"{key}_count"] for m in tested) / len(tested)
        # Collect RAM values (filter sentinel values)
        ram_mins = [m[f"{key}_ram_min"] for m in tested if m[f"{key}_ram_min"] != float("inf")]
        ram_maxs = [m[f"{key}_ram_max"] for m in tested if m[f"{key}_ram_max"] > 0]
        output += f" {label} Tests:\n"
        output += f" Models tested: {len(tested)}\n"
        output += f" Avg test time: {avg_time:.1f}s\n"
        # Only output RAM range if data available
        if ram_mins and ram_maxs:
            output += f" RAM range: {min(ram_mins):.1f}-{max(ram_maxs):.1f} GB\n"

    # Fallback for legacy data (no modality info)
    if not any_modality:
        avg_time = sum(m["total_time"] / m["count"] for m in models_list) / len(models_list)
        output += f" Avg test time: {avg_time:.1f}s\n"
        # Skip models without RAM samples so the +inf sentinel cannot leak
        # into the average.
        sampled = [m["ram_min"] for m in models_list if m["ram_min"] != float("inf")]
        if sampled:
            output += f" Avg min RAM: {sum(sampled) / len(sampled):.1f} GB\n"
    return output
def main():
    """Command-line entry point: load, validate, analyse and write the report.

    Steps:
        1. Pick the input JSONL (explicit argument or the latest file in
           ``benchmarks/reports/``).
        2. Load it and validate it against the current schema.
        3. Optionally do the same for the ``--compare`` file.
        4. Render the Markdown report and write it to ``--output`` or to an
           auto-generated path inside ``REPORTS_DIR``.

    Exits with status 1 when an input file is missing or fails validation.
    """
    parser = argparse.ArgumentParser(
        description="Generate benchmark analysis report from JSONL data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        'input',
        nargs='?',
        type=Path,
        help='JSONL benchmark file (default: latest in benchmarks/reports/)',
    )
    parser.add_argument(
        '--compare',
        type=Path,
        help='Compare with this JSONL file (adds Old/Δ/Change columns)',
    )
    parser.add_argument(
        '--output',
        type=Path,
        help='Output markdown file (default: auto-generated in benchmarks/reports/)',
    )
    args = parser.parse_args()

    # Determine input file
    if args.input:
        input_file = args.input
    else:
        input_file = find_latest_jsonl()
        if not input_file:
            print("❌ No JSONL files found in benchmarks/reports/")
            sys.exit(1)
        print(f"📊 Auto-detected: {input_file}")
    if not input_file.exists():
        print(f"❌ File not found: {input_file}")
        sys.exit(1)

    # Load and validate
    print(f"📋 Loading: {input_file}")
    schema = load_schema()
    data = load_jsonl(input_file)
    print(f"✓ Loaded {len(data)} entries")

    # Validate against schema
    if not validate_jsonl(data, schema, input_file):
        sys.exit(1)
    print("✓ Schema validation passed")

    # Calculate statistics
    stats = calculate_statistics(data)

    # Load and calculate comparison statistics if requested
    compare_stats = None
    if args.compare:
        if not args.compare.exists():
            print(f"❌ Comparison file not found: {args.compare}")
            sys.exit(1)
        print(f"📊 Comparing with: {args.compare}")
        compare_data = load_jsonl(args.compare)
        if not validate_jsonl(compare_data, schema, args.compare):
            sys.exit(1)
        compare_stats = calculate_statistics(compare_data)
        print(f"✓ Loaded {len(compare_data)} comparison entries")

    # Generate report
    markdown = generate_markdown(stats, input_file, args.compare, compare_stats)

    # Determine output file
    if args.output:
        output_file = args.output
    else:
        # Auto-generate: BENCHMARK-v<template version>-<version>-<date>.md
        version = extract_version_from_filename(input_file) or stats["mlx_knife_version"]
        date = input_file.stem.split("-v")[0]  # Extract date portion
        output_file = REPORTS_DIR / f"BENCHMARK-v{TEMPLATE_VERSION}-{version}-{date}.md"

    # Write output. The report contains non-ASCII characters (emoji, Δ, ≥),
    # so write UTF-8 explicitly instead of the locale's default encoding.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"✅ Generated: {output_file}")
    print()
    print("Summary:")
    print(f" Tests: {stats['passed']}/{stats['total_tests']} passed")
    print(f" Duration: {stats['total_duration']/60:.1f} min")
    print(f" Quality: {stats['quality']['clean_percent']:.1f}% clean")
    print(f" Models: {len(stats['models'])}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()