mirror of
https://github.com/cloudstack-llc/mlx-knife.git
synced 2026-07-01 20:44:14 -04:00
bf7480d042
Major Features: - Audio transcription via mlx-audio backend (Whisper, >10min duration) - OpenAI /v1/audio/transcriptions endpoint - Memory Gate System (Vision: 8GB, Audio: 4GB) - Config-based backend routing (ADR-020) - Benchmark toolchain (memmon/memplot, Schema v0.2.2) Key Fixes: - EuroLLM tokenizer decoding - Vision-model text-only routing regression - Multimodal model context length detection - Memory cleanup bug (mx.metal.clear_cache) - Orphan process bug Test Results: - Unit tests: 647 passed, 11 skipped (Python 3.10-3.12) - wet-umbrella: 171 passed total See CHANGELOG.md for complete details and known issues.
918 lines
38 KiB
Python
918 lines
38 KiB
Python
#!/usr/bin/env python3
|
||
"""Generate benchmark analysis report from JSONL test data.
|
||
|
||
Reads JSONL benchmark reports and generates structured Markdown analysis.
|
||
|
||
Usage:
|
||
# Auto-detect latest JSONL
|
||
python benchmarks/generate_benchmark_report.py
|
||
|
||
# Explicit file
|
||
python benchmarks/generate_benchmark_report.py benchmarks/reports/2025-12-20-v2.0.4b3.jsonl
|
||
|
||
# With comparison
|
||
python benchmarks/generate_benchmark_report.py new.jsonl --compare old.jsonl
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import sys
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional
|
||
|
||
try:
|
||
import jsonschema
|
||
except ImportError:
|
||
print("Error: jsonschema not installed. Install with: pip install jsonschema")
|
||
sys.exit(1)
|
||
|
||
|
||
# Template version: stamped into the report title and the "Generator" line.
TEMPLATE_VERSION = "1.0"
# Directory scanned for the newest *.jsonl file when no input file is given.
# Relative path, so it is resolved against the current working directory.
REPORTS_DIR = Path("benchmarks/reports")
# JSON schema that benchmark entries are validated against (relative path).
SCHEMA_PATH = Path("benchmarks/schemas/report-current.schema.json")
|
||
|
||
|
||
def load_schema() -> dict:
    """Load the current benchmark-report JSON schema.

    Reads the file at ``SCHEMA_PATH`` and parses it as JSON. If the file does
    not exist, an error is printed and the process exits with status 1.

    Returns:
        The parsed schema document.
    """
    if not SCHEMA_PATH.exists():
        print(f"❌ Schema not found: {SCHEMA_PATH}")
        sys.exit(1)

    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which differs between platforms.
    with open(SCHEMA_PATH, encoding="utf-8") as f:
        return json.load(f)
|
||
|
||
|
||
def is_memmon_jsonl(data: List[dict]) -> bool:
    """Detect if JSONL is memmon output (memory samples) vs benchmark results.

    memmon JSONL has: ram_free_gb, swap_used_mb, elapsed_s (no schema_version)
    benchmark JSONL has: schema_version, outcome, timestamp

    Only the first entry is inspected.

    Args:
        data: Parsed JSONL entries.

    Returns:
        True when the first entry carries the memmon fields and none of the
        benchmark fields; False otherwise, including for empty input or a
        first entry that is not a JSON object.
    """
    if not data:
        return False

    first_entry = data[0]
    # A JSONL line may legally decode to a non-object (number, null, ...).
    # Membership tests on those raise TypeError, so treat them as "not memmon"
    # and let schema validation report the real problem.
    if not isinstance(first_entry, dict):
        return False

    # Check for memmon-specific fields
    has_memmon_fields = "ram_free_gb" in first_entry and "elapsed_s" in first_entry
    # Check for benchmark-specific fields
    has_benchmark_fields = "schema_version" in first_entry or "outcome" in first_entry

    return has_memmon_fields and not has_benchmark_fields
|
||
|
||
|
||
def validate_jsonl(data: List[dict], schema: dict, filepath: Path) -> bool:
    """Check every entry of a JSONL file against the report schema.

    memmon files (memory monitoring samples) are not schema-validated and are
    accepted as-is.

    Args:
        data: Parsed JSONL entries.
        schema: JSON schema passed to ``jsonschema.validate``.
        filepath: Source file, used only in the printed messages.

    Returns:
        True if validation was skipped or every entry is valid; False if at
        least one entry failed, after printing up to five of the failures.
    """
    if is_memmon_jsonl(data):
        print(f"ℹ️ Skipping validation for memmon data: {filepath}")
        return True

    # The number reported is the 1-based position of the entry in ``data``.
    problems = []
    for position, record in enumerate(data, start=1):
        try:
            jsonschema.validate(instance=record, schema=schema)
        except jsonschema.ValidationError as exc:
            problems.append(f"Line {position}: {exc.message}")

    if not problems:
        return True

    print(f"❌ Validation failed for {filepath}")
    # Keep the console readable: list the first five, summarise the rest.
    for problem in problems[:5]:
        print(f" {problem}")
    hidden = len(problems) - 5
    if hidden > 0:
        print(f" ... and {hidden} more errors")
    return False
|
||
|
||
|
||
def load_jsonl(filepath: Path) -> List[dict]:
    """Load a JSONL file (one JSON document per line).

    Blank and whitespace-only lines are skipped.

    Args:
        filepath: File to read; decoded as UTF-8.

    Returns:
        The decoded entries in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based physical line number.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the locale's default.
    with open(filepath, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, but say where it happened:
                # the decoder itself only knows positions within one line.
                raise json.JSONDecodeError(
                    f"{exc.msg} ({filepath}, line {line_no})", exc.doc, exc.pos
                ) from exc
    return data
|
||
|
||
|
||
def find_latest_jsonl() -> Optional[Path]:
    """Return the most recently modified ``*.jsonl`` file in ``REPORTS_DIR``.

    Returns:
        The path with the largest modification time, or None when the
        directory does not exist or holds no ``*.jsonl`` files.
    """
    if not REPORTS_DIR.exists():
        return None

    def modified_at(path):
        return path.stat().st_mtime

    # Like the first element of a stable descending sort, max() keeps the
    # first candidate among equal modification times.
    return max(REPORTS_DIR.glob("*.jsonl"), key=modified_at, default=None)
|
||
|
||
|
||
def extract_version_from_filename(filepath: Path) -> Optional[str]:
    """Extract version string from filename like '2025-12-20-v2.0.4b3.jsonl'.

    The version is the text after a ``-v`` marker in the file stem, up to the
    next ``-``. When the stem contains several ``-v`` sequences (for example
    ``run-vision-v2.1.0``), the first one followed by a digit is used; if none
    is followed by a digit, the first marker is used.

    Args:
        filepath: Path whose stem is inspected.

    Returns:
        The version string, or None when the stem contains no ``-v``.
    """
    _, *candidates = filepath.stem.split("-v")
    if not candidates:
        return None

    # Prefer a marker that actually starts a version number, so a "-v" that
    # merely begins a word ("-vision") is not mistaken for it.
    for candidate in candidates:
        if candidate[:1].isdigit():
            return candidate.split("-")[0]

    return candidates[0].split("-")[0]
|
||
|
||
|
||
# Modalities that get their own counters; anything else is counted as "unknown".
_STATS_MODALITIES = ("vision", "text", "audio")

# Degraded threshold: less than this much free RAM counts as extreme memory
# pressure. Per the original analysis notes, loaded systems dip to 0-0.1 GB
# free while normal tests keep 10-20 GB free, so 5 GB separates the two.
_STATS_RAM_DEGRADED_BELOW_GB = 5.0


def _new_model_stats(model_id: str, size_gb) -> Dict:
    """Return a zeroed per-model accumulator (totals, per-modality, health)."""
    stats = {
        "id": model_id,
        "size_gb": size_gb,
        # Total stats (legacy, always populated)
        "count": 0,
        "total_time": 0,
    }
    # Per-modality breakdown; ram_min starts at +inf as a "no sample" sentinel.
    for prefix in (*_STATS_MODALITIES, "unknown"):
        stats[f"{prefix}_count"] = 0
        stats[f"{prefix}_time"] = 0.0
        stats[f"{prefix}_ram_min"] = float("inf")
        stats[f"{prefix}_ram_max"] = 0
    # System health across all modalities (backward compatibility)
    stats["ram_min"] = float("inf")
    stats["ram_max"] = 0
    stats["swap_max"] = 0
    stats["zombies_max"] = 0
    return stats


def _health_flags(ram_gb, zombies) -> List[str]:
    """Derive quality flags for one entry from its raw health values."""
    flags = []
    if ram_gb < _STATS_RAM_DEGRADED_BELOW_GB:
        flags.append("degraded_ram")
    if zombies > 0:
        flags.append("degraded_zombies")
    return flags or ["clean"]


def calculate_statistics(data: List[dict]) -> Dict:
    """Calculate all benchmark statistics from JSONL data.

    Filters out memmon entries (memory samples) if mixed with benchmark data.

    Args:
        data: Parsed JSONL entries.

    Returns:
        A dict with the keys ``total_tests``, ``passed``,
        ``passed_with_model``, ``passed_infrastructure``, ``skipped``,
        ``total_duration``, ``schema_version``, ``mlx_knife_version``,
        ``swap``, ``ram``, ``zombies``, ``quality``, ``hardware``,
        ``models`` (keyed by model id) and ``tests`` (keyed by
        ``(test_name, modality)``).
    """
    # Kept as a function-local import, as in the original block.
    import statistics

    # Filter out memmon entries (memory monitoring samples)
    benchmark_data = [
        e for e in data
        if not ("ram_free_gb" in e and "elapsed_s" in e and "outcome" not in e)
    ]

    # Separate by outcome
    passed_tests = [e for e in benchmark_data if e.get("outcome") == "passed"]
    skipped_tests = [e for e in benchmark_data if e.get("outcome") == "skipped"]
    passed_with_model = [e for e in passed_tests if "model" in e]
    passed_without_model = [e for e in passed_tests if "model" not in e]

    # System health metrics (optional for backward compatibility with older
    # schemas). Quality flags are recalculated from the raw values and any
    # stored flags are ignored, because the thresholds are experimental and
    # OS-specific.
    swap_values = []
    ram_values = []
    zombie_values = []
    quality_flags = []
    for e in benchmark_data:
        if "system_health" not in e:
            continue
        health = e["system_health"]
        ram_gb = health.get("ram_free_gb", 0)
        zombies = health.get("zombie_processes", 0)
        swap_values.append(health.get("swap_used_mb", 0))
        ram_values.append(ram_gb)
        zombie_values.append(zombies)
        quality_flags.append(_health_flags(ram_gb, zombies))

    clean_count = sum(1 for flags in quality_flags if flags == ["clean"])
    degraded_ram = sum(1 for flags in quality_flags if "degraded_ram" in flags)
    degraded_zombies = sum(1 for flags in quality_flags if "degraded_zombies" in flags)

    # Only count actual inference tests (duration >= 0.5s). This excludes
    # infrastructure tests like test_vision_model_info_fixture_works.
    inference_tests = [e for e in passed_with_model if e["duration"] >= 0.5]

    # Per-model statistics (with inference modality breakdown)
    model_stats = {}
    for entry in inference_tests:
        model = entry["model"]
        model_id = model["id"]
        stats = model_stats.get(model_id)
        if stats is None:
            # size_gb defaults to 0 if missing (e.g., pipe tests)
            stats = model_stats[model_id] = _new_model_stats(model_id, model.get("size_gb", 0))

        duration = entry["duration"]
        stats["count"] += 1
        stats["total_time"] += duration

        # "unknown" or any other value is counted as unknown (backward compat)
        modality = entry.get("metadata", {}).get("inference_modality", "unknown")
        prefix = modality if modality in _STATS_MODALITIES else "unknown"
        stats[f"{prefix}_count"] += 1
        stats[f"{prefix}_time"] += duration

        # Handle optional system_health (backward compatibility)
        if "system_health" in entry:
            health = entry["system_health"]
            ram_gb = health.get("ram_free_gb", 0)
            stats[f"{prefix}_ram_min"] = min(stats[f"{prefix}_ram_min"], ram_gb)
            stats[f"{prefix}_ram_max"] = max(stats[f"{prefix}_ram_max"], ram_gb)
            stats["ram_min"] = min(stats["ram_min"], ram_gb)
            stats["ram_max"] = max(stats["ram_max"], ram_gb)
            stats["swap_max"] = max(stats["swap_max"], health.get("swap_used_mb", 0))
            stats["zombies_max"] = max(stats["zombies_max"], health.get("zombie_processes", 0))

    # Per-test statistics, grouped by (test_name, modality) to differentiate
    # the Vision/Text phases of the same test.
    test_stats = {}
    for entry in inference_tests:
        # Test function name without the "[...]" parametrization suffix
        test_name = entry["test"].split("::")[-1].split("[")[0]
        model_id = entry["model"]["id"]
        modality = entry.get("metadata", {}).get("inference_modality", "unknown")

        test_data = test_stats.setdefault(
            (test_name, modality),
            {"name": test_name, "modality": modality, "models": set(), "runs": []},
        )
        test_data["models"].add(model_id)
        test_data["runs"].append({
            "model": model_id,
            "model_short": model_id.replace("mlx-community/", "").split("-")[0],
            "duration": entry["duration"],
        })

    # Aggregates per test
    for test_data in test_stats.values():
        runs_by_speed = sorted(test_data["runs"], key=lambda r: r["duration"])
        durations = [r["duration"] for r in runs_by_speed]
        test_data["model_count"] = len(test_data["models"])
        test_data["median_time"] = statistics.median(durations) if durations else 0
        test_data["fastest"] = runs_by_speed[0] if runs_by_speed else None
        test_data["slowest"] = runs_by_speed[-1] if runs_by_speed else None
        # Set -> list for JSON serialization; sorted so the order is stable
        # from run to run instead of following set iteration order.
        test_data["models"] = sorted(test_data["models"])

    # Hardware profile: first entry that carries one (handles manual JSONL entries)
    hw_profile = {}
    for entry in data:
        if "system" in entry and "hardware_profile" in entry["system"]:
            hw_profile = entry["system"]["hardware_profile"]
            break

    first = benchmark_data[0] if benchmark_data else {}
    return {
        "total_tests": len(benchmark_data),
        "passed": len(passed_tests),
        "passed_with_model": len(passed_with_model),
        "passed_infrastructure": len(passed_without_model),
        "skipped": len(skipped_tests),
        "total_duration": sum(e["duration"] for e in passed_tests),
        "schema_version": first.get("schema_version", "unknown"),
        "mlx_knife_version": first.get("mlx_knife_version", "unknown"),
        "swap": {
            "min": min(swap_values) if swap_values else 0,
            "max": max(swap_values) if swap_values else 0,
            "avg": sum(swap_values) / len(swap_values) if swap_values else 0,
        },
        "ram": {
            "min": min(ram_values) if ram_values else 0,
            "max": max(ram_values) if ram_values else 0,
            "avg": sum(ram_values) / len(ram_values) if ram_values else 0,
        },
        "zombies": {
            "min": min(zombie_values) if zombie_values else 0,
            "max": max(zombie_values) if zombie_values else 0,
        },
        "quality": {
            "clean": clean_count,
            "degraded_ram": degraded_ram,
            "degraded_zombies": degraded_zombies,
            # Same denominator as "total_tests": memmon samples were filtered
            # out above and must not dilute the percentage.
            "clean_percent": 100 * clean_count / len(benchmark_data) if benchmark_data else 0,
        },
        "hardware": hw_profile,
        "models": model_stats,
        "tests": test_stats,
    }
|
||
|
||
|
||
# (stats-key prefix, column label) for the modalities that get their own rows.
_REPORT_MODALITIES = (("vision", "Vision"), ("text", "Text"), ("audio", "Audio"))

# Per-model RAM minima start at +inf; seeing it means no sample was recorded.
_REPORT_NO_RAM_SAMPLE = float("inf")


def _ram_span(ram_min, ram_max, collapse_equal: bool = True) -> str:
    """Format a free-RAM range; "-" when no sample was recorded."""
    if ram_min == _REPORT_NO_RAM_SAMPLE:
        return "-"
    if collapse_equal and ram_min == ram_max:
        return f"{ram_min:.1f}"
    return f"{ram_min:.1f}-{ram_max:.1f}"


def _model_row(name, size_gb, mode, count, time_s, ram, comparing, old_time) -> str:
    """Format one per-model table row; ``old_time`` None means no baseline."""
    if not comparing:
        return f"{name:<50} {size_gb:>6.1f}GB {mode:<6} {count:<6} {time_s:>8.1f}s {ram:<20}\n"

    head = f"{name:<40} {size_gb:>5.1f}GB {mode:<6} {count:<5} {time_s:>6.1f}s"
    if old_time is None:
        return f"{head} {'N/A':<8} {'N/A':<8} {'NEW':<10} {ram:<12}\n"

    delta = time_s - old_time
    change_pct = (delta / old_time * 100) if old_time > 0 else 0
    if change_pct > 5:
        status = "⚠️"
    elif change_pct < -1:
        status = "✅"
    else:
        status = ""
    change_str = f"{change_pct:+.1f}% {status}"
    return f"{head} {old_time:>6.1f}s {delta:>+6.1f}s {change_str:<10} {ram:<12}\n"


def _model_rows(model, old_model, comparing) -> str:
    """Format all rows for one model: one per modality, or a legacy fallback."""
    # Shorten model ID (remove mlx-community/ prefix)
    name = model["id"].replace("mlx-community/", "")
    max_len = 38 if comparing else 48
    if len(name) > max_len:
        name = name[:max_len - 3] + "..."

    rows = []
    for key, label in _REPORT_MODALITIES:
        count = model[f"{key}_count"]
        if count <= 0:
            continue
        old_time = None
        if old_model and old_model.get(f"{key}_count", 0) > 0:
            old_time = old_model[f"{key}_time"]
        ram = _ram_span(model[f"{key}_ram_min"], model[f"{key}_ram_max"])
        rows.append(_model_row(name, model["size_gb"], label, count,
                               model[f"{key}_time"], ram, comparing, old_time))

    # Fallback for legacy data (no modality info): one row with the totals.
    if not rows:
        old_time = old_model["total_time"] if old_model else None
        ram = _ram_span(model["ram_min"], model["ram_max"], collapse_equal=False)
        rows.append(_model_row(name, model["size_gb"], "-", model["count"],
                               model["total_time"], ram, comparing, old_time))
    return "".join(rows)


def _category_block(models_list, category_name) -> str:
    """Format category statistics with a Vision/Text/Audio breakdown."""
    if not models_list:
        return ""

    avg_size = sum(m["size_gb"] for m in models_list) / len(models_list)
    lines = [
        f"{category_name}: {len(models_list)} models\n",
        f" Avg size: {avg_size:.1f} GB\n",
    ]

    has_modality = False
    for key, label in _REPORT_MODALITIES:
        tested = [m for m in models_list if m.get(f"{key}_count", 0) > 0]
        if not tested:
            continue
        has_modality = True
        avg_time = sum(m[f"{key}_time"] / m[f"{key}_count"] for m in tested) / len(tested)
        # Collect RAM values (filter sentinel values)
        ram_mins = [m[f"{key}_ram_min"] for m in tested
                    if m[f"{key}_ram_min"] != _REPORT_NO_RAM_SAMPLE]
        ram_maxs = [m[f"{key}_ram_max"] for m in tested if m[f"{key}_ram_max"] > 0]

        lines.append(f" {label} Tests:\n")
        lines.append(f" Models tested: {len(tested)}\n")
        lines.append(f" Avg test time: {avg_time:.1f}s\n")
        # Only output RAM range if data available
        if ram_mins and ram_maxs:
            lines.append(f" RAM range: {min(ram_mins):.1f}-{max(ram_maxs):.1f} GB\n")

    # Fallback for legacy data (no modality info)
    if not has_modality:
        avg_time = sum(m["total_time"] / m["count"] for m in models_list) / len(models_list)
        lines.append(f" Avg test time: {avg_time:.1f}s\n")
        # Average only over models that recorded RAM; the +inf sentinel would
        # otherwise turn the average into "inf".
        sampled = [m["ram_min"] for m in models_list if m["ram_min"] != _REPORT_NO_RAM_SAMPLE]
        if sampled:
            lines.append(f" Avg min RAM: {sum(sampled) / len(sampled):.1f} GB\n")

    return "".join(lines)


def _run_cell(run, max_len) -> str:
    """Format "model (12.3s)" within ``max_len`` chars, keeping the duration."""
    if not run:
        return "-"
    timing = f" ({run['duration']:.1f}s)"
    name = run["model_short"]
    if len(name) + len(timing) <= max_len:
        return name + timing
    # Shorten the model name rather than the duration, which is the point of
    # the column.
    keep = max_len - len(timing) - 3
    if keep < 1:
        return (name + timing)[:max_len - 3] + "..."
    return f"{name[:keep]}...{timing}"


def generate_markdown(stats: Dict, input_file: Path, compare_file: Optional[Path] = None, compare_stats: Optional[Dict] = None) -> str:
    """Generate Markdown report from statistics.

    Args:
        stats: Statistics dict with the keys read below (``quality``,
            ``models``, ``tests``, ``hardware``, ...), as produced by
            ``calculate_statistics``.
        input_file: Primary JSONL file; its stem up to ``-v`` is shown as the
            report date and its path is listed in the report.
        compare_file: Optional baseline file, listed under "Input Files".
        compare_stats: Optional statistics of the baseline run. When given,
            the tables gain Old/Δ/Change columns and models are ordered by
            relative change (largest regression first).

    Returns:
        The complete report as a single Markdown string.
    """
    comparing = bool(compare_stats)
    quality = stats["quality"]
    total_tests = stats["total_tests"]
    total_duration = stats["total_duration"]
    hardware = stats["hardware"]
    date = input_file.stem.split("-v")[0]  # Extract date from filename
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S UTC")

    # Header
    out = [
        f"# Benchmark Report v{TEMPLATE_VERSION}: {stats['mlx_knife_version']}\n\n",
        f"**Date:** {date}\n",
        f"**Generated:** {now}\n",
        f"**Generator:** generate_benchmark_report.py v{TEMPLATE_VERSION}\n",
        f"**Hardware:** {hardware.get('model', 'unknown')}, {hardware.get('cores_physical', '?')} cores\n",
        "\n---\n\n",
        "## Input Files\n\n",
        f"- **Primary:** `{input_file}`\n",
        f"- **Schema:** v{stats['schema_version']}\n",
    ]
    if compare_file:
        out.append(f"- **Comparison:** `{compare_file}`\n")
    out.append("\n---\n\n")

    # Executive Summary
    out.append("## Executive Summary\n\n")
    out.append(f"**Tests:** {total_tests} total ({stats['passed']} passed, {stats['skipped']} skipped)\n")
    out.append(f"**Duration:** {total_duration:.1f}s ({total_duration / 60:.1f} min)\n")
    out.append(f"**Quality:** {quality['clean_percent']:.1f}% clean ({quality['clean']}/{total_tests})\n")
    out.append(f"**Models:** {len(stats['models'])} tested\n\n")

    # Comparison Summary
    compare_models = {}
    if comparing:
        compare_models = {m["id"]: m for m in compare_stats["models"].values()}
        old_duration = compare_stats["total_duration"]
        duration_delta = total_duration - old_duration
        duration_pct = (duration_delta / old_duration * 100) if old_duration > 0 else 0

        # Count models by change direction
        slower_count = 0
        faster_count = 0
        for model in stats["models"].values():
            old_model = compare_models.get(model["id"])
            if not old_model:
                continue
            if model["total_time"] > old_model["total_time"]:
                slower_count += 1
            elif model["total_time"] < old_model["total_time"]:
                faster_count += 1
        total_compared = slower_count + faster_count
        change_icon = "⚠️" if duration_pct > 3 else "✅" if duration_pct < -1 else "➡️"

        # compare_file is optional in the signature; do not crash without it.
        compare_name = compare_file.name if compare_file else "n/a"
        out.append("### Comparison\n\n")
        out.append(f"**vs:** `{compare_name}`\n")
        out.append(f"**Duration:** {old_duration / 60:.1f} min → {total_duration / 60:.1f} min ({duration_pct:+.1f}%) {change_icon}\n")
        if total_compared > 0:
            out.append(f"**Models:** {slower_count}/{total_compared} slower ({100 * slower_count / total_compared:.0f}%), {faster_count}/{total_compared} faster ({100 * faster_count / total_compared:.0f}%)\n")
        out.append("\n")

    # Validation Status
    all_clean = quality["clean_percent"] == 100
    quality_icon = "✅" if all_clean else "⚠️"
    out.append(f"{quality_icon} **System Health:** ")
    if all_clean:
        out.append("All tests clean (RAM >5 GB free, 0 zombies)\n")
    else:
        out.append(f"{quality['degraded_ram']} degraded (RAM <5 GB free), {quality['degraded_zombies']} degraded (zombies)\n")
    out.append("\n---\n\n")

    # Test Summary
    out.append("## Test Summary\n\n")
    out.append("```\n")
    out.append(f"Total tests: {total_tests}\n")
    out.append(f"Passed: {stats['passed']}\n")
    out.append(f"With model: {stats['passed_with_model']}\n")
    out.append(f"Infrastructure: {stats['passed_infrastructure']}\n")
    out.append(f"Skipped: {stats['skipped']}\n")
    out.append(f"Duration: {total_duration:.1f}s ({total_duration / 60:.1f} min)\n")
    out.append("```\n\n---\n\n## System Health\n\n")

    # System Health
    swap, ram, zombies = stats["swap"], stats["ram"], stats["zombies"]
    out.append("```\n")
    out.append(f"Swap (MB): min={swap['min']}, max={swap['max']}, avg={swap['avg']:.1f}\n")
    out.append(f"RAM free (GB): min={ram['min']:.1f}, max={ram['max']:.1f}, avg={ram['avg']:.1f}\n")
    out.append(f"Zombies: min={zombies['min']}, max={zombies['max']}\n")
    out.append("\n")
    out.append("Quality Flags (Thresholds: RAM <5 GB free, zombies >0):\n")
    out.append(f"Clean: {quality['clean']}/{total_tests} ({quality['clean_percent']:.1f}%)\n")
    out.append(f"Degraded (RAM): {quality['degraded_ram']}\n")
    out.append(f"Degraded (zombies): {quality['degraded_zombies']}\n")
    out.append("```\n\n---\n\n## Per-Model Statistics\n\n")

    # Per-Model Statistics: by change percentage (biggest regression first)
    # when comparing, otherwise by total time (descending).
    def change_pct_of(model):
        old = compare_models.get(model["id"])
        if old and old["total_time"] > 0:
            return (model["total_time"] - old["total_time"]) / old["total_time"] * 100
        return 0

    out.append("```\n")
    if comparing:
        sorted_models = sorted(stats["models"].values(), key=change_pct_of, reverse=True)
        out.append(f"{'Model':<40} {'Size':<7} {'Mode':<6} {'Tests':<5} {'Time':<8} {'Old':<8} {'Δ':<8} {'Change':<10} {'RAM (GB)':<12}\n")
        out.append(f"{'=' * 40} {'=' * 7} {'=' * 6} {'=' * 5} {'=' * 8} {'=' * 8} {'=' * 8} {'=' * 10} {'=' * 12}\n")
    else:
        sorted_models = sorted(stats["models"].values(), key=lambda m: m["total_time"], reverse=True)
        out.append(f"{'Model':<50} {'Size':<8} {'Mode':<6} {'Tests':<6} {'Time':<10} {'RAM (GB)':<20}\n")
        out.append(f"{'=' * 50} {'=' * 8} {'=' * 6} {'=' * 6} {'=' * 10} {'=' * 20}\n")

    for model in sorted_models:
        out.append(_model_rows(model, compare_models.get(model["id"]), comparing))
    out.append("```\n\n")

    # Model Categories (with modality differentiation)
    large_models = [m for m in sorted_models if m["size_gb"] >= 20]
    medium_models = [m for m in sorted_models if 10 <= m["size_gb"] < 20]
    small_models = [m for m in sorted_models if m["size_gb"] < 10]

    out.append("### Model Categories\n\n")
    if large_models or medium_models or small_models:
        out.append("```\n")
        if large_models:
            out.append(_category_block(large_models, "LARGE MODELS (≥20 GB)"))
            out.append("\n")
        if medium_models:
            out.append(_category_block(medium_models, "MEDIUM MODELS (10-20 GB)"))
            out.append("\n")
        if small_models:
            out.append(_category_block(small_models, "SMALL MODELS (<10 GB)"))
        out.append("```\n")
    out.append("\n---\n\n")

    # Per-Test Statistics
    out.append("## Per-Test Statistics\n\n")
    out.append("Shows performance range across models for each test.\n\n")

    # Sort tests by model count (descending) - most representative tests first
    sorted_tests = sorted(stats["tests"].values(), key=lambda t: t["model_count"], reverse=True)

    # Comparison lookup for tests (key: (name, modality))
    compare_tests = {}
    if comparing:
        compare_tests = {
            (t["name"], t.get("modality", "unknown")): t
            for t in compare_stats["tests"].values()
        }

    out.append("```\n")
    if comparing:
        out.append(f"{'Test Name':<38} {'Mode':<6} {'Models':<7} {'Fastest':<18} {'Slowest':<18} {'Med':<6} {'Old':<6} {'Δ Med':<8}\n")
        out.append(f"{'=' * 38} {'=' * 6} {'=' * 7} {'=' * 18} {'=' * 18} {'=' * 6} {'=' * 6} {'=' * 8}\n")
    else:
        out.append(f"{'Test Name':<44} {'Mode':<6} {'Models':<7} {'Fastest':<22} {'Slowest':<22} {'Med Time'}\n")
        out.append(f"{'=' * 44} {'=' * 6} {'=' * 7} {'=' * 22} {'=' * 22} {'=' * 8}\n")

    mode_labels = dict(_REPORT_MODALITIES)
    max_test_len = 36 if comparing else 42
    max_run_len = 16 if comparing else 20
    for test in sorted_tests:
        # Shorten test name if needed
        test_short = test["name"]
        if len(test_short) > max_test_len:
            test_short = test_short[:max_test_len - 3] + "..."

        # Vision/Text/Audio, or "-" for unknown
        modality = test.get("modality", "unknown")
        mode_str = mode_labels.get(modality, "-")

        fastest_str = _run_cell(test["fastest"], max_run_len)
        slowest_str = _run_cell(test["slowest"], max_run_len)
        med_time = test["median_time"]

        if not comparing:
            out.append(f"{test_short:<44} {mode_str:<6} {test['model_count']:<7} {fastest_str:<22} {slowest_str:<22} {med_time:.1f}s\n")
            continue

        old_test = compare_tests.get((test["name"], modality))
        if old_test:
            old_med = old_test["median_time"]
            delta_pct = ((med_time - old_med) / old_med * 100) if old_med > 0 else 0
            delta_str = f"{delta_pct:+.1f}%"
            out.append(f"{test_short:<38} {mode_str:<6} {test['model_count']:<7} {fastest_str:<18} {slowest_str:<18} {med_time:<5.1f}s {old_med:<5.1f}s {delta_str:<8}\n")
        else:
            out.append(f"{test_short:<38} {mode_str:<6} {test['model_count']:<7} {fastest_str:<18} {slowest_str:<18} {med_time:<5.1f}s {'N/A':<6} {'NEW':<8}\n")

    out.append("```\n\n")

    out.append("\n---\n\n")
    out.append("## Files\n\n")
    out.append(f"- **Benchmark report:** `{input_file}`\n")
    out.append(f"- **Schema:** `benchmarks/schemas/report-v{stats['schema_version']}.schema.json`\n")

    return "".join(out)
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: turn a JSONL benchmark file into a Markdown report.

    Steps, in order:
      1. Parse the CLI arguments (``input``, ``--compare``, ``--output``).
      2. Resolve the input file (explicit argument, else ``find_latest_jsonl()``).
      3. Load the schema and the JSONL data, then validate the data.
      4. Compute statistics, plus comparison statistics when ``--compare`` is given.
      5. Render the Markdown and write it to the output path.
      6. Print a short summary to stdout.

    The process exits with status 1 when no input file can be found, when the
    input or comparison file does not exist, or when schema validation fails.
    """
    parser = argparse.ArgumentParser(
        description="Generate benchmark analysis report from JSONL data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        'input',
        nargs='?',
        type=Path,
        help='JSONL benchmark file (default: latest in benchmarks/reports/)',
    )
    parser.add_argument(
        '--compare',
        type=Path,
        help='Compare with this JSONL file (adds Old/Δ/Change columns)',
    )
    parser.add_argument(
        '--output',
        type=Path,
        help='Output markdown file (default: auto-generated in benchmarks/reports/)',
    )

    args = parser.parse_args()

    # Determine input file
    if args.input:
        input_file = args.input
    else:
        input_file = find_latest_jsonl()
        if not input_file:
            print("❌ No JSONL files found in benchmarks/reports/")
            sys.exit(1)
        print(f"📊 Auto-detected: {input_file}")

    if not input_file.exists():
        print(f"❌ File not found: {input_file}")
        sys.exit(1)

    # Load and validate
    print(f"📋 Loading: {input_file}")
    schema = load_schema()
    data = load_jsonl(input_file)

    print(f"✓ Loaded {len(data)} entries")

    # Validate against schema
    if not validate_jsonl(data, schema, input_file):
        sys.exit(1)

    print("✓ Schema validation passed")

    # Calculate statistics
    stats = calculate_statistics(data)

    # Load and calculate comparison statistics if requested
    compare_stats = None
    if args.compare:
        if not args.compare.exists():
            print(f"❌ Comparison file not found: {args.compare}")
            sys.exit(1)
        print(f"📊 Comparing with: {args.compare}")
        compare_data = load_jsonl(args.compare)
        # The comparison file is validated against the same schema as the input.
        if not validate_jsonl(compare_data, schema, args.compare):
            sys.exit(1)
        compare_stats = calculate_statistics(compare_data)
        print(f"✓ Loaded {len(compare_data)} comparison entries")

    # Generate report
    markdown = generate_markdown(stats, input_file, args.compare, compare_stats)

    # Determine output file
    if args.output:
        output_file = args.output
    else:
        # Auto-generate: BENCHMARK-v<template>-<version>-<date>.md
        # Prefer the version embedded in the filename; fall back to the one
        # recorded in the computed statistics.
        version = extract_version_from_filename(input_file) or stats["mlx_knife_version"]
        # Everything before the first "-v" in the stem is used as the date
        # portion (the whole stem when the filename has no "-v").
        date = input_file.stem.split("-v")[0]
        output_file = REPORTS_DIR / f"BENCHMARK-v{TEMPLATE_VERSION}-{version}-{date}.md"

    # Write output
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the written report does not depend on the platform
    # locale (a non-UTF-8 default could fail on non-ASCII report content).
    with open(output_file, 'w', encoding="utf-8") as f:
        f.write(markdown)

    print(f"✅ Generated: {output_file}")
    print()
    print("Summary:")
    print(f"  Tests: {stats['passed']}/{stats['total_tests']} passed")
    print(f"  Duration: {stats['total_duration']/60:.1f} min")
    print(f"  Quality: {stats['quality']['clean_percent']:.1f}% clean")
    print(f"  Models: {len(stats['models'])}")
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|