Files
mlx-knife/benchmarks/tools/memmon.py
T
The BROKE Cluster Team bf7480d042 Release 2.0.4-beta.9: Audio transcription via mlx-audio
Major Features:
- Audio transcription via mlx-audio backend (Whisper, >10min duration)
- OpenAI /v1/audio/transcriptions endpoint
- Memory Gate System (Vision: 8GB, Audio: 4GB)
- Config-based backend routing (ADR-020)
- Benchmark toolchain (memmon/memplot, Schema v0.2.2)

Key Fixes:
- EuroLLM tokenizer decoding
- Vision-model text-only routing regression
- Multimodal model context length detection
- Memory cleanup bug (mx.metal.clear_cache)
- Orphan process bug

Test Results:
- Unit tests: 647 passed, 11 skipped (Python 3.10-3.12)
- wet-umbrella: 171 passed total

See CHANGELOG.md for complete details and known issues.
2026-02-04 03:10:30 +01:00

508 lines
18 KiB
Python

#!/usr/bin/env python3
"""Memory Monitor - Standalone tool for tracking memory, CPU, and GPU during subprocess execution.
Samples RAM, swap, memory pressure, CPU load/usage, and GPU utilization while running any command.
Outputs JSONL with per-sample data and final summary.
Metrics tracked:
- RAM: free GB, memory pressure (kern.memorystatus_vm_pressure_level), vm_pressure (vm.memory_pressure)
- Swap: used MB
- CPU: load average (1/5/15 min), user/sys/idle %
- GPU: Device/Renderer/Tiler utilization % (via ioreg PerformanceStatistics, no sudo required)
Usage:
# Basic usage
python benchmarks/tools/memmon.py -- pytest -m live_e2e tests_2.0/live/
# With options
python benchmarks/tools/memmon.py --interval 200 --output memory.jsonl -- pytest -v
# Just monitor (no subprocess)
python benchmarks/tools/memmon.py --duration 60 --output memory.jsonl
Platform: macOS + Apple Silicon (MLX requirement)
Dependencies: ZERO - uses native macOS tools (sysctl, vm_stat, top, ioreg)
Future: Will be part of mlxk-benchmark kit.
"""
import argparse
import json
import os
import re
import subprocess
import sys
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
def parse_vm_stat_page_size(output: str, default: int = 16384) -> int:
    """Extract the page size (in bytes) announced in ``vm_stat`` output.

    ``vm_stat`` prints a header containing "page size of N bytes"; this
    function returns N. The module's docstring history notes that the same
    logic lives in tests_2.0/conftest.py (ADR-013 Phase 0.5).

    Args:
        output: Raw text printed by ``vm_stat``. ``None`` or an empty string
            is treated as "no header found".
        default: Value returned when no usable page size is found. Defaults
            to 16384, which the original code labels the Apple Silicon default.

    Returns:
        The parsed page size, or ``default`` when the header is missing or
        reports a non-positive size.
    """
    match = re.search(r"page size of (\d+) bytes", output or "")
    if match:
        page_size = int(match.group(1))
        # A zero page size would silently turn every RAM figure into 0.
        if page_size > 0:
            return page_size
    return default
def _parse_loadavg(text: str) -> tuple:
    """Parse ``sysctl -n vm.loadavg`` output such as "{ 2.45 3.12 2.89 }".

    Returns a (load_1, load_5, load_15) tuple of floats, or all zeros when
    fewer than three fields are present. Raises ValueError on non-numeric
    fields.
    """
    fields = text.strip().strip("{}").split()
    if len(fields) < 3:
        return 0.0, 0.0, 0.0
    return float(fields[0]), float(fields[1]), float(fields[2])


def _parse_top_cpu(text: str) -> tuple:
    """Parse the "CPU usage:" line of ``top -l 1`` output.

    Expected line: "CPU usage: 5.26% user, 10.52% sys, 84.21% idle".
    Returns a (user, sys, idle) tuple of floats; fields that are absent stay
    0.0. Only the first "CPU usage:" line is considered. Raises ValueError on
    a non-numeric percentage.
    """
    user = system = idle = 0.0
    for line in text.splitlines():
        if "CPU usage:" not in line:
            continue
        for field in line.split("CPU usage:")[1].split(","):
            field = field.strip()
            if "user" in field:
                user = float(field.split("%")[0])
            elif "sys" in field:
                system = float(field.split("%")[0])
            elif "idle" in field:
                idle = float(field.split("%")[0])
        break
    return user, system, idle


def get_cpu_load() -> dict:
    """Get CPU load averages and current CPU usage.

    Load averages (1/5/15 min) come from ``sysctl -n vm.loadavg``; the
    user/sys/idle percentages come from a single ``top`` sample. Both tools
    run with LC_ALL=C so decimal separators are always dots.

    Sampling is best-effort: if a tool is missing, fails, times out, or
    prints something unparseable, the affected values stay at 0.0.

    Returns:
        Dict with keys load_1, load_5, load_15 (rounded to 2 decimals) and
        cpu_user, cpu_sys, cpu_idle (rounded to 1 decimal).
    """
    env = os.environ.copy()
    env["LC_ALL"] = "C"

    load_1 = load_5 = load_15 = 0.0
    cpu_user = cpu_sys = cpu_idle = 0.0

    # Load average via sysctl
    try:
        result = subprocess.run(
            ["sysctl", "-n", "vm.loadavg"],
            capture_output=True, text=True, timeout=1, env=env
        )
        if result.returncode == 0:
            load_1, load_5, load_15 = _parse_loadavg(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing tool, timeout, or unparseable output: keep the zeros.
        pass

    # CPU usage via top (single sample)
    try:
        result = subprocess.run(
            ["top", "-l", "1", "-n", "0", "-s", "0"],
            capture_output=True, text=True, timeout=2, env=env
        )
        if result.returncode == 0:
            cpu_user, cpu_sys, cpu_idle = _parse_top_cpu(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError):
        pass

    return {
        "load_1": round(load_1, 2),
        "load_5": round(load_5, 2),
        "load_15": round(load_15, 2),
        "cpu_user": round(cpu_user, 1),
        "cpu_sys": round(cpu_sys, 1),
        "cpu_idle": round(cpu_idle, 1),
    }
# (result key, pattern) pairs for the counters read from PerformanceStatistics.
# Compiled once because get_gpu_usage() runs on every sample.
_GPU_UTIL_PATTERNS = (
    ("gpu_device_util", re.compile(r'"Device Utilization %"=(\d+)')),
    ("gpu_renderer_util", re.compile(r'"Renderer Utilization %"=(\d+)')),
    ("gpu_tiler_util", re.compile(r'"Tiler Utilization %"=(\d+)')),
)


def _parse_gpu_performance_stats(text: str) -> dict:
    """Extract utilization counters from ``ioreg`` AGXAccelerator output.

    Looks at the first line containing "PerformanceStatistics", expected to
    look like:
        "PerformanceStatistics" = {"Device Utilization %"=5,"Renderer Utilization %"=3,...}
    Counters that are absent stay 0.0. ``gpu_active`` becomes True only when
    the device utilization counter was found.
    """
    stats = {
        "gpu_active": False,
        "gpu_device_util": 0.0,    # Overall GPU utilization %
        "gpu_renderer_util": 0.0,  # 3D rendering cores %
        "gpu_tiler_util": 0.0,     # Geometry/tiler cores %
    }
    for line in text.splitlines():
        if "PerformanceStatistics" not in line:
            continue
        for key, pattern in _GPU_UTIL_PATTERNS:
            match = pattern.search(line)
            if match:
                stats[key] = float(match.group(1))
                if key == "gpu_device_util":
                    stats["gpu_active"] = True
        break  # only the first PerformanceStatistics line is used
    return stats


def get_gpu_usage() -> dict:
    """Get Apple Silicon GPU usage via ioreg PerformanceStatistics.

    Runs ``ioreg -r -c AGXAccelerator -d 2`` (no sudo required) and extracts:
    - Device Utilization %   (overall GPU busy %)
    - Renderer Utilization % (3D rendering cores)
    - Tiler Utilization %    (geometry processing)

    Sampling is best-effort: if ioreg is missing, fails, or times out, every
    utilization value is 0.0 and ``gpu_active`` is False. There is no
    secondary detection path.

    Returns:
        Dict with keys gpu_active (bool), gpu_device_util, gpu_renderer_util
        and gpu_tiler_util (floats).
    """
    try:
        result = subprocess.run(
            ["ioreg", "-r", "-c", "AGXAccelerator", "-d", "2"],
            capture_output=True, text=True, timeout=2
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing tool, timeout, or undecodable output: report an idle GPU.
        return _parse_gpu_performance_stats("")

    if result.returncode != 0:
        return _parse_gpu_performance_stats("")
    return _parse_gpu_performance_stats(result.stdout)
def _run_native_tool(command: list, env: dict) -> Optional[str]:
    """Run a native tool with a 1 s timeout and return its stdout.

    Returns None when the tool cannot be started, times out, or exits with a
    non-zero status, so callers can fall back to their defaults.
    """
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=1, env=env
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        return None
    return result.stdout if result.returncode == 0 else None


def _read_sysctl_int(name: str, default: int, env: dict) -> int:
    """Read an integer sysctl value, returning ``default`` if unavailable."""
    output = _run_native_tool(["sysctl", "-n", name], env)
    if output is None:
        return default
    try:
        return int(output.strip())
    except ValueError:
        return default


def _parse_swap_used_mb(output: str) -> int:
    """Parse the "used" figure of ``sysctl vm.swapusage`` into whole MB.

    Expected text:
        "vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)"
    The value may carry an M or G suffix; anything else yields 0.
    """
    _, found, tail = output.partition("used = ")
    tokens = tail.split() if found else []
    if not tokens:
        return 0
    used = tokens[0]
    try:
        if used.endswith("G"):
            return int(float(used[:-1]) * 1024)
        if used.endswith("M"):
            return int(float(used[:-1]))
    except ValueError:
        pass
    return 0


def _parse_free_ram_gb(output: str):
    """Convert the "Pages free:" line of ``vm_stat`` output into GB.

    Returns 0 when the line is missing or malformed.
    """
    page_size = parse_vm_stat_page_size(output)
    for line in output.splitlines():
        if "Pages free:" in line:
            # Line looks like "Pages free: 12345."
            try:
                pages_free = int(line.split(":")[1].strip().rstrip("."))
            except ValueError:
                return 0
            return round(pages_free * page_size / (1024**3), 2)
    return 0


def get_memory_sample() -> dict:
    """Get the current memory, CPU, and GPU state using native macOS tools.

    Platform: macOS only (MLX requirement).
    Dependencies: none beyond sysctl and vm_stat (plus whatever
    get_cpu_load() and get_gpu_usage() invoke).

    Note from the original implementation: psutil.swap_memory() was used
    previously and reported 0 MB during 65 GB of real swap usage, which is
    why swap is read from sysctl instead.

    Every reading is best-effort; a tool that fails leaves its value at the
    default shown below.

    Returns:
        Dict with:
        - ram_free_gb: free pages from vm_stat, in GB (0 if unavailable)
        - ram_used_gb, ram_percent, swap_percent: always 0 (not derivable
          from vm_stat alone; kept as placeholder keys)
        - swap_used_mb: used swap in MB (0 if unavailable)
        - memory_pressure: kern.memorystatus_vm_pressure_level
          (1=NORMAL, 2=WARN, 4=CRITICAL; defaults to 1)
        - vm_pressure: vm.memory_pressure, used by Memory Gates
          (0=NORMAL, 1=WARN, 4=CRITICAL; defaults to 0)
        - all keys returned by get_cpu_load() and get_gpu_usage()
    """
    # Force C locale for consistent number formatting (avoid locale-specific
    # decimal separators).
    env = os.environ.copy()
    env["LC_ALL"] = "C"

    memory_pressure = _read_sysctl_int(
        "kern.memorystatus_vm_pressure_level", 1, env
    )
    vm_pressure = _read_sysctl_int("vm.memory_pressure", 0, env)

    swap_output = _run_native_tool(["sysctl", "vm.swapusage"], env)
    swap_mb = _parse_swap_used_mb(swap_output) if swap_output is not None else 0

    vm_stat_output = _run_native_tool(["vm_stat"], env)
    ram_free_gb = (
        _parse_free_ram_gb(vm_stat_output) if vm_stat_output is not None else 0
    )

    # Get CPU and GPU metrics
    cpu_data = get_cpu_load()
    gpu_data = get_gpu_usage()

    return {
        "ram_free_gb": ram_free_gb,
        "ram_used_gb": 0,  # Not available from vm_stat alone
        "ram_percent": 0,
        "swap_used_mb": swap_mb,
        "swap_percent": 0,
        "memory_pressure": memory_pressure,  # kern.memorystatus_vm_pressure_level
        "vm_pressure": vm_pressure,  # vm.memory_pressure (used by Memory Gates)
        **cpu_data,
        **gpu_data,
    }
class MemoryMonitor:
    """Background sampler that records get_memory_sample() at a fixed interval.

    Samples are collected on a daemon thread between start() and stop().

    Usage:
        monitor = MemoryMonitor(interval_ms=200)
        monitor.start()
        # ... do work ...
        summary = monitor.stop()
    """

    def __init__(self, interval_ms: int = 200):
        """Create an idle monitor.

        Args:
            interval_ms: Pause between two samples, in milliseconds. The time
                spent taking a sample is not subtracted from the pause.
        """
        self.interval = interval_ms / 1000
        self.samples: list[dict] = []
        self.running = False
        self.thread: Optional[threading.Thread] = None
        self.start_time: float = 0
        # Lets stop() wake the sampler immediately instead of waiting for a
        # full sleep interval to elapse.
        self._stop_event = threading.Event()

    def start(self):
        """Start background sampling, discarding previously collected samples."""
        self.running = True
        self._stop_event.clear()
        self.samples = []
        self.start_time = time.time()
        self.thread = threading.Thread(target=self._sample_loop, daemon=True)
        self.thread.start()

    def stop(self) -> dict:
        """Stop sampling and return summary statistics.

        Waits up to one second for the sampler thread to finish.

        Returns:
            ``{"error": "No samples collected"}`` when nothing was sampled;
            otherwise a dict with duration_s, samples, interval_ms and the
            min/max/avg figures for RAM, swap, load, CPU and GPU.
        """
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=1.0)

        # Work on one snapshot: the sampler may still append a late sample if
        # the join above timed out, and all statistics must agree on length.
        samples = list(self.samples)
        if not samples:
            return {"error": "No samples collected"}

        count = len(samples)

        def column(key: str, required: bool = False) -> list:
            if required:
                return [s[key] for s in samples]
            return [s.get(key, 0) for s in samples]

        ram_values = column("ram_free_gb", required=True)
        swap_values = column("swap_used_mb", required=True)
        load_values = column("load_1")
        cpu_user_values = column("cpu_user")
        cpu_sys_values = column("cpu_sys")
        gpu_device_values = column("gpu_device_util")
        gpu_renderer_values = column("gpu_renderer_util")

        return {
            "duration_s": round(time.time() - self.start_time, 2),
            "samples": count,
            # round() rather than int(): the ms -> s -> ms round trip can
            # land just below the configured value and would be truncated.
            "interval_ms": int(round(self.interval * 1000)),
            "ram_free_min_gb": min(ram_values),
            "ram_free_max_gb": max(ram_values),
            "ram_free_avg_gb": round(sum(ram_values) / count, 2),
            "swap_max_mb": max(swap_values),
            "swap_avg_mb": round(sum(swap_values) / count, 1),
            "load_max": round(max(load_values), 2),
            "load_avg": round(sum(load_values) / count, 2),
            "cpu_user_max": round(max(cpu_user_values), 1),
            "cpu_sys_max": round(max(cpu_sys_values), 1),
            "gpu_device_max": round(max(gpu_device_values), 1),
            "gpu_device_avg": round(sum(gpu_device_values) / count, 1),
            "gpu_renderer_max": round(max(gpu_renderer_values), 1),
            "gpu_renderer_avg": round(sum(gpu_renderer_values) / count, 1),
        }

    def get_samples(self) -> list[dict]:
        """Return a shallow copy of all collected samples."""
        return self.samples.copy()

    def _sample_loop(self):
        """Sampler thread body: take a sample, then pause until stopped."""
        while self.running:
            sample = get_memory_sample()
            now = time.time()
            sample["ts"] = round(now, 3)
            sample["elapsed_s"] = round(now - self.start_time, 2)
            self.samples.append(sample)
            # wait() returns True as soon as stop() sets the event.
            if self._stop_event.wait(self.interval):
                break
def _print_summary(summary: dict, exit_code: Optional[int] = None) -> None:
    """Print the human-readable summary block.

    A summary that carries an "error" key (no samples were collected) is
    reported as a warning instead of being formatted as statistics.
    """
    print("-" * 60)
    print("Memory Monitor Summary:")
    if "error" in summary:
        print(f" Warning: {summary['error']}")
    else:
        print(f" Duration: {summary['duration_s']:.1f}s ({summary['samples']} samples)")
        print(f" RAM free: {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
        print(f" Swap peak: {summary['swap_max_mb']:.1f} MB")
        print(f" CPU load: max {summary.get('load_max', 0):.1f}, avg {summary.get('load_avg', 0):.1f}")
        print(f" CPU user/sys: max {summary.get('cpu_user_max', 0):.0f}% / {summary.get('cpu_sys_max', 0):.0f}%")
        print(f" GPU device: max {summary.get('gpu_device_max', 0):.0f}%, avg {summary.get('gpu_device_avg', 0):.0f}%")
        print(f" GPU renderer: max {summary.get('gpu_renderer_max', 0):.0f}%, avg {summary.get('gpu_renderer_avg', 0):.0f}%")
    if exit_code is not None:
        print(f" Exit code: {exit_code}")


def _write_jsonl(output_file, samples: list, summary: dict) -> None:
    """Write one JSON object per sample, then ``{"summary": ...}`` as last line.

    Missing parent directories are created so a finished run is not lost to
    a mistyped output path.
    """
    target = Path(output_file)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(sample) + "\n" for sample in samples)
        f.write(json.dumps({"summary": summary}) + "\n")
    print(f" Output: {output_file}")


def run_with_monitoring(
    command: list[str],
    interval_ms: int = 200,
    output_file: Optional[Path] = None,
    verbose: bool = False
) -> dict:
    """Run a command while monitoring memory.

    Args:
        command: Command and arguments to run.
        interval_ms: Sampling interval in milliseconds.
        output_file: Optional JSONL output file.
        verbose: Accepted for CLI compatibility; currently has no effect
            (samples are not printed as they are collected).

    Returns:
        Summary dict from MemoryMonitor.stop(), extended with exit_code,
        command and timestamp. If no sample was collected (for example the
        command finished before the first sample completed), the dict holds
        an "error" key instead of statistics.
    """
    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Starting memory monitor (interval: {interval_ms}ms)")
    print(f"Running: {' '.join(command)}")
    print("-" * 60)

    monitor.start()

    # Run subprocess
    try:
        exit_code = subprocess.run(command).returncode
    except KeyboardInterrupt:
        exit_code = 130  # exit code used by the original code for an interrupt
        print("\nInterrupted")
    except Exception as e:
        # Typically the command could not be started at all.
        exit_code = 1
        print(f"\nError: {e}")

    summary = monitor.stop()
    summary["exit_code"] = exit_code
    summary["command"] = " ".join(command)
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    _print_summary(summary, exit_code=exit_code)

    # Write output
    if output_file:
        _write_jsonl(output_file, monitor.get_samples(), summary)

    return summary


def monitor_only(
    duration_s: float,
    interval_ms: int = 200,
    output_file: Optional[Path] = None
) -> dict:
    """Monitor memory for a fixed duration (no subprocess).

    Args:
        duration_s: How long to monitor, in seconds. Ctrl-C ends the wait
            early and still produces a summary.
        interval_ms: Sampling interval in milliseconds.
        output_file: Optional JSONL output file.

    Returns:
        Summary dict from MemoryMonitor.stop(), extended with timestamp. If
        no sample was collected, the dict holds an "error" key instead of
        statistics.
    """
    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Monitoring memory for {duration_s}s (interval: {interval_ms}ms)")
    print("-" * 60)

    monitor.start()
    try:
        time.sleep(duration_s)
    except KeyboardInterrupt:
        print("\nInterrupted")

    summary = monitor.stop()
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    _print_summary(summary)

    if output_file:
        _write_jsonl(output_file, monitor.get_samples(), summary)

    return summary
def main():
    """Command-line entry point.

    Modes:
    - ``--duration N``: sample for N seconds without running a command.
    - ``-- CMD ...``: run CMD under monitoring and exit with its exit code.
    - neither: print help plus examples and exit with status 1.

    When both ``--duration`` and a command are given, ``--duration`` wins and
    the command is ignored. Non-positive ``--interval`` or ``--duration``
    values are rejected with a usage error (exit status 2).
    """
    parser = argparse.ArgumentParser(
        description="Monitor memory while running a command",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "--interval", "-i",
        type=int,
        default=200,
        help="Sampling interval in milliseconds (default: 200)"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output JSONL file for samples and summary"
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        help="Monitor for fixed duration (seconds), no subprocess"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print samples as they're collected"
    )
    parser.add_argument(
        "command",
        nargs="*",
        help="Command to run (after --)"
    )

    args = parser.parse_args()

    # Reject values that would otherwise only fail later inside time.sleep().
    if args.interval <= 0:
        parser.error("--interval must be a positive number of milliseconds")
    if args.duration is not None and args.duration <= 0:
        parser.error("--duration must be a positive number of seconds")

    # "is not None" so that an explicitly supplied duration is never mistaken
    # for "option absent".
    if args.duration is not None:
        # Monitor-only mode
        monitor_only(
            duration_s=args.duration,
            interval_ms=args.interval,
            output_file=args.output
        )
    elif args.command:
        # Run command with monitoring
        summary = run_with_monitoring(
            command=args.command,
            interval_ms=args.interval,
            output_file=args.output,
            verbose=args.verbose
        )
        sys.exit(summary.get("exit_code", 0))
    else:
        parser.print_help()
        print("\nExamples:")
        print(" python benchmarks/tools/memmon.py -- pytest -m live_e2e")
        print(" python benchmarks/tools/memmon.py --duration 10 --output mem.jsonl")
        sys.exit(1)


if __name__ == "__main__":
    main()