mirror of
https://github.com/cloudstack-llc/mlx-knife.git
synced 2026-07-01 20:44:14 -04:00
bf7480d042
Major Features: - Audio transcription via mlx-audio backend (Whisper, >10min duration) - OpenAI /v1/audio/transcriptions endpoint - Memory Gate System (Vision: 8GB, Audio: 4GB) - Config-based backend routing (ADR-020) - Benchmark toolchain (memmon/memplot, Schema v0.2.2) Key Fixes: - EuroLLM tokenizer decoding - Vision-model text-only routing regression - Multimodal model context length detection - Memory cleanup bug (mx.metal.clear_cache) - Orphan process bug Test Results: - Unit tests: 647 passed, 11 skipped (Python 3.10-3.12) - wet-umbrella: 171 passed total See CHANGELOG.md for complete details and known issues.
508 lines
18 KiB
Python
508 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""Memory Monitor - Standalone tool for tracking memory, CPU, and GPU during subprocess execution.
|
|
|
|
Samples RAM, swap, memory pressure, CPU load/usage, and GPU utilization while running any command.
|
|
Outputs JSONL with per-sample data and final summary.
|
|
|
|
Metrics tracked:
|
|
- RAM: free GB, memory pressure (kern.memorystatus_vm_pressure_level), vm_pressure (vm.memory_pressure)
|
|
- Swap: used MB
|
|
- CPU: load average (1/5/15 min), user/sys/idle %
|
|
- GPU: Device/Renderer/Tiler utilization % (via ioreg PerformanceStatistics, no sudo required)
|
|
|
|
Usage:
|
|
# Basic usage
|
|
python benchmarks/tools/memmon.py -- pytest -m live_e2e tests_2.0/live/
|
|
|
|
# With options
|
|
python benchmarks/tools/memmon.py --interval 200 --output memory.jsonl -- pytest -v
|
|
|
|
# Just monitor (no subprocess)
|
|
python benchmarks/tools/memmon.py --duration 60 --output memory.jsonl
|
|
|
|
Platform: macOS + Apple Silicon (MLX requirement)
|
|
Dependencies: ZERO - uses native macOS tools (sysctl, vm_stat, top, ioreg)
|
|
|
|
Future: Will be part of mlxk-benchmark kit.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import threading
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
|
|
def parse_vm_stat_page_size(output: str) -> int:
    """Return the page size in bytes announced in ``vm_stat`` output.

    The size is taken from the "page size of N bytes" phrase. When that
    phrase is absent, 16384 (the Apple Silicon default) is returned.

    Reuses proven logic from tests_2.0/conftest.py (ADR-013 Phase 0.5).
    """
    found = re.search(r"page size of (\d+) bytes", output)
    # 16384 is the Apple Silicon default used when the header is missing.
    return int(found.group(1)) if found else 16384
|
|
|
|
|
|
def get_cpu_load() -> dict:
    """Collect load averages and a one-shot CPU usage reading.

    Load averages (1/5/15 min) are read from ``sysctl -n vm.loadavg``; the
    user/sys/idle percentages are read from the first "CPU usage:" line of a
    single ``top`` sample. Any value that cannot be obtained stays at 0.0.

    Returns:
        Dict with keys load_1, load_5, load_15 (rounded to 2 decimals) and
        cpu_user, cpu_sys, cpu_idle (rounded to 1 decimal).
    """
    # C locale keeps the decimal separator a dot regardless of user settings.
    c_env = dict(os.environ, LC_ALL="C")

    averages = [0.0, 0.0, 0.0]
    usage = {"user": 0.0, "sys": 0.0, "idle": 0.0}

    # Load average via sysctl; expected shape: "{ 2.45 3.12 2.89 }"
    try:
        proc = subprocess.run(
            ["sysctl", "-n", "vm.loadavg"],
            capture_output=True, text=True, timeout=1, env=c_env,
        )
        if proc.returncode == 0:
            fields = proc.stdout.strip().strip("{}").split()
            if len(fields) >= 3:
                for slot, text in enumerate(fields[:3]):
                    averages[slot] = float(text)
    except Exception:
        # Best effort: keep whatever was parsed so far.
        pass

    # CPU usage via top (single sample); expected shape:
    # "CPU usage: 5.26% user, 10.52% sys, 84.21% idle"
    try:
        proc = subprocess.run(
            ["top", "-l", "1", "-n", "0", "-s", "0"],
            capture_output=True, text=True, timeout=2, env=c_env,
        )
        if proc.returncode == 0:
            cpu_line = next(
                (ln for ln in proc.stdout.splitlines() if "CPU usage:" in ln),
                None,
            )
            if cpu_line is not None:
                for chunk in cpu_line.split("CPU usage:")[1].split(","):
                    chunk = chunk.strip()
                    # First matching label wins, checked in user/sys/idle order.
                    label = next((name for name in usage if name in chunk), None)
                    if label is not None:
                        usage[label] = float(chunk.split("%")[0])
    except Exception:
        # Best effort: keep whatever was parsed so far.
        pass

    return {
        "load_1": round(averages[0], 2),
        "load_5": round(averages[1], 2),
        "load_15": round(averages[2], 2),
        "cpu_user": round(usage["user"], 1),
        "cpu_sys": round(usage["sys"], 1),
        "cpu_idle": round(usage["idle"], 1),
    }
|
|
|
|
|
|
def get_gpu_usage() -> dict:
    """Read Apple Silicon GPU utilization from ioreg PerformanceStatistics.

    Runs ``ioreg -r -c AGXAccelerator -d 2`` and inspects the first output
    line that mentions PerformanceStatistics, extracting:
    - Device Utilization % (overall GPU busy %)
    - Renderer Utilization % (3D rendering cores)
    - Tiler Utilization % (geometry processing)

    No sudo required. If the command fails or nothing can be parsed, all
    utilization values stay at 0.0 and ``gpu_active`` stays False.

    Returns:
        Dict with keys gpu_active, gpu_device_util, gpu_renderer_util,
        gpu_tiler_util.
    """
    # Result key -> pattern that captures its integer percentage.
    # Format: "PerformanceStatistics" = {"Device Utilization %"=5,"Renderer Utilization %"=3,...}
    patterns = (
        ("gpu_device_util", r'"Device Utilization %"=(\d+)'),
        ("gpu_renderer_util", r'"Renderer Utilization %"=(\d+)'),
        ("gpu_tiler_util", r'"Tiler Utilization %"=(\d+)'),
    )
    readings = {key: 0.0 for key, _ in patterns}
    active = False

    try:
        proc = subprocess.run(
            ["ioreg", "-r", "-c", "AGXAccelerator", "-d", "2"],
            capture_output=True, text=True, timeout=2,
        )
        if proc.returncode == 0:
            # Only the first PerformanceStatistics line is considered.
            stats_line = next(
                (ln for ln in proc.stdout.splitlines() if "PerformanceStatistics" in ln),
                None,
            )
            if stats_line is not None:
                for key, pattern in patterns:
                    hit = re.search(pattern, stats_line)
                    if hit:
                        readings[key] = float(hit.group(1))
                        if key == "gpu_device_util":
                            # A device reading is what marks the GPU as detected.
                            active = True
    except Exception:
        # Best effort: fall back to the defaults gathered so far.
        pass

    return {
        "gpu_active": active,
        "gpu_device_util": readings["gpu_device_util"],  # Overall GPU utilization %
        "gpu_renderer_util": readings["gpu_renderer_util"],  # 3D rendering cores %
        "gpu_tiler_util": readings["gpu_tiler_util"],  # Geometry/tiler cores %
    }
|
|
|
|
|
|
def get_memory_sample() -> dict:
    """Take one snapshot of memory, swap, CPU and GPU state on macOS.

    Platform: macOS only (MLX requirement)
    Dependencies: ZERO - uses sysctl and vm_stat

    Reuses proven parsing logic from tests_2.0/conftest.py (_get_macos_system_health).
    Previously used psutil.swap_memory() which was BROKEN (showed 0 MB during 65GB real usage).

    Every probe is best effort: when a command fails or its output cannot be
    parsed, the corresponding field keeps its default.

    Returns:
        Dict with ram_free_gb, ram_used_gb, ram_percent, swap_used_mb,
        swap_percent, memory_pressure, vm_pressure, followed by the keys of
        get_cpu_load() and get_gpu_usage().
    """
    # Force C locale for consistent number formatting (avoid locale-specific decimal separators)
    c_env = dict(os.environ, LC_ALL="C")

    def _sysctl_int(name: str, fallback: int) -> int:
        """Read an integer sysctl value, returning *fallback* on any failure."""
        try:
            proc = subprocess.run(
                ["sysctl", "-n", name],
                capture_output=True, text=True, timeout=1, env=c_env,
            )
            return int(proc.stdout.strip())
        except Exception:
            return fallback

    # kern.memorystatus_vm_pressure_level: 1=NORMAL, 2=WARN, 4=CRITICAL
    pressure_level = _sysctl_int("kern.memorystatus_vm_pressure_level", 1)
    # vm.memory_pressure: 0=NORMAL, 1=WARN, 4=CRITICAL - used by Memory Gates
    gate_pressure = _sysctl_int("vm.memory_pressure", 0)

    # Swap usage via sysctl (same logic as conftest.py)
    swap_used = 0
    try:
        proc = subprocess.run(
            ["sysctl", "vm.swapusage"],
            capture_output=True, text=True, timeout=1, env=c_env,
        )
        if proc.returncode == 0:
            # Shape: "vm.swapusage: total = 0.00M used = 0.00M free = 0.00M (encrypted)"
            pieces = proc.stdout.split("used = ")
            if len(pieces) > 1:
                token = pieces[1].split()[0]
                # The size carries an M or G suffix; anything else is ignored.
                if token.endswith("G"):
                    swap_used = int(float(token[:-1]) * 1024)
                elif token.endswith("M"):
                    swap_used = int(float(token[:-1]))
    except Exception:
        pass

    # Free RAM via vm_stat (same logic as conftest.py)
    free_gb = 0
    try:
        proc = subprocess.run(
            ["vm_stat"],
            capture_output=True, text=True, timeout=1, env=c_env,
        )
        if proc.returncode == 0:
            page_bytes = parse_vm_stat_page_size(proc.stdout)
            # Shape: "Pages free: 12345."
            free_line = next(
                (ln for ln in proc.stdout.splitlines() if "Pages free:" in ln),
                None,
            )
            if free_line is not None:
                free_pages = int(free_line.split(":")[1].strip().rstrip("."))
                free_gb = round(free_pages * page_bytes / (1024**3), 2)
    except Exception:
        pass

    snapshot = {
        "ram_free_gb": free_gb,
        "ram_used_gb": 0,  # Not available from vm_stat alone
        "ram_percent": 0,
        "swap_used_mb": swap_used,
        "swap_percent": 0,
        "memory_pressure": pressure_level,  # kern.memorystatus_vm_pressure_level
        "vm_pressure": gate_pressure,  # vm.memory_pressure (used by Memory Gates)
    }
    # CPU metrics are gathered before GPU metrics, then merged in that order.
    snapshot.update(get_cpu_load())
    snapshot.update(get_gpu_usage())
    return snapshot
|
|
|
|
|
|
class MemoryMonitor:
    """Background memory sampler.

    A daemon thread repeatedly takes a sample, stamps it with ``ts`` (epoch
    seconds) and ``elapsed_s`` (seconds since start), and appends it to
    ``samples``. ``stop()`` ends the loop and returns aggregate statistics.

    Usage:
        monitor = MemoryMonitor(interval_ms=200)
        monitor.start()
        # ... do work ...
        summary = monitor.stop()

    Args:
        interval_ms: Pause between two samples, in milliseconds.
        sampler: Optional zero-argument callable returning one sample dict
            (must contain "ram_free_gb" and "swap_used_mb"). Defaults to the
            module-level get_memory_sample().
    """

    # Upper bound on how long stop() waits for a sample that is in flight.
    # One sample chains several subprocess calls with 1-2 s timeouts each, so
    # a shorter wait can summarise while the worker is still appending.
    _JOIN_TIMEOUT_S = 10.0

    def __init__(self, interval_ms: int = 200, sampler=None):
        self.interval = interval_ms / 1000
        self.samples: list[dict] = []
        self.running = False
        self.thread: Optional[threading.Thread] = None
        self.start_time: float = 0
        self._sampler = sampler
        # Lets stop() wake the worker immediately instead of waiting out a sleep.
        self._stop_event = threading.Event()

    def start(self):
        """Start background sampling (no-op if sampling is already active)."""
        if self.running and self.thread is not None and self.thread.is_alive():
            return
        self.running = True
        self.samples = []
        self.start_time = time.time()
        # Fresh event per run so a lingering old worker keeps seeing "stopped".
        self._stop_event = threading.Event()
        self.thread = threading.Thread(target=self._sample_loop, daemon=True)
        self.thread.start()

    def stop(self) -> dict:
        """Stop sampling and return summary.

        Returns:
            ``{"error": "No samples collected"}`` when nothing was sampled,
            otherwise a dict with duration, sample count, interval and
            min/max/avg statistics for RAM, swap, CPU load and GPU usage.
        """
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=self._JOIN_TIMEOUT_S)

        # Work on a snapshot so every statistic sees the same set of samples.
        collected = list(self.samples)
        if not collected:
            return {"error": "No samples collected"}

        count = len(collected)

        def column(key: str) -> list:
            """Values of an optional metric, 0 where a sample lacks it."""
            return [s.get(key, 0) for s in collected]

        ram_values = [s["ram_free_gb"] for s in collected]
        swap_values = [s["swap_used_mb"] for s in collected]
        load_values = column("load_1")
        gpu_device_values = column("gpu_device_util")
        gpu_renderer_values = column("gpu_renderer_util")

        return {
            "duration_s": round(time.time() - self.start_time, 2),
            "samples": count,
            "interval_ms": int(self.interval * 1000),
            "ram_free_min_gb": min(ram_values),
            "ram_free_max_gb": max(ram_values),
            "ram_free_avg_gb": round(sum(ram_values) / count, 2),
            "swap_max_mb": max(swap_values),
            "swap_avg_mb": round(sum(swap_values) / count, 1),
            "load_max": round(max(load_values), 2),
            "load_avg": round(sum(load_values) / count, 2),
            "cpu_user_max": round(max(column("cpu_user")), 1),
            "cpu_sys_max": round(max(column("cpu_sys")), 1),
            "gpu_device_max": round(max(gpu_device_values), 1),
            "gpu_device_avg": round(sum(gpu_device_values) / count, 1),
            "gpu_renderer_max": round(max(gpu_renderer_values), 1),
            "gpu_renderer_avg": round(sum(gpu_renderer_values) / count, 1),
        }

    def get_samples(self) -> list[dict]:
        """Get all collected samples (a shallow copy of the internal list)."""
        return self.samples.copy()

    def _sample_loop(self):
        """Background sampling loop; runs until stop() is called."""
        # Bind per-run state once so a later start() cannot redirect this worker.
        stop_event = self._stop_event
        sink = self.samples
        # Resolve the default lazily so the class does not need it at import time.
        take_sample = self._sampler if self._sampler is not None else get_memory_sample

        while self.running and not stop_event.is_set():
            sample = take_sample()
            now = time.time()
            sample["ts"] = round(now, 3)
            sample["elapsed_s"] = round(now - self.start_time, 2)
            sink.append(sample)
            # Interruptible pause: returns early as soon as stop() sets the event.
            stop_event.wait(self.interval)
|
|
|
|
|
|
def run_with_monitoring(
    command: list[str],
    interval_ms: int = 200,
    output_file: Optional[Path] = None,
    verbose: bool = False
) -> dict:
    """Run a command while monitoring memory.

    Starts a MemoryMonitor, runs *command* to completion, stops the monitor,
    prints a summary and optionally writes samples plus summary as JSONL.

    Args:
        command: Command and arguments to run
        interval_ms: Sampling interval in milliseconds
        output_file: Optional JSONL output file (one sample per line, the
            summary wrapped as ``{"summary": ...}`` on the last line)
        verbose: Accepted for CLI compatibility; this function does not
            currently read it.

    Returns:
        Summary dict with memory statistics plus ``exit_code``, ``command``
        and ``timestamp``. If no sample was collected, the dict carries an
        ``error`` key instead of the statistics.
    """
    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Starting memory monitor (interval: {interval_ms}ms)")
    print(f"Running: {' '.join(command)}")
    print("-" * 60)

    monitor.start()

    # Run subprocess
    try:
        exit_code = subprocess.run(command).returncode
    except KeyboardInterrupt:
        exit_code = 130  # conventional 128 + SIGINT
        print("\nInterrupted")
    except Exception as e:
        exit_code = 1
        print(f"\nError: {e}")

    summary = monitor.stop()
    summary["exit_code"] = exit_code
    summary["command"] = " ".join(command)
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    print("-" * 60)
    print("Memory Monitor Summary:")
    if "error" in summary:
        # No statistics exist in this case; indexing them would raise KeyError
        # and hide the child's exit code.
        print(f" Warning: {summary['error']}")
    else:
        print(f" Duration: {summary['duration_s']:.1f}s ({summary['samples']} samples)")
        print(f" RAM free: {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
        print(f" Swap peak: {summary['swap_max_mb']:.1f} MB")
        print(f" CPU load: max {summary.get('load_max', 0):.1f}, avg {summary.get('load_avg', 0):.1f}")
        print(f" CPU user/sys: max {summary.get('cpu_user_max', 0):.0f}% / {summary.get('cpu_sys_max', 0):.0f}%")
        print(f" GPU device: max {summary.get('gpu_device_max', 0):.0f}%, avg {summary.get('gpu_device_avg', 0):.0f}%")
        print(f" GPU renderer: max {summary.get('gpu_renderer_max', 0):.0f}%, avg {summary.get('gpu_renderer_avg', 0):.0f}%")
    print(f" Exit code: {exit_code}")

    # Write output
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            # Write samples
            for sample in monitor.get_samples():
                f.write(json.dumps(sample) + "\n")
            # Write summary as last line
            f.write(json.dumps({"summary": summary}) + "\n")
        print(f" Output: {output_file}")

    return summary
|
|
|
|
|
|
def monitor_only(
    duration_s: float,
    interval_ms: int = 200,
    output_file: Optional[Path] = None
) -> dict:
    """Monitor memory for a fixed duration (no subprocess).

    Starts a MemoryMonitor, sleeps for *duration_s* (Ctrl-C ends the wait
    early), stops the monitor, prints a summary and optionally writes samples
    plus summary as JSONL.

    Args:
        duration_s: How long to monitor
        interval_ms: Sampling interval in milliseconds
        output_file: Optional JSONL output file (one sample per line, the
            summary wrapped as ``{"summary": ...}`` on the last line)

    Returns:
        Summary dict with memory statistics plus ``timestamp``. If no sample
        was collected, the dict carries an ``error`` key instead of the
        statistics.
    """
    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Monitoring memory for {duration_s}s (interval: {interval_ms}ms)")
    print("-" * 60)

    monitor.start()

    try:
        time.sleep(duration_s)
    except KeyboardInterrupt:
        print("\nInterrupted")

    summary = monitor.stop()
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    print("-" * 60)
    print("Memory Monitor Summary:")
    if "error" in summary:
        # No statistics exist in this case; indexing them would raise KeyError.
        print(f" Warning: {summary['error']}")
    else:
        print(f" Duration: {summary['duration_s']:.1f}s ({summary['samples']} samples)")
        print(f" RAM free: {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
        print(f" Swap peak: {summary['swap_max_mb']:.1f} MB")
        print(f" CPU load: max {summary.get('load_max', 0):.1f}, avg {summary.get('load_avg', 0):.1f}")
        print(f" CPU user/sys: max {summary.get('cpu_user_max', 0):.0f}% / {summary.get('cpu_sys_max', 0):.0f}%")
        print(f" GPU device: max {summary.get('gpu_device_max', 0):.0f}%, avg {summary.get('gpu_device_avg', 0):.0f}%")
        print(f" GPU renderer: max {summary.get('gpu_renderer_max', 0):.0f}%, avg {summary.get('gpu_renderer_avg', 0):.0f}%")

    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            for sample in monitor.get_samples():
                f.write(json.dumps(sample) + "\n")
            f.write(json.dumps({"summary": summary}) + "\n")
        print(f" Output: {output_file}")

    return summary
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Modes, in order of precedence:
    - ``--duration N``: monitor for N seconds without running a subprocess
      (a command given as well is ignored).
    - ``-- command ...``: run the command under monitoring and exit with its
      exit code.
    - neither: print help plus examples and exit with status 1.

    Invalid numeric options (negative interval, non-positive duration) are
    rejected through argparse's usage error, which exits with status 2.
    """
    parser = argparse.ArgumentParser(
        description="Monitor memory while running a command",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "--interval", "-i",
        type=int,
        default=200,
        help="Sampling interval in milliseconds (default: 200)"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output JSONL file for samples and summary"
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        help="Monitor for fixed duration (seconds), no subprocess"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print samples as they're collected"
    )
    parser.add_argument(
        "command",
        nargs="*",
        help="Command to run (after --)"
    )

    args = parser.parse_args()

    # A negative interval would make the sampler's pause raise inside the
    # background thread; fail fast with a usage error instead.
    if args.interval < 0:
        parser.error("--interval must be >= 0 milliseconds")
    # Compare against None: a plain truthiness test silently ignores
    # "--duration 0", and a negative value would crash in time.sleep().
    if args.duration is not None and args.duration <= 0:
        parser.error("--duration must be greater than 0 seconds")

    if args.duration is not None:
        # Monitor-only mode
        summary = monitor_only(
            duration_s=args.duration,
            interval_ms=args.interval,
            output_file=args.output
        )
    elif args.command:
        # Run command with monitoring
        summary = run_with_monitoring(
            command=args.command,
            interval_ms=args.interval,
            output_file=args.output,
            verbose=args.verbose
        )
        sys.exit(summary.get("exit_code", 0))
    else:
        parser.print_help()
        print("\nExamples:")
        print(" python benchmarks/tools/memmon.py -- pytest -m live_e2e")
        print(" python benchmarks/tools/memmon.py --duration 10 --output mem.jsonl")
        sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|