# mlx-knife/benchmarks/tools/memmon.py

#!/usr/bin/env python3
"""Memory Monitor - Standalone tool for tracking memory, CPU, and GPU during subprocess execution.

Samples RAM, swap, memory pressure, CPU load/usage, and GPU utilization while running any command.
Outputs JSONL with per-sample data and final summary.

Metrics tracked:
- RAM: free GB, memory pressure (kern.memorystatus_vm_pressure_level), vm_pressure (vm.memory_pressure)
- Swap: used MB
- CPU: load average (1/5/15 min), user/sys/idle %
- GPU: Device/Renderer/Tiler utilization % (via ioreg PerformanceStatistics, no sudo required)

Usage:
    # Basic usage
    python benchmarks/tools/memmon.py -- pytest -m live_e2e tests_2.0/live/

    # With options
    python benchmarks/tools/memmon.py --interval 200 --output memory.jsonl -- pytest -v

    # Just monitor (no subprocess)
    python benchmarks/tools/memmon.py --duration 60 --output memory.jsonl

Platform: macOS + Apple Silicon (MLX requirement)
Dependencies: ZERO - uses native macOS tools (sysctl, vm_stat, top, ioreg)

Future: Will be part of mlxk-benchmark kit.
"""

import argparse
import json
import os
import re
import subprocess
import sys
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional


def parse_vm_stat_page_size(output: str) -> int:
    """Return the page size (in bytes) announced in ``vm_stat`` output.

    The value is read from the "page size of N bytes" phrase. When the
    phrase is absent, 16384 is returned (the Apple Silicon default).

    Same logic as tests_2.0/conftest.py (ADR-013 Phase 0.5).
    """
    found = re.search(r"page size of (\d+) bytes", output)
    # 16384 is the fallback used when no page size is announced.
    return int(found.group(1)) if found else 16384


def get_cpu_load() -> dict:
    """Collect load averages and a single CPU usage reading.

    Load averages (1/5/15 min) come from ``sysctl -n vm.loadavg``; the
    user/sys/idle percentages come from the "CPU usage:" line of one
    ``top`` sample. Each probe is best-effort: if a tool is missing, times
    out, or prints something unparseable, the affected values keep their
    0.0 defaults.

    Returns:
        Dict with keys ``load_1``, ``load_5``, ``load_15`` (rounded to 2
        decimals) and ``cpu_user``, ``cpu_sys``, ``cpu_idle`` (rounded to
        1 decimal).
    """
    # Run the tools under the C locale so numbers use "." as decimal separator.
    c_env = dict(os.environ)
    c_env["LC_ALL"] = "C"

    loads = [0.0, 0.0, 0.0]
    usage = {"user": 0.0, "sys": 0.0, "idle": 0.0}

    # Load average via sysctl; output looks like "{ 2.45 3.12 2.89 }".
    try:
        sysctl = subprocess.run(
            ["sysctl", "-n", "vm.loadavg"],
            capture_output=True, text=True, timeout=1, env=c_env
        )
        if sysctl.returncode == 0:
            fields = sysctl.stdout.strip().strip("{}").split()
            if len(fields) >= 3:
                for idx in range(3):
                    loads[idx] = float(fields[idx])
    except Exception:
        pass

    # CPU usage via a single top sample; the line of interest looks like
    # "CPU usage: 5.26% user, 10.52% sys, 84.21% idle".
    try:
        top = subprocess.run(
            ["top", "-l", "1", "-n", "0", "-s", "0"],
            capture_output=True, text=True, timeout=2, env=c_env
        )
        if top.returncode == 0:
            usage_line = next(
                (ln for ln in top.stdout.splitlines() if "CPU usage:" in ln),
                None,
            )
            if usage_line is not None:
                for field in usage_line.split("CPU usage:")[1].split(","):
                    field = field.strip()
                    # First matching label wins, checked in user/sys/idle order.
                    label = next(
                        (name for name in ("user", "sys", "idle") if name in field),
                        None,
                    )
                    if label is not None:
                        usage[label] = float(field.split("%")[0])
    except Exception:
        pass

    return {
        "load_1": round(loads[0], 2),
        "load_5": round(loads[1], 2),
        "load_15": round(loads[2], 2),
        "cpu_user": round(usage["user"], 1),
        "cpu_sys": round(usage["sys"], 1),
        "cpu_idle": round(usage["idle"], 1),
    }


def get_gpu_usage() -> dict:
    """Read GPU utilization from ioreg's AGXAccelerator PerformanceStatistics.

    Runs ``ioreg -r -c AGXAccelerator -d 2`` (no sudo) and inspects the
    first output line mentioning "PerformanceStatistics", extracting the
    integer values of:
    - "Device Utilization %"
    - "Renderer Utilization %"
    - "Tiler Utilization %"

    The probe is best-effort: on any failure, or when a value is not
    found, the corresponding entry keeps its default.

    Returns:
        Dict with ``gpu_active`` (True only when a device utilization value
        was parsed) and the float percentages ``gpu_device_util``,
        ``gpu_renderer_util`` and ``gpu_tiler_util`` (0.0 by default).
    """
    usage = {
        "gpu_active": False,
        "gpu_device_util": 0.0,  # Overall GPU utilization %
        "gpu_renderer_util": 0.0,  # 3D rendering cores %
        "gpu_tiler_util": 0.0,  # Geometry/tiler cores %
    }
    # Line format:
    # "PerformanceStatistics" = {"Device Utilization %"=5,"Renderer Utilization %"=3,...}
    metric_patterns = (
        ("gpu_device_util", r'"Device Utilization %"=(\d+)'),
        ("gpu_renderer_util", r'"Renderer Utilization %"=(\d+)'),
        ("gpu_tiler_util", r'"Tiler Utilization %"=(\d+)'),
    )

    try:
        ioreg = subprocess.run(
            ["ioreg", "-r", "-c", "AGXAccelerator", "-d", "2"],
            capture_output=True, text=True, timeout=2
        )
        if ioreg.returncode == 0:
            # Only the first PerformanceStatistics line is considered.
            stats_line = next(
                (ln for ln in ioreg.stdout.splitlines() if "PerformanceStatistics" in ln),
                None,
            )
            if stats_line is not None:
                for key, pattern in metric_patterns:
                    hit = re.search(pattern, stats_line)
                    if not hit:
                        continue
                    usage[key] = float(hit.group(1))
                    if key == "gpu_device_util":
                        # A readable device utilization marks the GPU as detected.
                        usage["gpu_active"] = True
    except Exception:
        pass

    return usage


def get_memory_sample() -> dict:
    """Take one snapshot of memory, swap, CPU and GPU state.

    Platform: macOS only (MLX requirement)
    Dependencies: ZERO - shells out to sysctl and vm_stat

    Parsing mirrors tests_2.0/conftest.py (_get_macos_system_health).
    An earlier version relied on psutil.swap_memory(), which was BROKEN
    (reported 0 MB during 65GB of real swap usage).

    Every probe is best-effort: a failing or unparseable tool leaves the
    corresponding value at its default.

    Returns:
        Dict with ``ram_free_gb``, ``swap_used_mb``, ``memory_pressure``,
        ``vm_pressure``, the placeholder fields ``ram_used_gb``,
        ``ram_percent`` and ``swap_percent`` (always 0), plus every key
        returned by get_cpu_load() and get_gpu_usage().
    """
    # C locale keeps number formatting stable (dot as decimal separator).
    c_env = dict(os.environ)
    c_env["LC_ALL"] = "C"

    def run_tool(argv):
        # All memory probes share the same capture settings and 1 s timeout.
        return subprocess.run(
            argv, capture_output=True, text=True, timeout=1, env=c_env
        )

    # kern.memorystatus_vm_pressure_level: 1=NORMAL, 2=WARN, 4=CRITICAL
    pressure_level = 1  # NORMAL unless sysctl says otherwise
    try:
        pressure_level = int(
            run_tool(["sysctl", "-n", "kern.memorystatus_vm_pressure_level"]).stdout.strip()
        )
    except Exception:
        pass

    # vm.memory_pressure (0=NORMAL, 1=WARN, 4=CRITICAL) - used by Memory Gates
    vm_pressure_level = 0  # NORMAL unless sysctl says otherwise
    try:
        vm_pressure_level = int(
            run_tool(["sysctl", "-n", "vm.memory_pressure"]).stdout.strip()
        )
    except Exception:
        pass

    # Swap usage; sysctl prints e.g.
    # "vm.swapusage: total = 0.00M  used = 0.00M  free = 0.00M  (encrypted)"
    swap_used = 0
    try:
        swap_proc = run_tool(["sysctl", "vm.swapusage"])
        if swap_proc.returncode == 0:
            pieces = swap_proc.stdout.split("used = ")
            if len(pieces) > 1:
                token = pieces[1].split()[0]
                # Only G and M suffixes are recognised; anything else leaves 0.
                multiplier = {"G": 1024, "M": 1}.get(token[-1:])
                if multiplier is not None:
                    swap_used = int(float(token[:-1]) * multiplier)
    except Exception:
        pass

    # Free RAM from vm_stat's "Pages free: 12345." line.
    free_gb = 0
    try:
        vm_proc = run_tool(["vm_stat"])
        if vm_proc.returncode == 0:
            bytes_per_page = parse_vm_stat_page_size(vm_proc.stdout)
            free_line = next(
                (ln for ln in vm_proc.stdout.splitlines() if "Pages free:" in ln),
                None,
            )
            if free_line is not None:
                free_pages = int(free_line.split(":")[1].strip().rstrip("."))
                free_gb = round(free_pages * bytes_per_page / (1024**3), 2)
    except Exception:
        pass

    sample = {
        "ram_free_gb": free_gb,
        "ram_used_gb": 0,  # Not available from vm_stat alone
        "ram_percent": 0,
        "swap_used_mb": swap_used,
        "swap_percent": 0,
        "memory_pressure": pressure_level,  # kern.memorystatus_vm_pressure_level
        "vm_pressure": vm_pressure_level,  # vm.memory_pressure (used by Memory Gates)
    }
    # CPU metrics are gathered first, then GPU metrics.
    sample.update(get_cpu_load())
    sample.update(get_gpu_usage())
    return sample


class MemoryMonitor:
    """Background memory sampler.

    A daemon thread repeatedly calls get_memory_sample(), stamps each
    sample with ``ts`` (epoch seconds) and ``elapsed_s`` (seconds since
    start()), and appends it to ``samples``.

    Usage:
        monitor = MemoryMonitor(interval_ms=200)
        monitor.start()
        # ... do work ...
        summary = monitor.stop()
    """

    def __init__(self, interval_ms: int = 200):
        """Create an idle monitor.

        Args:
            interval_ms: Pause between two samples, in milliseconds. The time
                spent taking a sample is not subtracted from this pause.
        """
        self.interval = interval_ms / 1000
        self.samples: list[dict] = []
        self.running = False
        self.thread: Optional[threading.Thread] = None
        self.start_time: float = 0
        # Lets stop() wake the sampler out of its inter-sample pause at once.
        self._stop_event = threading.Event()

    def start(self):
        """Start background sampling, discarding any previous samples."""
        self._stop_event.clear()
        self.running = True
        self.samples = []
        self.start_time = time.time()
        self.thread = threading.Thread(target=self._sample_loop, daemon=True)
        self.thread.start()

    def stop(self) -> dict:
        """Stop sampling and return summary.

        Waits up to one second for the sampler thread to finish. A sample
        that is still in flight after that is not included in the summary.

        Returns:
            ``{"error": "No samples collected"}`` when nothing was sampled,
            otherwise a dict of min/max/avg statistics over the samples.
        """
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=1.0)

        # Work on a snapshot: if join() timed out, the sampler thread may
        # still append one more sample while the statistics are computed.
        collected = list(self.samples)
        if not collected:
            return {"error": "No samples collected"}

        count = len(collected)
        ram_values = [s["ram_free_gb"] for s in collected]
        swap_values = [s["swap_used_mb"] for s in collected]
        load_values = [s.get("load_1", 0) for s in collected]
        cpu_user_values = [s.get("cpu_user", 0) for s in collected]
        cpu_sys_values = [s.get("cpu_sys", 0) for s in collected]
        gpu_device_values = [s.get("gpu_device_util", 0) for s in collected]
        gpu_renderer_values = [s.get("gpu_renderer_util", 0) for s in collected]

        return {
            "duration_s": round(time.time() - self.start_time, 2),
            "samples": count,
            # round(), not int(): interval_ms / 1000 * 1000 can land just
            # below the integer and would otherwise be reported 1 ms low.
            "interval_ms": round(self.interval * 1000),
            "ram_free_min_gb": min(ram_values),
            "ram_free_max_gb": max(ram_values),
            "ram_free_avg_gb": round(sum(ram_values) / count, 2),
            "swap_max_mb": max(swap_values),
            "swap_avg_mb": round(sum(swap_values) / count, 1),
            "load_max": round(max(load_values), 2),
            "load_avg": round(sum(load_values) / count, 2),
            "cpu_user_max": round(max(cpu_user_values), 1),
            "cpu_sys_max": round(max(cpu_sys_values), 1),
            "gpu_device_max": round(max(gpu_device_values), 1),
            "gpu_device_avg": round(sum(gpu_device_values) / count, 1),
            "gpu_renderer_max": round(max(gpu_renderer_values), 1),
            "gpu_renderer_avg": round(sum(gpu_renderer_values) / count, 1),
        }

    def get_samples(self) -> list[dict]:
        """Return a shallow copy of the list of collected samples."""
        return self.samples.copy()

    def _sample_loop(self):
        """Background sampling loop; runs until stop() is called."""
        while self.running:
            sample = get_memory_sample()
            # Stamp both fields from the same clock reading.
            now = time.time()
            sample["ts"] = round(now, 3)
            sample["elapsed_s"] = round(now - self.start_time, 2)
            self.samples.append(sample)
            # Interruptible pause: returns True as soon as stop() sets the event.
            if self._stop_event.wait(self.interval):
                break


def _print_summary(summary: dict, exit_code: Optional[int] = None) -> None:
    """Print the human-readable summary block, tolerating an empty run."""
    print("-" * 60)
    print("Memory Monitor Summary:")
    if "error" in summary:
        # MemoryMonitor.stop() returns only an "error" entry when no sample
        # was collected; there are no statistics to format in that case.
        print(f"  Warning:      {summary['error']}")
    else:
        print(f"  Duration:     {summary['duration_s']:.1f}s ({summary['samples']} samples)")
        print(f"  RAM free:     {summary['ram_free_min_gb']:.1f} - {summary['ram_free_max_gb']:.1f} GB")
        print(f"  Swap peak:    {summary['swap_max_mb']:.1f} MB")
        print(f"  CPU load:     max {summary.get('load_max', 0):.1f}, avg {summary.get('load_avg', 0):.1f}")
        print(f"  CPU user/sys: max {summary.get('cpu_user_max', 0):.0f}% / {summary.get('cpu_sys_max', 0):.0f}%")
        print(f"  GPU device:   max {summary.get('gpu_device_max', 0):.0f}%, avg {summary.get('gpu_device_avg', 0):.0f}%")
        print(f"  GPU renderer: max {summary.get('gpu_renderer_max', 0):.0f}%, avg {summary.get('gpu_renderer_avg', 0):.0f}%")
    if exit_code is not None:
        print(f"  Exit code:    {exit_code}")


def _write_jsonl(output_file: Path, samples: list[dict], summary: dict) -> None:
    """Write one JSON line per sample, then the summary as the last line."""
    with open(output_file, "w", encoding="utf-8") as f:
        for sample in samples:
            f.write(json.dumps(sample) + "\n")
        f.write(json.dumps({"summary": summary}) + "\n")
    print(f"  Output:       {output_file}")


def run_with_monitoring(
    command: list[str],
    interval_ms: int = 200,
    output_file: Optional[Path] = None,
    verbose: bool = False
) -> dict:
    """Run a command while monitoring memory.

    The command runs to completion via subprocess.run() while a
    MemoryMonitor samples in the background. If the run ends before the
    first sample is collected, the summary contains an ``error`` entry
    instead of statistics; this is reported rather than raised.

    Args:
        command: Command and arguments to run
        interval_ms: Sampling interval in milliseconds
        output_file: Optional JSONL output file
        verbose: Accepted for interface compatibility; this function does
            not currently read it.

    Returns:
        Summary dict with memory statistics plus ``exit_code`` (130 after a
        KeyboardInterrupt, 1 if the command could not be run), ``command``
        and a UTC ``timestamp``.
    """
    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Starting memory monitor (interval: {interval_ms}ms)")
    print(f"Running: {' '.join(command)}")
    print("-" * 60)

    monitor.start()

    # Run subprocess
    try:
        result = subprocess.run(command)
        exit_code = result.returncode
    except KeyboardInterrupt:
        exit_code = 130
        print("\nInterrupted")
    except Exception as e:
        exit_code = 1
        print(f"\nError: {e}")

    summary = monitor.stop()
    summary["exit_code"] = exit_code
    summary["command"] = " ".join(command)
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    _print_summary(summary, exit_code=exit_code)

    if output_file:
        _write_jsonl(output_file, monitor.get_samples(), summary)

    return summary


def monitor_only(
    duration_s: float,
    interval_ms: int = 200,
    output_file: Optional[Path] = None
) -> dict:
    """Monitor memory for a fixed duration (no subprocess).

    A KeyboardInterrupt ends the wait early; the samples collected so far
    are still summarized. If no sample was collected, the summary contains
    an ``error`` entry instead of statistics.

    Args:
        duration_s: How long to monitor
        interval_ms: Sampling interval in milliseconds
        output_file: Optional JSONL output file

    Returns:
        Summary dict with memory statistics and a UTC ``timestamp``
    """
    monitor = MemoryMonitor(interval_ms=interval_ms)

    print(f"Monitoring memory for {duration_s}s (interval: {interval_ms}ms)")
    print("-" * 60)

    monitor.start()

    try:
        time.sleep(duration_s)
    except KeyboardInterrupt:
        print("\nInterrupted")

    summary = monitor.stop()
    summary["timestamp"] = datetime.now(timezone.utc).isoformat()

    _print_summary(summary)

    if output_file:
        _write_jsonl(output_file, monitor.get_samples(), summary)

    return summary


def main():
    """Command-line entry point.

    Modes:
    - ``--duration N``: sample for N seconds without running a command.
    - ``-- CMD ...``: run CMD while sampling, then exit with the exit code
      reported in the monitoring summary.
    - neither: print help plus examples and exit with status 1.

    A non-positive ``--interval`` or ``--duration`` is rejected as a usage
    error (argparse exits with status 2).
    """
    parser = argparse.ArgumentParser(
        description="Monitor memory while running a command",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "--interval", "-i",
        type=int,
        default=200,
        help="Sampling interval in milliseconds (default: 200)"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output JSONL file for samples and summary"
    )
    parser.add_argument(
        "--duration", "-d",
        type=float,
        help="Monitor for fixed duration (seconds), no subprocess"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Print samples as they're collected"
    )
    parser.add_argument(
        "command",
        nargs="*",
        help="Command to run (after --)"
    )

    args = parser.parse_args()

    # Reject values that would otherwise fail later inside time.sleep().
    if args.interval <= 0:
        parser.error("--interval must be a positive number of milliseconds")
    if args.duration is not None and args.duration <= 0:
        parser.error("--duration must be a positive number of seconds")

    if args.duration is not None:
        # Monitor-only mode
        if args.command:
            # --duration takes precedence; say so instead of dropping the command silently.
            print(
                "Warning: --duration given, ignoring command: " + " ".join(args.command),
                file=sys.stderr,
            )
        monitor_only(
            duration_s=args.duration,
            interval_ms=args.interval,
            output_file=args.output
        )
    elif args.command:
        # Run command with monitoring
        summary = run_with_monitoring(
            command=args.command,
            interval_ms=args.interval,
            output_file=args.output,
            verbose=args.verbose
        )
        sys.exit(summary.get("exit_code", 0))
    else:
        parser.print_help()
        print("\nExamples:")
        print("  python benchmarks/tools/memmon.py -- pytest -m live_e2e")
        print("  python benchmarks/tools/memmon.py --duration 10 --output mem.jsonl")
        sys.exit(1)


if __name__ == "__main__":
    main()