mirror of
https://github.com/cloudstack-llc/mlx-knife.git
synced 2026-07-01 20:44:14 -04:00
bf7480d042
Major Features: - Audio transcription via mlx-audio backend (Whisper, >10min duration) - OpenAI /v1/audio/transcriptions endpoint - Memory Gate System (Vision: 8GB, Audio: 4GB) - Config-based backend routing (ADR-020) - Benchmark toolchain (memmon/memplot, Schema v0.2.2) Key Fixes: - EuroLLM tokenizer decoding - Vision-model text-only routing regression - Multimodal model context length detection - Memory cleanup bug (mx.metal.clear_cache) - Orphan process bug Test Results: - Unit tests: 647 passed, 11 skipped (Python 3.10-3.12) - wet-umbrella: 171 passed total See CHANGELOG.md for complete details and known issues.
327 lines
12 KiB
Python
327 lines
12 KiB
Python
"""LocalServer context manager for E2E testing (ADR-011).

Provides a clean subprocess-based server lifecycle for testing:
- Starts server with pre-loaded model
- Waits for health check before yielding
- Ensures graceful cleanup on exit
- Memory-aware cleanup: waits for Metal GPU cache release
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import gc
|
|
import os
|
|
import signal
|
|
import sys
|
|
import time
|
|
import subprocess
|
|
from contextlib import contextmanager
|
|
from typing import Optional
|
|
|
|
try:
|
|
import httpx
|
|
except ImportError:
|
|
httpx = None # Will fail at test time with clear error
|
|
|
|
|
|
def _get_available_memory_gb() -> float:
    """Return the immediately usable system memory in GB, as reported by ``vm_stat``.

    Only "Pages free" and "Pages speculative" are summed. "Inactive" pages are
    deliberately left out: per the original author's notes (Session 136 fix),
    the Metal/GPU cache may still hold them even though macOS reports them as
    reclaimable, which made the memory gate report 20+ GB available while a
    model load failed with "Broken pipe".

    Returns:
        (free + speculative pages) * page size, in GiB. ``0.0`` when ``vm_stat``
        cannot be run, exits non-zero, or its output cannot be parsed.
    """
    try:
        completed = subprocess.run(
            ["vm_stat"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if completed.returncode != 0:
            return 0.0

        rows = completed.stdout.split("\n")
        header = rows[0]

        # Fallback page size when the header does not state one
        # (16384 is the Apple Silicon default according to the original code).
        bytes_per_page = 16384
        if "page size of" in header:
            try:
                bytes_per_page = int(header.split("page size of")[1].split()[0])
            except (ValueError, IndexError):
                pass

        # Label -> page count. Insertion order matters: a row is matched
        # against "Pages free:" first and only then "Pages speculative:".
        page_counts = {"Pages free:": 0, "Pages speculative:": 0}
        for row in rows:
            for label in page_counts:
                if label in row:
                    # Values look like "  12345." - drop padding and the trailing dot.
                    page_counts[label] = int(row.split(":")[1].strip().rstrip("."))
                    break

        # Available = free + speculative only (NOT inactive - may be held by GPU cache)
        return sum(page_counts.values()) * bytes_per_page / (1024**3)
    except Exception:
        # Best-effort probe: any failure means "unknown", reported as 0.0.
        return 0.0
|
|
|
|
|
|
def _get_memory_pressure() -> int:
    """Read the ``vm.memory_pressure`` sysctl and return it as an integer.

    Returns:
        The integer printed by ``sysctl -n vm.memory_pressure``. The original
        author documents the values as:
            0 = NORMAL (system relaxed, safe to load models)
            1 = WARN (system under some pressure)
            4 = CRITICAL (system under severe pressure)
        ``-1`` when the value cannot be determined (command missing, non-zero
        exit status, timeout, or non-numeric output).
    """
    try:
        probe = subprocess.run(
            ["sysctl", "-n", "vm.memory_pressure"],
            capture_output=True,
            text=True,
            timeout=2,
        )
    except Exception:
        # sysctl unavailable or timed out
        return -1

    if probe.returncode != 0:
        return -1

    try:
        return int(probe.stdout.strip())
    except Exception:
        # Output was not a plain integer
        return -1
|
|
|
|
|
|
def _wait_for_memory_release(
    min_free_gb: float = 20.0,
    timeout_seconds: float = 30.0,
    poll_interval: float = 1.0,
) -> bool:
    """Wait for system memory to be released after server shutdown.

    Per the original author's notes, the Metal GPU cache is released
    asynchronously after a server exits, so this function polls until enough
    memory is free before allowing the next test to start.

    Uses TWO indicators for robust detection (Session 136 finding):
    1. vm.memory_pressure == 0 (macOS kernel says system is relaxed)
    2. Available memory >= min_free_gb (enough free+speculative pages)

    An undeterminable pressure value (-1) is treated like "not relaxed": the
    available-memory check is only made when the pressure reads exactly 0.

    Args:
        min_free_gb: Minimum free memory required (default 20 GB for vision models)
        timeout_seconds: Maximum wait time (default 30s for GPU cache release).
            A value <= 0 returns False without probing.
        poll_interval: Time between memory checks (default 1s)

    Returns:
        True if memory threshold reached, False if timeout
    """
    # Monotonic clock: a wall-clock adjustment (NTP sync, manual change) must
    # neither extend nor cut short the wait.
    deadline = time.monotonic() + timeout_seconds

    while time.monotonic() < deadline:
        # Check memory pressure first (fast sysctl call)
        if _get_memory_pressure() == 0:  # NORMAL - system is relaxed
            if _get_available_memory_gb() >= min_free_gb:
                return True

        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(poll_interval, remaining))

    return False
|
|
|
|
# Optional: RAM monitoring for debugging (requires psutil)
|
|
# Uncomment to enable RAM logging during test runs
|
|
# try:
|
|
# import psutil
|
|
# def log_ram_status(stage: str) -> None:
|
|
# """Log current RAM status (non-blocking)."""
|
|
# mem = psutil.virtual_memory()
|
|
# free_gb = mem.available / (1024**3)
|
|
# total_gb = mem.total / (1024**3)
|
|
# print(f"[RAM-{stage}] Free: {free_gb:.1f}GB / {total_gb:.1f}GB ({mem.percent:.1f}% used)")
|
|
# except ImportError:
|
|
# def log_ram_status(stage: str) -> None:
|
|
# pass # psutil not installed, skip logging
|
|
|
|
|
|
@contextmanager
def LocalServer(
    model: str,
    port: int = 8765,
    timeout: int = 60,
    log_level: str = "warning"
):
    """Start a local mlx-knife server for E2E testing.

    Context manager that:
    1. Launches server subprocess (own session / process group) with pre-loaded model
    2. Waits for /health endpoint to respond (up to timeout); stops waiting
       early if the server process has already exited
    3. Yields server URL for testing
    4. Ensures graceful shutdown of the whole process group (SIGTERM → SIGKILL fallback)
    5. Waits at the memory gate (best-effort) before returning

    If startup fails or is interrupted (e.g. Ctrl-C while waiting for /health),
    the server's process group is killed before the exception propagates.

    Args:
        model: Model ID to pre-load (e.g., "mlx-community/Llama-3.2-3B-Instruct-4bit")
        port: Server port (default 8765, non-standard to avoid conflicts)
        timeout: Startup timeout in seconds (default 60s for model loading)
        log_level: Server log level (default "warning" to reduce noise)

    Yields:
        server_url: str like "http://127.0.0.1:8765"

    Raises:
        TimeoutError: If server fails to become healthy - either the timeout
            elapsed or the server process exited during startup (same exception
            type in both cases so existing handlers keep working). The message
            includes the last health-check error and captured stdout/stderr.
        RuntimeError: If httpx not installed

    Example:
        >>> with LocalServer("mlx-community/Llama-3.2-3B-Instruct-4bit") as url:
        ...     response = httpx.post(f"{url}/v1/chat/completions", json={...})
        ...     assert response.status_code == 200
    """
    if httpx is None:
        raise RuntimeError("httpx required for E2E tests (pip install httpx)")

    # Start server subprocess
    # Pass environment variables (including HF_HOME) to subprocess
    env = os.environ.copy()

    # Start server_base directly (NOT via CLI) to avoid double start_new_session orphan bug
    # The CLI uses start_new_session=True in serve.py, which creates a separate process group
    # that won't receive our SIGTERM. By starting server_base directly, we control the session.
    env["MLXK2_HOST"] = "127.0.0.1"
    env["MLXK2_PORT"] = str(port)
    env["MLXK2_LOG_LEVEL"] = log_level
    env["MLXK2_PRELOAD_MODEL"] = model

    # NOTE(review): stdout/stderr are pipes that are only drained at shutdown.
    # A server that writes more than the OS pipe buffer while a test runs could
    # block on write - consider redirecting to temp files if that is observed.
    proc = subprocess.Popen(
        [
            sys.executable, "-m", "mlxk2.core.server_base",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
        start_new_session=True  # Create process group for robust cleanup
    )

    # start_new_session=True runs setsid() in the child, so the child leads a
    # new process group whose ID equals its PID. Remember it now:
    # os.getpgid(proc.pid) stops working once the leader has been reaped, and
    # could resolve to an unrelated group if the PID were reused.
    pgid = proc.pid

    def _signal_server(sig: int) -> bool:
        """Signal the server's process group; fall back to the leader alone.

        Returns True when the signal went to the whole group, False when the
        group was gone / not accessible and only the leader was signalled.
        """
        try:
            os.killpg(pgid, sig)
        except (ProcessLookupError, PermissionError):
            # Group already gone or not accessible - signal the process directly
            proc.send_signal(sig)
            return False
        return True

    server_url = f"http://127.0.0.1:{port}"

    # Wait for server health check (monotonic clock: immune to wall-clock jumps)
    start_time = time.monotonic()
    last_error: Optional[Exception] = None
    exit_code: Optional[int] = None
    healthy = False

    try:
        while time.monotonic() - start_time < timeout:
            try:
                response = httpx.get(f"{server_url}/health", timeout=2.0)
                if response.status_code == 200:
                    # Server ready
                    healthy = True
                    break
            except Exception as e:
                last_error = e

            # Fail fast: no point polling a server that has already exited
            exit_code = proc.poll()
            if exit_code is not None:
                break

            time.sleep(0.5)
    except BaseException:
        # Interrupted while waiting (e.g. KeyboardInterrupt). The server lives
        # in its own session, so nothing else would stop it - kill it here.
        try:
            _signal_server(signal.SIGKILL)
            proc.communicate(timeout=5)
        except Exception:
            pass  # Best-effort
        raise

    if not healthy:
        # Startup failed: kill the whole group (not just the leader) and report
        try:
            _signal_server(signal.SIGKILL)
        except Exception:
            pass  # Best-effort
        try:
            stdout, stderr = proc.communicate(timeout=10)
        except Exception:
            stdout, stderr = "<unavailable>", "<unavailable>"

        if exit_code is not None:
            headline = f"Server exited during startup (exit code {exit_code})"
        else:
            headline = f"Server failed to start within {timeout}s"

        error_msg = (
            f"{headline}\n"
            f"Last error: {last_error}\n"
            f"--- STDOUT ---\n{stdout}\n"
            f"--- STDERR ---\n{stderr}"
        )
        raise TimeoutError(error_msg)

    try:
        yield server_url
    finally:
        # Robust cleanup with process group verification
        # This prevents zombie accumulation from failed cleanups
        cleanup_success = False

        try:
            # Step 1: Graceful shutdown (SIGTERM to process group)
            _signal_server(signal.SIGTERM)

            # Step 2: Wait for graceful cleanup (most models finish in 5-10s)
            try:
                proc.wait(timeout=10)
                cleanup_success = True
            except subprocess.TimeoutExpired:
                # Graceful shutdown failed - escalate to SIGKILL
                print(f"\n⚠️ WARNING: Server cleanup timeout after 10s (PID {proc.pid}, model: {model})")
                print(" Escalating to SIGKILL...")

        except Exception as e:
            print(f"\n⚠️ WARNING: Error during graceful cleanup: {e}")

        # Step 3: Forceful shutdown if graceful failed
        if not cleanup_success:
            try:
                # Kill process group with SIGKILL (or the leader alone as fallback)
                if _signal_server(signal.SIGKILL):
                    print(f" Process group {pgid} killed with SIGKILL")
                else:
                    print(f" Process {proc.pid} killed with SIGKILL")

                # Wait for death
                proc.wait(timeout=5)
                cleanup_success = True
            except subprocess.TimeoutExpired:
                # CRITICAL: Process refuses to die even with SIGKILL
                print(f"⚠️ CRITICAL: Process {proc.pid} refuses to die even with SIGKILL!")
            except (ProcessLookupError, PermissionError):
                # Process already gone - cleanup succeeded
                cleanup_success = True
            except Exception as e:
                print(f"⚠️ WARNING: Forceful cleanup failed: {e}")

        # Step 4: Drain pipes (prevents zombies from pipe backpressure)
        try:
            proc.communicate(timeout=5)
        except Exception:
            pass  # Best-effort

        # Step 5: Final verification - ensure no zombies leaked
        try:
            if proc.poll() is None:
                # Still alive - one last kill attempt
                print(f"⚠️ WARNING: Process {proc.pid} still alive after cleanup - final SIGKILL")
                proc.kill()
                proc.wait(timeout=2)
        except Exception:
            pass  # Best-effort

        # Step 6: Verify process group is dead (prevent zombie accumulation)
        # Uses the group id captured at launch; looking it up again via
        # os.getpgid(proc.pid) would fail here because the leader is reaped.
        try:
            # Check if any processes in the group are still alive
            result = subprocess.run(
                ["pgrep", "-g", str(pgid)],
                capture_output=True,
                timeout=2,
            )
            if result.returncode == 0:
                # Group still has members - kill them all
                subprocess.run(["pkill", "-9", "-g", str(pgid)], timeout=2)
                print(f"⚠️ WARNING: Killed remaining processes in group {pgid}")
        except (ProcessLookupError, PermissionError, subprocess.TimeoutExpired):
            pass  # Group gone or not accessible - that's fine
        except Exception:
            pass  # Best-effort (e.g. pgrep/pkill not installed)

        # Step 7: Explicit garbage collection + Metal memory release
        gc.collect()

        # Memory Gate: Wait for memory release (robust scheduling)
        # Metal GPU cache is shared across processes - wait until enough is free
        # 8 GB threshold validated via wet-memmon (avg 10.5 GB free, Firefox running)
        if not _wait_for_memory_release(min_free_gb=8.0, timeout_seconds=10.0):
            free_gb = _get_available_memory_gb()
            print(f"⚠️ Memory release timeout: {free_gb:.1f} GB available (wanted 8 GB)")
            # Continue anyway - test may still succeed or fail with clear OOM error
|