Files
sec-mcp/gpu-diagnose.sh

178 lines
5.4 KiB
Bash

#!/usr/bin/env bash
#
# gpu-diagnose: sec-mcp ROCm/Ollama diagnostic.
#
# Runs, in order:
#   - host checks: podman availability, PCI adapters, /dev/kfd and /dev/dri;
#   - a scan of the host kernel log (dmesg) for AMDGPU/KFD failure signatures;
#   - checks on the running Ollama container (located through podman) and
#     on its recent logs.
# Results are printed as [OK]/[INFO]/[WARN]/[FAIL] lines on stdout; the script
# exits 1 when at least one critical check failed.
#
# Strict mode: abort on unhandled command failures (-e), on use of unset
# variables (-u) and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Print a success line on stdout: "[OK] <message>".
#   $1 - message text
ok() {
  printf '[OK] %s\n' "$1"
}
# Print an informational line on stdout: "[INFO] <message>".
#   $1 - message text
info() {
  printf '[INFO] %s\n' "$1"
}
# Print a warning line on stdout: "[WARN] <message>".
#   $1 - message text
warn() {
  printf '[WARN] %s\n' "$1"
}
# Print a failure line on stdout: "[FAIL] <message>".
# Only reports; callers set has_fail themselves.
#   $1 - message text
fail() {
  printf '[FAIL] %s\n' "$1"
}
# Sticky failure flag: starts at 0, is set to 1 by every critical check that
# fails, and is read at the end of the script to choose the exit status.
has_fail=0
# Start a new report section: an empty line followed by "== <title> ==".
#   $1 - section title
section() {
  printf '\n== %s ==\n' "$1"
}
# Report whether a required command is available on PATH.
#   $1 - command name to look up
# Prints an [OK] line when found. Otherwise prints a [FAIL] line and raises
# the global has_fail flag. Returns 0 in both cases so the caller keeps going.
require_cmd() {
  local tool="$1"

  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "'$tool' is not installed"
    has_fail=1
    return 0
  fi

  ok "'$tool' is installed"
}
# Translate a group id into a group name through getent.
#   $1 - group id to look up
# Prints the first colon-separated field of each line getent returns.
# Prints nothing when getent is unavailable or the lookup fails.
# Always returns 0, so a failed lookup never trips `set -e` in the caller.
resolve_group_name() {
  local gid="$1"
  local group_name _rest

  # No getent on this system: nothing to resolve.
  command -v getent >/dev/null 2>&1 || return 0

  getent group "$gid" | while IFS=: read -r group_name _rest; do
    printf '%s\n' "$group_name"
  done || true
}
# Report presence and ownership of a device node or directory.
#   $1 - path to inspect (the script passes /dev/kfd and /dev/dri)
# When the path exists: prints an [OK] line with owner (user:group), octal
# mode and numeric gid, plus " group=<name>" when resolve_group_name yields
# a name. When it does not exist: prints a [FAIL] line and raises has_fail.
# Metadata comes from `stat -c <format>`.
show_device_owner() {
  local device="$1"
  local group_id ownership perms group_label details

  # Guard clause: a missing node is a critical failure, nothing else to show.
  if [ ! -e "$device" ]; then
    fail "$device missing"
    has_fail=1
    return 0
  fi

  ownership="$(stat -c '%U:%G' "$device")"
  perms="$(stat -c '%a' "$device")"
  group_id="$(stat -c '%g' "$device")"
  group_label="$(resolve_group_name "$group_id")"

  details="owner=$ownership mode=$perms gid=$group_id"
  # The resolved group name is optional; append it only when non-empty.
  if [ -n "$group_label" ]; then
    details="$details group=$group_label"
  fi

  ok "$device present ($details)"
}
# Locate the Ollama container among the containers `podman ps` lists.
# Probes in priority order:
#   1. container name filter  sec-mcp_ollama_1
#   2. image (ancestor) filter ollama/ollama:rocm
# Prints the first ID returned by the first probe that yields one and
# returns 0. Returns 1 without output when no probe yields an ID.
find_ollama_container() {
  local probe found

  for probe in name=sec-mcp_ollama_1 ancestor=ollama/ollama:rocm; do
    # `|| true`: a failing podman call counts as "nothing found".
    found="$(podman ps --filter "$probe" --format '{{.ID}}' | head -n1 || true)"
    if [ -n "$found" ]; then
      echo "$found"
      return 0
    fi
  done

  return 1
}
echo "[gpu-diagnose] sec-mcp ROCm/Ollama diagnostic"

# --- Host checks: required tooling, PCI adapters, GPU device nodes ---------
section "Host checks"
require_cmd podman

# PCI probe is best effort: it is skipped with a warning when lspci is absent.
if command -v lspci >/dev/null 2>&1; then
  info "PCI display adapters:"
  # "ati" is matched only as a standalone token. As a bare case-insensitive
  # substring it also matches "Corporation" and "Communication", which lets
  # almost every PCI device through the filter.
  # grep exits 1 when nothing matches; that is not an error here.
  lspci | grep -Ei 'vga|3d|display|amd|(^|[^[:alpha:]])ati([^[:alpha:]]|$)' || true
else
  warn "lspci not available; skipping PCI adapter probe"
fi

# Both nodes are reported through show_device_owner, which raises has_fail
# when a node is missing.
show_device_owner "/dev/kfd"
show_device_owner "/dev/dri"

if [ -d /dev/dri ]; then
  info "DRM nodes:"
  # Listing is informational only; never abort on an ls failure.
  ls -l /dev/dri || true
fi
# --- Host kernel log analysis ------------------------------------------------
# Scans dmesg for GPU-related lines and for known AMDGPU/KFD failure
# signatures. A signature hit is a critical failure (has_fail=1).
section "Host kernel log analysis"
if command -v dmesg >/dev/null 2>&1; then
  # Keep output and exit status apart; a failing dmesg must not abort the
  # script under `set -e`.
  dmesg_status=0
  dmesg_raw="$(dmesg 2>&1)" || dmesg_status=$?

  # All matching below feeds grep through a here-string instead of
  # `echo ... | grep -q`. With `set -o pipefail`, grep -q exiting at the first
  # match can kill the still-writing echo with SIGPIPE on large input, making
  # the pipeline non-zero and turning a real match into "no match".
  # NOTE(review): the 'mes' and 'psp' alternatives are bare substrings and
  # also match unrelated words; confirm whether that breadth is intended.
  dmesg_tail="$(grep -Ei 'amdgpu|kfd|drm|gfx|mes|psp|sdma' <<<"$dmesg_raw" | tail -n 200 || true)"

  # Only treat the text as an access error when dmesg itself failed;
  # otherwise an ordinary kernel line containing "Operation not permitted"
  # would trigger this warning on a fully readable log.
  if [ "$dmesg_status" -ne 0 ] \
    && grep -Eqi 'Operation not permitted|permission denied|read kernel buffer failed' <<<"$dmesg_raw"; then
    warn "dmesg access is restricted in this environment; run this script on the Proxmox host (or with sufficient privileges) for full kernel diagnostics"
  fi

  if [ -z "$dmesg_tail" ]; then
    warn "No recent amdgpu/kfd/drm lines found in dmesg"
  else
    info "Recent amdgpu/kfd/drm lines detected"
  fi

  # Known signatures of GPU resets, lost devices and failed queue/firmware
  # operations.
  if grep -Eqi 'MES failed to respond|GPU reset begin|device lost from bus|ASIC reset failed|Failed to quiesce KFD|evicting device resources failed|failed to suspend gangs|PSP resume failed|failed to load ucode SDMA_CTX|amdgpu_device_ip_resume failed|resume of IP block <psp> failed' <<<"$dmesg_tail"; then
    fail "Host dmesg shows critical AMDGPU/KFD instability (GPU reset or queue failures)"
    echo " This is a host-side driver/runtime issue and will force Ollama CPU fallback."
    has_fail=1
  fi
else
  warn "dmesg command unavailable; skipping kernel log analysis"
fi
# --- Container checks: podman health and Ollama container discovery ---------
section "Container checks"
if ! podman info >/dev/null 2>&1; then
  fail "podman runtime is not healthy (podman info failed)"
  has_fail=1
fi

# ollama_cid is assigned even when the lookup fails (it is then empty), so
# later code can test it safely.
if ollama_cid="$(find_ollama_container)"; then
  # The name is cosmetic. If the container disappears between `podman ps` and
  # `podman inspect`, a failing inspect must not abort the whole diagnostic
  # under `set -e`/pipefail, so the lookup is best effort with a fallback label.
  ollama_name="$(podman inspect --format '{{.Name}}' "$ollama_cid" | sed 's#^/##' || true)"
  ok "found running ollama container: ${ollama_name:-unknown} ($ollama_cid)"
else
  fail "no running ollama container found (expected sec-mcp_ollama_1 or ollama/ollama:rocm)"
  echo " Start it with: podman compose up -d ollama"
  has_fail=1
fi
# --- Container inspection and Ollama log analysis ----------------------------
# Runs only when an Ollama container ID was found earlier.
if [ -n "${ollama_cid:-}" ]; then
  # The exec probes below are informational; each is best effort (`|| true`)
  # so one failing probe does not stop the remaining diagnostics.
  info "Container device visibility:"
  podman exec "$ollama_cid" sh -lc 'ls -l /dev/kfd /dev/dri 2>/dev/null; [ -d /dev/dri ] && ls -l /dev/dri || true' || true

  info "Container runtime identity:"
  podman exec "$ollama_cid" sh -lc 'id' || true

  info "Container ROCm-related env vars:"
  podman exec "$ollama_cid" sh -lc 'env | grep -E "OLLAMA_LLM_LIBRARY|HSA_|HIP_|ROCR_" || true' || true

  info "ROCm backend library presence in container:"
  podman exec "$ollama_cid" sh -lc 'ls /usr/lib/ollama/libggml-rocm* 2>/dev/null || echo "(no libggml-rocm files found)"' || true

  section "Ollama log analysis"
  # Last 300 log lines, stdout and stderr merged.
  logs="$(podman logs --tail 300 "$ollama_cid" 2>&1 || true)"

  # Every check below feeds grep through a here-string instead of
  # `echo "$logs" | grep -q`. With `set -o pipefail`, grep -q exiting at the
  # first match can kill the still-writing echo with SIGPIPE once the log text
  # outgrows the pipe buffer. The pipeline then reports failure and a real
  # match (including the CPU-fallback failures) would be missed.
  if grep -Eqi 'inference compute.*(id=gpu|library=rocm|library=hip)' <<<"$logs"; then
    ok "Ollama reports GPU inference backend"
  else
    warn "No explicit GPU inference backend reported in recent logs"
  fi

  # Any non-zero offload count wins over a "0/N" line in the same window.
  if grep -Eqi 'offloaded [1-9][0-9]*/[0-9]+ layers to GPU' <<<"$logs"; then
    ok "Model layers are being offloaded to GPU"
  elif grep -Eqi 'offloaded 0/[0-9]+ layers to GPU' <<<"$logs"; then
    fail "Ollama reports zero GPU layer offload (CPU fallback)"
    has_fail=1
  fi

  if grep -Eqi 'total_vram="0 B"|inference compute.*id=cpu|library=cpu' <<<"$logs"; then
    fail "Logs indicate CPU-only inference (VRAM unavailable to Ollama)"
    has_fail=1
  fi

  if grep -Eqi 'amdgpu|kfd|rocm|hip' <<<"$logs"; then
    info "Recent ROCm/AMD-related log lines found"
  else
    warn "No ROCm/AMD-related lines found in recent Ollama logs"
  fi
fi
# --- Result: summarise and choose the exit status ----------------------------
section "Result"
if [ "$has_fail" -ne 0 ]; then
  # At least one critical check raised has_fail: print the follow-up hints
  # (one printf argument per output line) and exit non-zero.
  printf '%s\n' \
    "[gpu-diagnose] One or more critical checks failed." \
    "[gpu-diagnose] Likely causes now: host ROCm/GPU compatibility, LXC passthrough policy, or driver stack mismatch." \
    "[gpu-diagnose] Next host checks:" \
    " - dmesg | grep -Ei 'amdgpu|kfd|drm|gfx' | tail -n 120" \
    " - lspci | grep -Ei 'vga|3d|display|amd'"
  exit 1
fi

# No critical check failed.
ok "GPU diagnostics passed"
printf '%s\n' "[gpu-diagnose] Ollama appears ready for GPU inference."