fix: enhance device node checks for ollama-rocm.service and add GPU diagnostics script

This commit is contained in:
John Doe
2026-02-27 23:36:53 -05:00
parent 1078dbc876
commit ccbe256bbe
5 changed files with 106 additions and 5 deletions
+11 -2
View File
@@ -170,13 +170,22 @@ sudo bash ./install-rootful.sh
```
If logs show device mapping errors for `/dev/dri`, verify your runtime and GPU device nodes.
The installers skip starting `ollama-rocm.service` when `/dev/kfd` or `/dev/dri` is missing.
The installers skip starting `ollama-rocm.service` when `/dev/kfd` or `/dev/dri` is missing,
or when `/dev/dri` exists but has no `renderD*` / `card*` nodes.
Verify GPU device nodes:
```bash
ls -l /dev/kfd
ls -l /dev/dri
ls -l /dev/dri/renderD* /dev/dri/card*
```
Run the bundled diagnostics helper:
```bash
chmod +x ./diag-gpu.sh
./diag-gpu.sh
```
To force-refresh the Ollama image manually:
@@ -192,4 +201,4 @@ sudo systemctl restart ollama-rocm.service
- `podman-mcp-server` is launched via `npx` inside a Node container because the upstream project is distributed as a binary/npm package.
- The Ollama unit mirrors your ROCm `docker run` flags.
- If this host is not Linux with ROCm devices (`/dev/kfd`, `/dev/dri`), `ollama` will fail to start.
- If this host is not Linux with ROCm devices (`/dev/kfd`, `/dev/dri/renderD*`), `ollama` will fail to start.
+82
View File
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
# diag-gpu.sh preamble: strict mode plus tagged status-line helpers.
set -o errexit -o nounset -o pipefail

# Print one status line of the form "[TAG] message" on stdout.
#   $1 - tag text placed between the brackets
#   $2 - message text (already joined into a single string by the caller)
_status_line() {
  printf '[%s] %s\n' "$1" "$2"
}

# Each helper joins all of its arguments with spaces and prints them
# behind its own tag.
ok() {
  _status_line 'OK' "$*"
}

warn() {
  _status_line 'WARN' "$*"
}

fail() {
  _status_line 'FAIL' "$*"
}

info() {
  _status_line 'INFO' "$*"
}
# Set to 1 by any hard-requirement check that fails; it selects the exit
# status at the end of the script. Checks that only call warn() never
# change it.
FAILED=0

echo "ROCm + Podman GPU diagnostics"
echo

# --- Required device nodes ---------------------------------------------------
if [[ -e /dev/kfd ]]; then
  ok "/dev/kfd is present"
else
  fail "/dev/kfd is missing"
  FAILED=1
fi

if [[ -d /dev/dri ]]; then
  ok "/dev/dri directory is present"
else
  fail "/dev/dri directory is missing"
  FAILED=1
fi

# Collect render/card nodes with nullglob enabled so that a pattern without
# matches expands to nothing instead of staying in the array as literal text.
shopt -s nullglob
DRI_NODES=(/dev/dri/renderD* /dev/dri/card*)
shopt -u nullglob

if (( ${#DRI_NODES[@]} > 0 )); then
  ok "Detected DRM device nodes under /dev/dri"
  # List exactly the nodes found above. Re-globbing here would hand any
  # unmatched pattern to ls as a literal path. The listing is informational
  # only, so a failure (e.g. a node vanishing in between) must not abort.
  ls -l -- "${DRI_NODES[@]}" || true
else
  fail "No /dev/dri/renderD* or /dev/dri/card* device nodes found"
  FAILED=1
fi

echo
info "Raw device listing"
# Best-effort listings: the paths may legitimately be absent (already reported
# above), so errors are silenced and never fatal.
ls -l /dev/kfd 2>/dev/null || true
ls -la /dev/dri 2>/dev/null || true
echo

# --- Kernel driver (advisory) ------------------------------------------------
# Check sysfs instead of piping lsmod into `grep -q`: under `set -o pipefail`
# that pipeline can report failure when grep exits early and lsmod receives
# SIGPIPE, it requires the lsmod binary, and '^amdgpu' would also match any
# other module whose name merely starts with "amdgpu". /sys/module/<name> is
# present for loaded modules and, when the driver exposes parameters or a
# version, for drivers built into the kernel as well.
if [[ -d /sys/module/amdgpu ]]; then
  ok "Kernel module amdgpu is loaded"
else
  warn "Kernel module amdgpu is not loaded"
fi

# --- Container runtime -------------------------------------------------------
# A missing podman is a hard failure; a failing `podman info` is only a warning.
if command -v podman >/dev/null 2>&1; then
  ok "podman is installed"
  if podman info >/dev/null 2>&1; then
    ok "podman info succeeded"
  else
    warn "podman info failed (check podman setup/permissions)"
  fi
else
  fail "podman is not installed"
  FAILED=1
fi

# --- ROCm userspace tools (advisory) -----------------------------------------
if command -v rocminfo >/dev/null 2>&1; then
  ok "rocminfo is installed"
  if rocminfo >/dev/null 2>&1; then
    ok "rocminfo executed successfully"
  else
    warn "rocminfo exists but failed to execute"
  fi
else
  warn "rocminfo not found (install ROCm userspace tools for deeper checks)"
fi

echo

# --- Summary -----------------------------------------------------------------
# Exit 0 only when no hard requirement failed; otherwise exit 1.
if (( FAILED == 0 )); then
  ok "Diagnostic passed: host exposes required GPU device nodes for ollama-rocm"
  exit 0
fi

fail "Diagnostic failed: fix missing device nodes/driver setup, then rerun"
exit 1
+4 -1
View File
@@ -50,6 +50,9 @@ fi
# Gate for ollama-rocm.service: /dev/dri must exist and contain at least one
# renderD* or card* entry; otherwise report why and mark the unit not ready.
if [[ -d /dev/dri ]]; then
  # compgen -G succeeds only when the glob matches at least one path.
  if ! compgen -G "/dev/dri/renderD*" >/dev/null \
    && ! compgen -G "/dev/dri/card*" >/dev/null; then
    echo "Skipping ollama-rocm.service: /dev/dri has no render/card nodes on this host."
    OLLAMA_READY=false
  fi
else
  echo "Skipping ollama-rocm.service: /dev/dri is missing on this host."
  OLLAMA_READY=false
fi
if [[ "${OLLAMA_READY}" == "true" ]]; then
@@ -68,7 +71,7 @@ echo " - podman.socket"
if [[ "${OLLAMA_READY}" == "true" ]]; then
echo " - ollama-rocm.service"
else
echo " - ollama-rocm.service (skipped: missing /dev/kfd or /dev/dri)"
echo " - ollama-rocm.service (skipped: missing /dev/kfd, /dev/dri, or /dev/dri nodes)"
fi
echo " - open-webui.service"
echo " - podman-mcp-server.service"
+4 -1
View File
@@ -102,6 +102,9 @@ fi
# Gate for ollama-rocm.service: /dev/dri must exist and contain at least one
# renderD* or card* entry; otherwise report why and mark the unit not ready.
if [[ -d /dev/dri ]]; then
  # compgen -G succeeds only when the glob matches at least one path.
  if ! compgen -G "/dev/dri/renderD*" >/dev/null \
    && ! compgen -G "/dev/dri/card*" >/dev/null; then
    echo "Skipping ollama-rocm.service: /dev/dri has no render/card nodes on this host."
    OLLAMA_READY=false
  fi
else
  echo "Skipping ollama-rocm.service: /dev/dri is missing on this host."
  OLLAMA_READY=false
fi
if [[ "${OLLAMA_READY}" == "true" ]]; then
@@ -120,7 +123,7 @@ echo " - podman.socket"
if [[ "${OLLAMA_READY}" == "true" ]]; then
echo " - ollama-rocm.service"
else
echo " - ollama-rocm.service (skipped: missing /dev/kfd or /dev/dri)"
echo " - ollama-rocm.service (skipped: missing /dev/kfd, /dev/dri, or /dev/dri nodes)"
fi
echo " - open-webui.service"
echo " - podman-mcp-server.service"
+5 -1
View File
@@ -75,7 +75,11 @@ else
fi
# Report on /dev/dri: first whether the directory exists, then whether it
# holds any renderD* or card* entry.
if [[ ! -d /dev/dri ]]; then
  warn "/dev/dri missing (ROCm container will not start)"
else
  ok "/dev/dri present"
  # compgen -G succeeds only when the glob matches at least one path.
  if ! compgen -G "/dev/dri/renderD*" >/dev/null \
    && ! compgen -G "/dev/dri/card*" >/dev/null; then
    warn "/dev/dri exists but has no render/card nodes (ROCm container will not start)"
  else
    ok "/dev/dri has render/card device nodes"
  fi
fi