mirror of
https://github.com/BillyOutlast/openwebui-ollama-rocm-podman-mcp.git
synced 2026-07-01 09:46:36 -04:00
fix: enhance device node checks for ollama-rocm.service and add GPU diagnostics script
This commit is contained in:
@@ -170,13 +170,22 @@ sudo bash ./install-rootful.sh
|
||||
```
|
||||
|
||||
If logs show device mapping errors for `/dev/dri`, verify your runtime and GPU device nodes.
|
||||
The installers skip starting `ollama-rocm.service` when `/dev/kfd` or `/dev/dri` is missing.
|
||||
The installers skip starting `ollama-rocm.service` when `/dev/kfd` or `/dev/dri` is missing,
|
||||
or when `/dev/dri` exists but has no `renderD*` / `card*` nodes.
|
||||
|
||||
Verify GPU device nodes:
|
||||
|
||||
```bash
|
||||
ls -l /dev/kfd
|
||||
ls -l /dev/dri
|
||||
ls -l /dev/dri/renderD* /dev/dri/card*
|
||||
```
|
||||
|
||||
Run the bundled diagnostics helper:
|
||||
|
||||
```bash
|
||||
chmod +x ./diag-gpu.sh
|
||||
./diag-gpu.sh
|
||||
```
|
||||
|
||||
To force-refresh the Ollama image manually:
|
||||
@@ -192,4 +201,4 @@ sudo systemctl restart ollama-rocm.service
|
||||
|
||||
- `podman-mcp-server` is launched via `npx` inside a Node container because the upstream project is distributed as a binary/npm package.
|
||||
- The Ollama unit mirrors your ROCm `docker run` flags.
|
||||
- If this host is not Linux with ROCm devices (`/dev/kfd`, `/dev/dri`), `ollama` will fail to start.
|
||||
- If this host is not Linux with ROCm devices (`/dev/kfd`, `/dev/dri/renderD*`), `ollama` will fail to start.
|
||||
|
||||
+82
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env bash
#
# diag-gpu.sh - host-side GPU diagnostics for ollama-rocm.service.
#
# Prints one tagged line ([OK] / [WARN] / [FAIL] / [INFO]) per check:
#   * /dev/kfd exists
#   * /dev/dri is a directory
#   * /dev/dri contains at least one renderD* or card* node
#   * the amdgpu kernel module is listed by lsmod        (warning only)
#   * podman is installed; `podman info` works           (latter is warning only)
#   * rocminfo is installed and runs                     (warning only)
#
# Exit status: 0 when no [FAIL] check fired, 1 otherwise. Warnings never
# change the exit status.
set -euo pipefail

# Report helpers: prefix the message with a severity tag and print to stdout
# so the whole report stays in order when the output is captured or piped.
ok() { echo "[OK] $*"; }
warn() { echo "[WARN] $*"; }
fail() { echo "[FAIL] $*"; }
info() { echo "[INFO] $*"; }

# Returns 0 when lsmod lists a module named exactly "amdgpu", non-zero when it
# does not or when lsmod cannot be run.
#
# lsmod's output is captured before matching instead of being piped into
# `grep -q`: with `pipefail` active, grep exiting on the first match can kill
# lsmod with SIGPIPE, which makes the pipeline fail and the module look
# unloaded. The trailing [[:space:]] keeps modules that merely start with
# "amdgpu" from matching.
amdgpu_module_loaded() {
  local modules
  modules=$(lsmod 2>/dev/null) || return 1
  grep -q '^amdgpu[[:space:]]' <<<"${modules}"
}

# Set to 1 by any hard failure; decides the final verdict and exit status.
FAILED=0

echo "ROCm + Podman GPU diagnostics"
echo

if [[ -e /dev/kfd ]]; then
  ok "/dev/kfd is present"
else
  fail "/dev/kfd is missing"
  FAILED=1
fi

if [[ -d /dev/dri ]]; then
  ok "/dev/dri directory is present"
else
  fail "/dev/dri directory is missing"
  FAILED=1
fi

# nullglob makes unmatched patterns expand to nothing, so the array length is
# the number of device nodes that actually exist.
shopt -s nullglob
DRI_NODES=(/dev/dri/renderD* /dev/dri/card*)
shopt -u nullglob

if (( ${#DRI_NODES[@]} > 0 )); then
  ok "Detected DRM device nodes under /dev/dri"
  # List exactly the nodes found above rather than re-expanding the globs
  # (with nullglob off again, an unmatched pattern would reach ls literally).
  # Best effort: a listing error must not abort the remaining checks.
  ls -l -- "${DRI_NODES[@]}" || true
else
  fail "No /dev/dri/renderD* or /dev/dri/card* device nodes found"
  FAILED=1
fi

echo
info "Raw device listing"
# Purely informational; the paths may legitimately be absent (already reported
# above), so errors are silenced and ignored on purpose.
ls -l /dev/kfd 2>/dev/null || true
ls -la /dev/dri 2>/dev/null || true

echo
if amdgpu_module_loaded; then
  ok "Kernel module amdgpu is loaded"
else
  warn "Kernel module amdgpu is not loaded"
fi

if command -v podman >/dev/null 2>&1; then
  ok "podman is installed"
  if podman info >/dev/null 2>&1; then
    ok "podman info succeeded"
  else
    warn "podman info failed (check podman setup/permissions)"
  fi
else
  fail "podman is not installed"
  FAILED=1
fi

if command -v rocminfo >/dev/null 2>&1; then
  ok "rocminfo is installed"
  if rocminfo >/dev/null 2>&1; then
    ok "rocminfo executed successfully"
  else
    warn "rocminfo exists but failed to execute"
  fi
else
  warn "rocminfo not found (install ROCm userspace tools for deeper checks)"
fi

echo
if [[ "${FAILED}" -eq 0 ]]; then
  ok "Diagnostic passed: host exposes required GPU device nodes for ollama-rocm"
  exit 0
fi

fail "Diagnostic failed: fix missing device nodes/driver setup, then rerun"
exit 1
|
||||
+4
-1
@@ -50,6 +50,9 @@ fi
|
||||
if [[ ! -d /dev/dri ]]; then
|
||||
echo "Skipping ollama-rocm.service: /dev/dri is missing on this host."
|
||||
OLLAMA_READY=false
|
||||
elif ! compgen -G "/dev/dri/renderD*" >/dev/null && ! compgen -G "/dev/dri/card*" >/dev/null; then
|
||||
echo "Skipping ollama-rocm.service: /dev/dri has no render/card nodes on this host."
|
||||
OLLAMA_READY=false
|
||||
fi
|
||||
|
||||
if [[ "${OLLAMA_READY}" == "true" ]]; then
|
||||
@@ -68,7 +71,7 @@ echo " - podman.socket"
|
||||
if [[ "${OLLAMA_READY}" == "true" ]]; then
|
||||
echo " - ollama-rocm.service"
|
||||
else
|
||||
echo " - ollama-rocm.service (skipped: missing /dev/kfd or /dev/dri)"
|
||||
echo " - ollama-rocm.service (skipped: missing /dev/kfd, /dev/dri, or /dev/dri nodes)"
|
||||
fi
|
||||
echo " - open-webui.service"
|
||||
echo " - podman-mcp-server.service"
|
||||
|
||||
+4
-1
@@ -102,6 +102,9 @@ fi
|
||||
if [[ ! -d /dev/dri ]]; then
|
||||
echo "Skipping ollama-rocm.service: /dev/dri is missing on this host."
|
||||
OLLAMA_READY=false
|
||||
elif ! compgen -G "/dev/dri/renderD*" >/dev/null && ! compgen -G "/dev/dri/card*" >/dev/null; then
|
||||
echo "Skipping ollama-rocm.service: /dev/dri has no render/card nodes on this host."
|
||||
OLLAMA_READY=false
|
||||
fi
|
||||
|
||||
if [[ "${OLLAMA_READY}" == "true" ]]; then
|
||||
@@ -120,7 +123,7 @@ echo " - podman.socket"
|
||||
if [[ "${OLLAMA_READY}" == "true" ]]; then
|
||||
echo " - ollama-rocm.service"
|
||||
else
|
||||
echo " - ollama-rocm.service (skipped: missing /dev/kfd or /dev/dri)"
|
||||
echo " - ollama-rocm.service (skipped: missing /dev/kfd, /dev/dri, or /dev/dri nodes)"
|
||||
fi
|
||||
echo " - open-webui.service"
|
||||
echo " - podman-mcp-server.service"
|
||||
|
||||
+5
-1
@@ -75,7 +75,11 @@ else
|
||||
fi
|
||||
|
||||
if [[ -d /dev/dri ]]; then
|
||||
ok "/dev/dri present"
|
||||
if compgen -G "/dev/dri/renderD*" >/dev/null || compgen -G "/dev/dri/card*" >/dev/null; then
|
||||
ok "/dev/dri has render/card device nodes"
|
||||
else
|
||||
warn "/dev/dri exists but has no render/card nodes (ROCm container will not start)"
|
||||
fi
|
||||
else
|
||||
warn "/dev/dri missing (ROCm container will not start)"
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user