mirror of
https://github.com/BillyOutlast/proxmox-rocm-toolkit.git
synced 2026-07-01 19:54:40 -04:00
Add initial scripts and README for Proxmox ROCm LXC Toolkit setup
- Create .gitignore to exclude log and temporary files
- Add README.md with project overview, requirements, and quick start guide
- Implement create_rocm_lxc.sh for creating an unprivileged Ubuntu 24.04 LXC
- Implement configure_gpu_passthrough.sh for GPU passthrough configuration
- Implement install_rocm_in_ct.sh for installing ROCm 7.2 in the container
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
*.log
|
||||
*.tmp
|
||||
@@ -0,0 +1,90 @@
|
||||
# Proxmox ROCm LXC Toolkit
|
||||
|
||||
Toolkit for building an **unprivileged Ubuntu 24.04 LXC** on Proxmox and installing **ROCm 7.2** with AMD's official Ubuntu package-manager method.
|
||||
|
||||
## What this includes
|
||||
|
||||
- `scripts/create_rocm_lxc.sh`
|
||||
- Creates an unprivileged Ubuntu 24.04 container using `pct`.
|
||||
- `scripts/configure_gpu_passthrough.sh`
|
||||
- Adds `/dev/kfd` + `/dev/dri` passthrough and cgroup permissions in LXC config.
|
||||
- `scripts/install_rocm_in_ct.sh`
|
||||
- Registers ROCm 7.2 `noble` apt repos and installs a chosen ROCm meta package.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Proxmox VE host with a working AMD GPU stack exposing:
|
||||
- `/dev/kfd`
|
||||
- `/dev/dri`
|
||||
- Ubuntu 24.04 LXC template available in Proxmox storage.
|
||||
- Run scripts on the Proxmox host as `root`.
|
||||
|
||||
## Quick start
|
||||
|
||||
1) Create unprivileged container:
|
||||
|
||||
```bash
|
||||
chmod +x scripts/*.sh
|
||||
|
||||
sudo ./scripts/create_rocm_lxc.sh \
|
||||
--ctid 120 \
|
||||
--hostname rocm-ct \
|
||||
--template local:vztmpl/ubuntu-24.04-standard_24.04-1_amd64.tar.zst \
|
||||
--storage local-lvm
|
||||
```
|
||||
|
||||
2) Configure GPU passthrough on host:
|
||||
|
||||
```bash
|
||||
sudo ./scripts/configure_gpu_passthrough.sh --ctid 120
|
||||
```
|
||||
|
||||
3) Install ROCm in container:
|
||||
|
||||
```bash
|
||||
sudo ./scripts/install_rocm_in_ct.sh --ctid 120 --package rocm
|
||||
```
|
||||
|
||||
4) Optional manual checks:
|
||||
|
||||
```bash
|
||||
pct exec 120 -- bash -lc '/opt/rocm/bin/rocminfo | head -n 40'
|
||||
pct exec 120 -- bash -lc '/opt/rocm/bin/rocm-smi || true'
|
||||
```
|
||||
|
||||
## ROCm package options
|
||||
|
||||
`install_rocm_in_ct.sh` defaults to `rocm`, but you can pass alternatives, for example:
|
||||
|
||||
- `rocm-hip-runtime`
|
||||
- `rocm-opencl-runtime`
|
||||
- `rocm-ml-libraries`
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
sudo ./scripts/install_rocm_in_ct.sh --ctid 120 --package rocm-hip-runtime
|
||||
```
|
||||
|
||||
## Notes for unprivileged LXC
|
||||
|
||||
- Device passthrough to unprivileged containers can be sensitive to host kernel/driver updates.
|
||||
- If your workload runs as a non-root user inside the CT, ensure that user is in `video`/`render` groups:
|
||||
|
||||
```bash
|
||||
pct exec 120 -- bash -lc 'usermod -aG video,render <your-user>'
|
||||
```
|
||||
|
||||
- A CT restart is often required after changing LXC device mappings.
|
||||
|
||||
## Alignment with AMD docs
|
||||
|
||||
ROCm install flow follows AMD’s Ubuntu package-manager guidance for ROCm 7.2 and Ubuntu 24.04 (`noble`):
|
||||
|
||||
- GPG key to `/etc/apt/keyrings/rocm.gpg`
|
||||
- `rocm/apt/7.2` + `graphics/7.2/ubuntu` apt repos
|
||||
- apt preference pin (`Pin-Priority: 600`)
|
||||
|
||||
Reference:
|
||||
|
||||
- https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/install-methods/package-manager/package-manager-ubuntu.html
|
||||
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Print the help text for configure_gpu_passthrough.sh.
# Arguments: none
# Outputs:   usage text on stdout
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Configure /etc/pve/lxc/<CTID>.conf for unprivileged ROCm GPU passthrough.

Usage:
  sudo ./scripts/configure_gpu_passthrough.sh --ctid 120

What it does:
  - Allows /dev/kfd and /dev/dri* device classes in cgroup2
  - Bind-mounts /dev/kfd and /dev/dri into the container
  - Restarts container if running (starts it otherwise)

Notes:
  - Run on Proxmox host as root.
  - Host must already have amdgpu loaded and expose /dev/kfd and /dev/dri.
EOF
}
|
||||
|
||||
#######################################
# Abort unless the script is running with root privileges.
# Globals:   EUID (read)
# Outputs:   error message on stderr when not root
# Returns:   0 when root; otherwise exits the script with status 1
#######################################
require_root() {
  if (( EUID != 0 )); then
    printf '%s\n' "This script must be run as root." >&2
    exit 1
  fi
}
|
||||
|
||||
# --- Argument parsing --------------------------------------------------------
CTID=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --ctid)
      # ${2?...} reports a readable error when the value is missing instead
      # of the bare "unbound variable" abort that `set -u` would produce.
      CTID="${2?--ctid requires a value}"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ -z "${CTID}" ]]; then
  echo "--ctid is required." >&2
  usage
  exit 1
fi

# CTID is interpolated into a path under /etc/pve below, so accept digits only.
if [[ ! "${CTID}" =~ ^[0-9]+$ ]]; then
  echo "--ctid must be a numeric container ID: ${CTID}" >&2
  exit 1
fi

require_root

# --- Host preflight ----------------------------------------------------------
# Both device paths must exist on the host before they can be passed through.
if [[ ! -e /dev/kfd ]]; then
  echo "Host device /dev/kfd not found. Ensure AMD driver is loaded on host." >&2
  exit 1
fi

if [[ ! -d /dev/dri ]]; then
  echo "Host directory /dev/dri not found. Ensure AMD DRM devices exist on host." >&2
  exit 1
fi

# Per-container LXC config file that the rest of the script edits.
CONF_FILE="/etc/pve/lxc/${CTID}.conf"
if [[ ! -f "${CONF_FILE}" ]]; then
  echo "Container config not found: ${CONF_FILE}" >&2
  exit 1
fi
|
||||
|
||||
#######################################
# Append a line to the container config unless an identical line exists.
# Globals:   CONF_FILE (read; the file it names is appended to)
# Arguments: $1 - exact config line that must be present
# Returns:   0 if the line was already present or was appended;
#            non-zero if the append fails
#######################################
append_if_missing() {
  local line="$1"

  # -F/-x: literal, whole-line comparison. `--` keeps a line that starts
  # with '-' from being parsed as a grep option.
  if grep -Fqx -- "${line}" "${CONF_FILE}"; then
    return 0
  fi

  # If the file does not end with a newline, add one first so the new
  # directive is not glued onto the existing last line. Command substitution
  # strips a trailing newline, so a non-empty result means the last byte is
  # something else.
  if [[ -s "${CONF_FILE}" && -n "$(tail -c 1 -- "${CONF_FILE}")" ]]; then
    printf '\n' >> "${CONF_FILE}"
  fi

  # NOTE(review): the line is always added at end of file. If the config has
  # trailing sections (e.g. snapshot sections), it would land inside the last
  # one - confirm whether that case needs handling.
  printf '%s\n' "${line}" >> "${CONF_FILE}"
}
|
||||
|
||||
# --- Write passthrough directives --------------------------------------------
# DRM nodes under /dev/dri use character major 226.
append_if_missing "lxc.cgroup2.devices.allow: c 226:* rwm"

# The major of /dev/kfd is assigned dynamically by the kernel, so read it from
# the device node instead of hard-coding it. `stat -c %t` prints it in hex;
# convert to decimal for the cgroup rule.
kfd_major_hex="$(stat -c '%t' /dev/kfd)"
kfd_major="$((16#${kfd_major_hex}))"
append_if_missing "lxc.cgroup2.devices.allow: c ${kfd_major}:* rwm"

append_if_missing "lxc.mount.entry: /dev/kfd dev/kfd none bind,optional,create=file"
append_if_missing "lxc.mount.entry: /dev/dri dev/dri none bind,optional,create=dir"

# --- Apply -------------------------------------------------------------------
# Capture the status first and match on the string; this avoids a
# `pct | grep -q` pipeline being reported as failed under `pipefail`.
ct_status="$(pct status "${CTID}")"
if [[ "${ct_status}" == *"status: running"* ]]; then
  echo "Restarting CT ${CTID} to apply new LXC config..."
  # pct has no `restart` subcommand; `reboot` shuts the CT down and starts it.
  pct reboot "${CTID}"
else
  echo "Starting CT ${CTID}..."
  pct start "${CTID}"
fi

echo "Done. GPU passthrough directives are present in ${CONF_FILE}."
|
||||
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Print the help text for create_rocm_lxc.sh.
# Arguments: none
# Outputs:   usage text on stdout
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Create an unprivileged Ubuntu 24.04 LXC on Proxmox.

Usage:
  sudo ./scripts/create_rocm_lxc.sh \
    --ctid 120 \
    --hostname rocm-ct \
    --template local:vztmpl/ubuntu-24.04-standard_24.04-1_amd64.tar.zst \
    --storage local-lvm

Optional:
  --rootfs-size 32            (GiB, default: 32)
  --cores 8                   (default: 8)
  --memory 16384              (MiB, default: 16384)
  --swap 2048                 (MiB, default: 2048)
  --bridge vmbr0              (default: vmbr0)
  --ip dhcp                   (default: dhcp)
  --gateway 192.168.1.1       (optional)
  --dns 1.1.1.1               (optional)
  --password 'StrongPass123'  (optional)
  --onboot 1                  (default: 1)

Notes:
  - Run this on a Proxmox host as root.
  - This only creates the container. GPU passthrough and ROCm install are separate steps.
EOF
}
|
||||
|
||||
#######################################
# Abort unless the script is running with root privileges.
# Globals:   EUID (read)
# Outputs:   error message on stderr when not root
# Returns:   0 when root; otherwise exits the script with status 1
#######################################
require_root() {
  if (( EUID != 0 )); then
    printf '%s\n' "This script must be run as root." >&2
    exit 1
  fi
}
|
||||
|
||||
# --- Defaults ----------------------------------------------------------------
CTID=""
# Named CT_HOSTNAME rather than HOSTNAME so the script does not overwrite the
# HOSTNAME variable that Bash itself maintains.
CT_HOSTNAME=""
TEMPLATE=""
STORAGE=""
ROOTFS_SIZE=32   # GiB
CORES=8
MEMORY=16384     # MiB
SWAP=2048        # MiB
BRIDGE="vmbr0"
IP="dhcp"
GATEWAY=""
DNS=""
PASSWORD=""
ONBOOT=1

# --- Argument parsing --------------------------------------------------------
# ${2?...} reports a readable error when a flag is given without a value,
# instead of the bare "unbound variable" abort that `set -u` would produce.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --ctid)        CTID="${2?--ctid requires a value}"; shift 2 ;;
    --hostname)    CT_HOSTNAME="${2?--hostname requires a value}"; shift 2 ;;
    --template)    TEMPLATE="${2?--template requires a value}"; shift 2 ;;
    --storage)     STORAGE="${2?--storage requires a value}"; shift 2 ;;
    --rootfs-size) ROOTFS_SIZE="${2?--rootfs-size requires a value}"; shift 2 ;;
    --cores)       CORES="${2?--cores requires a value}"; shift 2 ;;
    --memory)      MEMORY="${2?--memory requires a value}"; shift 2 ;;
    --swap)        SWAP="${2?--swap requires a value}"; shift 2 ;;
    --bridge)      BRIDGE="${2?--bridge requires a value}"; shift 2 ;;
    --ip)          IP="${2?--ip requires a value}"; shift 2 ;;
    --gateway)     GATEWAY="${2?--gateway requires a value}"; shift 2 ;;
    --dns)         DNS="${2?--dns requires a value}"; shift 2 ;;
    --password)    PASSWORD="${2?--password requires a value}"; shift 2 ;;
    --onboot)      ONBOOT="${2?--onboot requires a value}"; shift 2 ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ -z "${CTID}" || -z "${CT_HOSTNAME}" || -z "${TEMPLATE}" || -z "${STORAGE}" ]]; then
  echo "Missing required arguments." >&2
  usage
  exit 1
fi

# Reject a non-numeric CTID here rather than letting pct fail later.
if [[ ! "${CTID}" =~ ^[0-9]+$ ]]; then
  echo "--ctid must be a numeric container ID: ${CTID}" >&2
  exit 1
fi

require_root

# `pct status` succeeding is used as the "this CTID is already taken" signal.
if pct status "${CTID}" >/dev/null 2>&1; then
  echo "Container ${CTID} already exists. Aborting." >&2
  exit 1
fi

# --- Build the `pct create` command line -------------------------------------
NET0="name=eth0,bridge=${BRIDGE},ip=${IP}"
if [[ -n "${GATEWAY}" ]]; then
  NET0+=",gw=${GATEWAY}"
fi

# An array keeps every value as a single argument regardless of its content.
CREATE_ARGS=(
  "${CTID}"
  "${TEMPLATE}"
  --hostname "${CT_HOSTNAME}"
  --ostype ubuntu
  --rootfs "${STORAGE}:${ROOTFS_SIZE}"
  --cores "${CORES}"
  --memory "${MEMORY}"
  --swap "${SWAP}"
  --unprivileged 1
  --features nesting=1,keyctl=1
  --onboot "${ONBOOT}"
  --net0 "${NET0}"
)

if [[ -n "${DNS}" ]]; then
  CREATE_ARGS+=(--nameserver "${DNS}")
fi

if [[ -n "${PASSWORD}" ]]; then
  # NOTE(review): the password travels on the pct command line, so it is
  # visible in the host process list while pct runs.
  CREATE_ARGS+=(--password "${PASSWORD}")
fi

# --- Create and start --------------------------------------------------------
echo "Creating LXC ${CTID} (${CT_HOSTNAME})..."
pct create "${CREATE_ARGS[@]}"

echo "Starting container ${CTID}..."
pct start "${CTID}"

echo "Done. Next steps:"
echo "1) Configure GPU passthrough: ./scripts/configure_gpu_passthrough.sh --ctid ${CTID}"
echo "2) Install ROCm in CT: ./scripts/install_rocm_in_ct.sh --ctid ${CTID}"
|
||||
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Print the help text for install_rocm_in_ct.sh.
# Arguments: none
# Outputs:   usage text on stdout
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'
Install ROCm 7.2 inside Ubuntu 24.04 LXC.

Usage:
  sudo ./scripts/install_rocm_in_ct.sh --ctid 120 [--package rocm]

Options:
  --ctid <id>          Proxmox CTID (required)
  --package <name>     ROCm meta package (default: rocm)
                       Examples: rocm, rocm-hip-runtime, rocm-opencl-runtime

Notes:
  - Follows AMD ROCm Ubuntu package-manager method for Ubuntu 24.04 (noble).
  - Run from Proxmox host as root.
EOF
}
|
||||
|
||||
#######################################
# Abort unless the script is running with root privileges.
# Globals:   EUID (read)
# Outputs:   error message on stderr when not root
# Returns:   0 when root; otherwise exits the script with status 1
#######################################
require_root() {
  if (( EUID != 0 )); then
    printf '%s\n' "This script must be run as root." >&2
    exit 1
  fi
}
|
||||
|
||||
# --- Defaults and argument parsing -------------------------------------------
CTID=""
ROCM_PACKAGE="rocm"

# ${2?...} reports a readable error when a flag is given without a value,
# instead of the bare "unbound variable" abort that `set -u` would produce.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --ctid)    CTID="${2?--ctid requires a value}"; shift 2 ;;
    --package) ROCM_PACKAGE="${2?--package requires a value}"; shift 2 ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ -z "${CTID}" ]]; then
  echo "--ctid is required." >&2
  usage
  exit 1
fi

if [[ ! "${CTID}" =~ ^[0-9]+$ ]]; then
  echo "--ctid must be a numeric container ID: ${CTID}" >&2
  exit 1
fi

# Split on whitespace so `--package "pkg-a pkg-b"` still installs several
# packages. The names are passed to the container as separate arguments, never
# as shell text.
read -r -a ROCM_PACKAGES <<<"${ROCM_PACKAGE}"
if (( ${#ROCM_PACKAGES[@]} == 0 )); then
  echo "--package must name at least one package." >&2
  exit 1
fi

require_root

if ! pct status "${CTID}" >/dev/null 2>&1; then
  echo "Container ${CTID} does not exist." >&2
  exit 1
fi

# Capture the status first and match on the string; this avoids a
# `pct | grep -q` pipeline being reported as failed under `pipefail`.
ct_status="$(pct status "${CTID}")"
if [[ "${ct_status}" != *"status: running"* ]]; then
  echo "Starting CT ${CTID}..."
  pct start "${CTID}"
fi

# --- Install -----------------------------------------------------------------
# The in-container script is kept in a quoted heredoc so nothing in it is
# expanded on the host. Package names reach it as positional parameters ("$@").
INSTALL_SCRIPT="$(cat <<'REMOTE'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive

apt-get update
apt-get install -y ca-certificates gnupg wget

mkdir -p /etc/apt/keyrings
wget -qO- https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg

cat >/etc/apt/sources.list.d/rocm.list <<'EOF'
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.2 noble main
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/graphics/7.2/ubuntu noble main
EOF

cat >/etc/apt/preferences.d/rocm-pin-600 <<'EOF'
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF

# Refresh the package index again: the repositories written above are not
# known to apt until it re-reads the sources, so the install would otherwise
# not find the ROCm packages.
apt-get update
apt-get install -y "$@"
REMOTE
)"

echo "Installing ROCm package '${ROCM_PACKAGE}' in CT ${CTID}..."
# "install_rocm" fills $0 of the inner shell; the package names follow as $1...
pct exec "${CTID}" -- bash -lc "${INSTALL_SCRIPT}" install_rocm "${ROCM_PACKAGES[@]}"

# --- Post-install smoke check ------------------------------------------------
# Best-effort: tool failures are reported but do not fail this script.
CHECK_SCRIPT="$(cat <<'REMOTE'
set +e
/opt/rocm/bin/rocminfo >/tmp/rocminfo.out 2>&1
ROCINFO_RC=$?
if [[ -x /opt/rocm/bin/rocm-smi ]]; then
  /opt/rocm/bin/rocm-smi >/tmp/rocm-smi.out 2>&1
fi
set -e

echo "rocminfo exit code: ${ROCINFO_RC}"
if [[ ${ROCINFO_RC} -ne 0 ]]; then
  echo "rocminfo did not succeed. Check /tmp/rocminfo.out in container."
fi
REMOTE
)"

echo "ROCm install finished. Running quick checks in CT ${CTID}..."
pct exec "${CTID}" -- bash -lc "${CHECK_SCRIPT}"

echo "Done."
|
||||
Reference in New Issue
Block a user