diff --git a/comfyui-build/docker-entrypoint.sh b/comfyui-build/docker-entrypoint.sh
index a52c317..dab1112 100644
--- a/comfyui-build/docker-entrypoint.sh
+++ b/comfyui-build/docker-entrypoint.sh
@@ -102,10 +102,13 @@ fi
 
 echo "Using PyTorch index URL: ${PYTORCH_INDEX_URL}"
 if echo "${PYTORCH_INDEX_URL}" | grep -q "rocm.nightlies.amd.com"; then
-    pip install --pre torch torchvision torchaudio --extra-index-url ${PYTORCH_INDEX_URL}
+    pip install --pre torch torchvision torchaudio pytorch-triton-rocm --extra-index-url "${PYTORCH_INDEX_URL}"
 else
-    pip install --pre torch torchvision torchaudio --index-url ${PYTORCH_INDEX_URL}
+    pip install --pre torch torchvision torchaudio pytorch-triton-rocm --index-url "${PYTORCH_INDEX_URL}"
 fi
+# flash-attn's setup.py imports torch, so it must be built against the torch
+# installed above rather than inside pip's isolated build environment.
+pip install flash-attn --no-build-isolation --index-url https://pypi.org/simple
 
 echo "Installing ComfyUI requirements..."
 pip install -r requirements.txt
@@ -116,7 +119,9 @@ if [ -f "start.sh" ]; then
     ./start.sh
 else
     echo "No start.sh found, creating default startup script..."
-    echo "python main.py --listen 0.0.0.0 --port 8188 --use-split-cross-attention" > start.sh
+    # ComfyUI's --use-*-cross-attention flags are mutually exclusive; passing
+    # more than one makes argparse abort at startup, so select exactly one.
+    echo "python main.py --listen 0.0.0.0 --port 8188 --use-quad-cross-attention" > start.sh
     chmod +x start.sh
     ./start.sh
 fi
diff --git a/docker-compose.yaml b/docker-compose.yaml
index ae2000f..3efd93b 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -97,15 +97,43 @@ services:
     image: docker.io/getterup/comfyui-rocm7.1:latest
     container_name: comfyui
     environment:
-      - ROCR_VISIBLE_DEVICES=1
       - COMFYUI_ENABLE_ROCM=True
       - GPU_ARCH=gfx110X
       - PYTORCH_TUNABLEOP_ENABLED=0
-      - MIOPEN_FIND_MODE=NORMAL
-      - TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
       - AMD_SERIALIZE_KERNEL=1
       - MIOPEN_USER_DB_PATH=/tmp/.miopen
       - MIOPEN_CUSTOM_CACHE_DIR=/tmp/.miopen
+      # NOTE: values below are deliberately unquoted; in list syntax any quotes after '=' become part of the value.
+      # === ROCm device selection ===
+      - HIP_VISIBLE_DEVICES=0
+      - ROCR_VISIBLE_DEVICES=1
+      # === GPU targeting ===
+      - HCC_AMDGPU_TARGET=gfx1100 # Change for your GPU
+      - PYTORCH_ROCM_ARCH=gfx1100 # e.g., gfx1030 for RX 6800/6900
+      # === Memory allocator tuning ===
+      - PYTORCH_HIP_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:6144
+      # === Precision and performance ===
+      - TORCH_BLAS_PREFER_HIPBLASLT=0
+      - TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=CK,TRITON,ROCBLAS
+      - TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE=BEST
+      - TORCHINDUCTOR_FORCE_FALLBACK=0
+      # === Flash Attention ===
+      - FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE
+      - FLASH_ATTENTION_BACKEND=flash_attn_triton_amd
+      - FLASH_ATTENTION_TRITON_AMD_SEQ_LEN=4096
+      - USE_CK=ON
+      - TRANSFORMERS_USE_FLASH_ATTENTION=1
+      - TRITON_USE_ROCM=ON
+      - TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+      # === CPU threading ===
+      - OMP_NUM_THREADS=8
+      - MKL_NUM_THREADS=8
+      - NUMEXPR_NUM_THREADS=8
+      # === Experimental ROCm flags ===
+      - HSA_ENABLE_ASYNC_COPY=1
+      - HSA_ENABLE_SDMA=1
+      - MIOPEN_FIND_MODE=2
+      - MIOPEN_ENABLE_CACHE=1
     ports:
       - "8188:8188"
     networks: