diff --git a/comfyui-build/docker-entrypoint.sh b/comfyui-build/docker-entrypoint.sh
index a52c317..dab1112 100644
--- a/comfyui-build/docker-entrypoint.sh
+++ b/comfyui-build/docker-entrypoint.sh
@@ -102,10 +102,11 @@ fi
 
 echo "Using PyTorch index URL: ${PYTORCH_INDEX_URL}"
 if echo "${PYTORCH_INDEX_URL}" | grep -q "rocm.nightlies.amd.com"; then
-    pip install --pre torch torchvision torchaudio --extra-index-url ${PYTORCH_INDEX_URL}
+    pip install --pre torch torchvision torchaudio pytorch-triton-rocm --extra-index-url "${PYTORCH_INDEX_URL}"
 else
-    pip install --pre torch torchvision torchaudio --index-url ${PYTORCH_INDEX_URL}
+    pip install --pre torch torchvision torchaudio pytorch-triton-rocm --index-url "${PYTORCH_INDEX_URL}"
 fi
+FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE pip install flash-attn --no-build-isolation --index-url https://pypi.org/simple  # setup.py imports torch: build against the torch installed above
 echo "Installing ComfyUI requirements..."
 pip install -r requirements.txt
 
@@ -116,7 +117,7 @@ if [ -f "start.sh" ]; then
     ./start.sh
 else
     echo "No start.sh found, creating default startup script..."
-    echo "python main.py --listen 0.0.0.0 --port 8188 --use-split-cross-attention" > start.sh
+    echo "python main.py --listen 0.0.0.0 --port 8188 --use-quad-cross-attention" > start.sh  # ComfyUI attention flags are mutually exclusive: pass exactly one
     chmod +x start.sh
     ./start.sh
 fi
diff --git a/docker-compose.yaml b/docker-compose.yaml
index ae2000f..3efd93b 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -97,15 +97,42 @@ services:
     image: docker.io/getterup/comfyui-rocm7.1:latest
     container_name: comfyui
     environment:
-      - ROCR_VISIBLE_DEVICES=1
       - COMFYUI_ENABLE_ROCM=True
       - GPU_ARCH=gfx110X
       - PYTORCH_TUNABLEOP_ENABLED=0
-      - MIOPEN_FIND_MODE=NORMAL
-      - TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
       - AMD_SERIALIZE_KERNEL=1
       - MIOPEN_USER_DB_PATH=/tmp/.miopen
       - MIOPEN_CUSTOM_CACHE_DIR=/tmp/.miopen
+      # === Device visibility (list-form values are literal: never wrap a value in quotes) ===
+      - HIP_VISIBLE_DEVICES=0
+      - ROCR_VISIBLE_DEVICES=1
+      # === GPU targeting ===
+      - HCC_AMDGPU_TARGET=gfx1100   # Change for your GPU
+      - PYTORCH_ROCM_ARCH=gfx1100   # e.g., gfx1030 for RX 6800/6900
+      # === Memory allocator tuning ===
+      - "PYTORCH_HIP_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:6144"
+      # === Precision and performance ===
+      - TORCH_BLAS_PREFER_HIPBLASLT=0
+      - TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=CK,TRITON,ROCBLAS
+      - TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE=BEST
+      - TORCHINDUCTOR_FORCE_FALLBACK=0
+      # === Flash Attention ===
+      - FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE
+      - FLASH_ATTENTION_BACKEND=flash_attn_triton_amd
+      - FLASH_ATTENTION_TRITON_AMD_SEQ_LEN=4096
+      - USE_CK=ON
+      - TRANSFORMERS_USE_FLASH_ATTENTION=1
+      - TRITON_USE_ROCM=ON
+      - TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1
+      # === CPU threading ===
+      - OMP_NUM_THREADS=8
+      - MKL_NUM_THREADS=8
+      - NUMEXPR_NUM_THREADS=8
+      # === Experimental ROCm flags ===
+      - HSA_ENABLE_ASYNC_COPY=1
+      - HSA_ENABLE_SDMA=1
+      - MIOPEN_FIND_MODE=2
+      - MIOPEN_ENABLE_CACHE=1
     ports:
       - "8188:8188"
     networks: