From 3f80135940e579075c5454684ecffdcb09cb36f5 Mon Sep 17 00:00:00 2001
From: Junya Morioka
Date: Thu, 4 Dec 2025 18:27:04 +0900
Subject: [PATCH] feat: Added logic to dynamically determine the number of
 build parallelizations

---
Reviewer notes (free text after the separator; not part of the commit message):
* Workflow: swaps Jimver/cuda-toolkit@master for mjun0812/setup-cuda@v1,
  drops the "Set environment variables" step, and removes the fixed
  MAX_JOBS=4 / NVCC_THREADS=4 env from the "Build wheels" step.
* build_linux.sh: derives MAX_JOBS and NVCC_THREADS from `nproc` and
  `free -g`, budgeting 4GB of RAM per (job x nvcc thread); both are set to
  max(1, floor(sqrt(min(cpu_threads, ram_gb / 4)))) and passed to setup.py.
* NOTE(review): line breaks and leading indentation were restored from a
  whitespace-collapsed copy of this patch. Hunk sizes were checked against
  the @@ headers and the diffstat, but the indentation of the YAML context
  lines assumes the usual 2-space GitHub Actions layout. Confirm against
  the target file before running `git am`.

 .github/workflows/test-self-hosted-linux.yml | 17 ++---------
 build_linux.sh                               | 30 +++++++++++++++++++-
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/test-self-hosted-linux.yml b/.github/workflows/test-self-hosted-linux.yml
index 6ad3410..1e3a9ca 100644
--- a/.github/workflows/test-self-hosted-linux.yml
+++ b/.github/workflows/test-self-hosted-linux.yml
@@ -70,13 +70,9 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - uses: Jimver/cuda-toolkit@master
+      - uses: mjun0812/setup-cuda@v1
         with:
-          cuda: ${{ matrix.cuda-version }}
-          sub-packages: '["nvcc", "toolkit"]'
-          method: "network"
-          use-github-cache: false
-          use-local-cache: false
+          version: "${{ matrix.cuda-version }}"
 
       - name: Install build dependencies
         shell: bash
@@ -84,19 +80,10 @@ jobs:
           sudo apt install -y ninja-build clang
           pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
 
-      - name: Set environment variables
-        shell: bash
-        run: |
-          export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
-          export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-
       - name: Build wheels
         timeout-minutes: 1200
         id: build_wheels
         shell: bash
-        env:
-          MAX_JOBS: 4
-          NVCC_THREADS: 4
         run: |
           chmod +x build_linux.sh
           ./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
diff --git a/build_linux.sh b/build_linux.sh
index e98f08f..85512d9 100755
--- a/build_linux.sh
+++ b/build_linux.sh
@@ -44,10 +44,38 @@ python -c "from torch.utils import cpp_extension; print(cpp_extension.CUDA_HOME)
 echo "Checking out flash-attention v$FLASH_ATTN_VERSION..."
 git clone https://github.com/Dao-AILab/flash-attention.git -b "v$FLASH_ATTN_VERSION"
 
+# Determine MAX_JOBS and NVCC_THREADS based on system resources
+NUM_THREADS=$(nproc)
+RAM_GB=$(free -g | awk '/^Mem:/{print $2}')
+echo "System resources:"
+echo " CPU threads: $NUM_THREADS"
+echo " RAM: ${RAM_GB}GB"
+
+# Calculate max product based on constraints:
+# - MAX_JOBS x NVCC_THREADS <= NUM_THREADS
+# - 4GB x MAX_JOBS x NVCC_THREADS <= RAM_GB
+MAX_PRODUCT_CPU=$NUM_THREADS
+MAX_PRODUCT_RAM=$((RAM_GB / 4))
+MAX_PRODUCT=$((MAX_PRODUCT_CPU < MAX_PRODUCT_RAM ? MAX_PRODUCT_CPU : MAX_PRODUCT_RAM))
+
+# Set MAX_JOBS = NVCC_THREADS = floor(sqrt(MAX_PRODUCT))
+# This balances parallelism across both dimensions
+MAX_JOBS=$(awk -v max="$MAX_PRODUCT" 'BEGIN {print int(sqrt(max))}')
+NVCC_THREADS=$MAX_JOBS
+
+# Ensure minimum values of 1
+MAX_JOBS=$((MAX_JOBS < 1 ? 1 : MAX_JOBS))
+NVCC_THREADS=$((NVCC_THREADS < 1 ? 1 : NVCC_THREADS))
+
+echo "Build parallelism settings:"
+echo " MAX_JOBS: $MAX_JOBS"
+echo " NVCC_THREADS: $NVCC_THREADS"
+
 # Build wheels
 echo "Building wheels..."
 cd flash-attention
 LOCAL_VERSION_LABEL="cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}"
-FLASH_ATTENTION_FORCE_BUILD=TRUE FLASH_ATTN_LOCAL_VERSION=${LOCAL_VERSION_LABEL} python setup.py bdist_wheel --dist-dir=dist
+NVCC_THREADS=$NVCC_THREADS MAX_JOBS=$MAX_JOBS FLASH_ATTENTION_FORCE_BUILD=TRUE FLASH_ATTN_LOCAL_VERSION=${LOCAL_VERSION_LABEL} \
+    python setup.py bdist_wheel --dist-dir=dist
 wheel_name=$(basename $(ls dist/*.whl | head -n 1))
 echo "Built wheel: $wheel_name"