From b0325a3ec3aae98b6bf6661e75ab7940654e8196 Mon Sep 17 00:00:00 2001
From: Junya Morioka <mjun@mjunya.com>
Date: Sun, 8 Jun 2025 06:11:34 +0900
Subject: [PATCH] [WIP] update linux workflow

---
 .github/workflows/build.yml         | 524 ++++++++++++----------------
 .github/workflows/test_workflow.yml | 246 +++++++++++++
 build_linux.sh                      |  70 ++++
 3 files changed, 531 insertions(+), 309 deletions(-)
 create mode 100644 .github/workflows/test_workflow.yml
 create mode 100755 build_linux.sh

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 5d4ba8b..619068f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -32,339 +32,245 @@ jobs:
   # #########################################################
   # Build wheels with GitHub hosted runner
   # #########################################################
-  # build_wheels:
-  #   name: Build wheels and Upload
-  #   needs: create_releases
-  #   runs-on: ubuntu-22.04
-  #   env:
-  #     DEBIAN_FRONTEND: noninteractive
-  #     TERM: xterm-256color
-  #   timeout-minutes: 1000
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       # flash-attn-version: ["2.4.3", "2.5.9", "2.6.3"]
-  #       # python-version: ["3.10", "3.11", "3.12"]
-  #       # torch-version: ["2.8.0.dev20250523"]
-  #       # # https://developer.nvidia.com/cuda-toolkit-archive
-  #       # cuda-version: ["12.8.1"]
-  #       flash-attn-version: []
-  #       python-version: []
-  #       torch-version: []
-  #       # https://developer.nvidia.com/cuda-toolkit-archive
-  #       cuda-version: []
-  #       exclude:
-  #         # torch < 2.2 does not support Python 3.12
-  #         - python-version: "3.12"
-  #           torch-version: "2.0.1"
-  #         - python-version: "3.12"
-  #           torch-version: "2.1.2"
-  #         # torch 2.0.1 does not support CUDA 12.x
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.1.1"
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.4.1"
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.6.3"
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.8.1"
-  #         # torch 2.7.0 does not support CUDA 12.4
-  #         - torch-version: "2.7.0"
-  #           cuda-version: "12.4.1"
-  #   steps:
-  #     - uses: actions/checkout@v4
+  build_wheels:
+    name: Build wheels and Upload
+    needs: create_releases
+    runs-on: ubuntu-22.04
+    env:
+      DEBIAN_FRONTEND: noninteractive
+      TERM: xterm-256color
+    timeout-minutes: 1000
+    strategy:
+      fail-fast: false
+      matrix:
+        flash-attn-version: ["2.4.3", "2.5.9", "2.6.3"]
+        python-version: ["3.10", "3.11", "3.12"]
+        torch-version: ["2.8.0.dev20250523"]
+        # https://developer.nvidia.com/cuda-toolkit-archive
+        cuda-version: ["12.8.1"]
+        exclude:
+          # torch < 2.2 does not support Python 3.12
+          - python-version: "3.12"
+            torch-version: "2.0.1"
+          - python-version: "3.12"
+            torch-version: "2.1.2"
+          # torch 2.0.1 does not support CUDA 12.x
+          - torch-version: "2.0.1"
+            cuda-version: "12.1.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.4.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.6.3"
+          - torch-version: "2.0.1"
+            cuda-version: "12.8.1"
+          # torch 2.7.0 does not support CUDA 12.4
+          - torch-version: "2.7.0"
+            cuda-version: "12.4.1"
+    steps:
+      - uses: actions/checkout@v4
 
-  #     - name: Maximize build space
-  #       run: |
-  #         df -h
-  #         echo "-----------------------------"
-  #         sudo rm -rf /usr/share/dotnet
-  #         sudo rm -rf /usr/local/lib/android
-  #         sudo rm -rf /opt/ghc
-  #         sudo rm -rf /opt/hostedtoolcache/CodeQL
-  #         df -h
+      - name: Maximize build space
+        run: |
+          df -h
+          echo "-----------------------------"
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          df -h
 
-  #     - name: Set Swap Space
-  #       uses: pierotofy/set-swap-space@master
-  #       with:
-  #         swap-size-gb: 48
+      - name: Set Swap Space
+        uses: pierotofy/set-swap-space@master
+        with:
+          swap-size-gb: 48
 
-  #     - uses: actions/setup-python@v5
-  #       with:
-  #         python-version: ${{ matrix.python-version }}
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
 
-  #     - uses: Jimver/cuda-toolkit@master
-  #       with:
-  #         cuda: ${{ matrix.cuda-version }}
-  #         linux-local-args: '["--toolkit"]'
-  #         method: "network"
+      - uses: Jimver/cuda-toolkit@master
+        with:
+          cuda: ${{ matrix.cuda-version }}
+          linux-local-args: '["--toolkit"]'
+          method: "network"
 
-  #     - name: Set CUDA and PyTorch versions
-  #       run: |
-  #         echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
-  #         echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
+      - name: Install build dependencies
+        run: |
+          sudo apt-get update && sudo apt-get install -y ninja-build clang  # refresh the package index first; the image's lists can be stale
+          pip install -U pip setuptools==75.8.0 wheel packaging psutil  # setuptools stays pinned; the extra unpinned entry was redundant
 
-  #     - name: Install build dependencies
-  #       run: |
-  #         sudo apt install -y ninja-build clang
-  #         pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
+      - name: Set CUDA and PyTorch versions and environment variables
+        run: |
+          # Each `run` step starts a fresh shell, so a plain `export` here would be gone before
+          # "Build wheels" runs; persist the settings through GITHUB_PATH / GITHUB_ENV instead.
+          printf '%s\n' /usr/local/nvidia/bin /usr/local/nvidia/lib64 >> "$GITHUB_PATH"
+          echo "LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+          printf '%s\n' MAX_JOBS=2 NVCC_THREADS=2 FLASH_ATTENTION_FORCE_BUILD=TRUE >> "$GITHUB_ENV"
 
-  #     - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
-  #       run: |
-  #         export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
-  #             support_cuda_versions = { \
-  #                 '2.0': [117, 118], \
-  #                 '2.1': [118, 121], \
-  #                 '2.2': [118, 121], \
-  #                 '2.3': [118, 121], \
-  #                 '2.4': [118, 121, 124], \
-  #                 '2.5': [118, 121, 124], \
-  #                 '2.6': [118, 124, 126], \
-  #                 '2.7': [118, 126, 128], \
-  #                 '2.8': [128], \
-  #             }; \
-  #             target_cuda_versions = support_cuda_versions[env['MATRIX_TORCH_VERSION']]; \
-  #             cuda_version = int(env['MATRIX_CUDA_VERSION']); \
-  #             closest_version = min(target_cuda_versions, key=lambda x: abs(x - cuda_version)); \
-  #             print(closest_version) \
-  #         ")
+      - name: Build wheels
+        timeout-minutes: 800
+        run: |
+          chmod +x build_linux.sh
+          ./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
+          echo "wheel_name=$wheel_name" >> $GITHUB_ENV
 
-  #         if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
-  #           pip install --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
-  #         else
-  #           pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
-  #         fi
+      - name: Install Test
+        run: |
+          pip install flash-attention/dist/${{ env.wheel_name }}
+          python -c "import flash_attn; print(flash_attn.__version__)"
 
-  #         nvcc --version
-  #         python -V
-  #         python -c "import torch; print('PyTorch:', torch.__version__)"
-  #         python -c "import torch; print('CUDA:', torch.version.cuda)"
-  #         python -c "from torch.utils import cpp_extension; print(cpp_extension.CUDA_HOME)"
+      - name: Get the tag version
+        id: extract_branch
+        run: echo "branch=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
 
-  #     - name: Checkout flash-attn
-  #       run: |
-  #         git clone https://github.com/Dao-AILab/flash-attention.git -b "v${{ matrix.flash-attn-version }}"
+      - name: Get Release with Tag
+        id: get_release
+        uses: joutvhu/get-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ steps.extract_branch.outputs.branch }}
 
-  #     - name: Build wheels
-  #       timeout-minutes: 800
-  #       run: |
-  #         export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
-  #         export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-  #         export MAX_JOBS=2
-  #         export NVCC_THREADS=2
-  #         export FLASH_ATTENTION_FORCE_BUILD=TRUE
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.get_release.outputs.upload_url }}
+          asset_path: flash-attention/dist/${{ env.wheel_name }}
+          asset_name: ${{ env.wheel_name }}
+          asset_content_type: application/*
 
-  #         cd flash-attention
-  #         python setup.py bdist_wheel --dist-dir=dist
-  #         base_wheel_name=$(basename $(ls dist/*.whl | head -n 1))
-  #         wheel_name=$(echo $base_wheel_name | sed "s/${{ matrix.flash-attn-version }}/${{ matrix.flash-attn-version }}+cu${{ env.MATRIX_CUDA_VERSION }}torch${{ env.MATRIX_TORCH_VERSION }}/")
-  #         mv dist/$base_wheel_name dist/$wheel_name
-  #         echo "wheel_name=$wheel_name" >> $GITHUB_ENV
+  # #########################################################
+  # Build wheels with self-hosted runner
+  # #########################################################
+  build_wheels_self_hosted:
+    name: Build wheels and Upload
+    needs: create_releases
+    runs-on: self-hosted
+    container:
+      image: ubuntu:22.04
+    defaults:
+      run:
+        shell: bash
+    env:
+      DEBIAN_FRONTEND: noninteractive
+      TERM: xterm-256color
+    timeout-minutes: 1000
+    strategy:
+      fail-fast: false
+      matrix:
+        flash-attn-version: ["2.7.4"]
+        python-version: ["3.10", "3.11", "3.12"]
+        torch-version: ["2.8.0.dev20250523"]
+        # https://developer.nvidia.com/cuda-toolkit-archive
+        cuda-version: ["12.8.1"]
+        exclude:
+          # torch < 2.2 does not support Python 3.12
+          - python-version: "3.12"
+            torch-version: "2.0.1"
+          - python-version: "3.12"
+            torch-version: "2.1.2"
+          # torch 2.0.1 does not support CUDA 12.x
+          - torch-version: "2.0.1"
+            cuda-version: "12.1.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.4.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.6.3"
+          - torch-version: "2.0.1"
+            cuda-version: "12.8.1"
+          # torch 2.7.0 does not support CUDA 12.4
+          - torch-version: "2.7.0"
+            cuda-version: "12.4.1"
 
-  #     - name: Install Test
-  #       run: |
-  #         pip install flash-attention/dist/${{ env.wheel_name }}
-  #         python -c "import flash_attn; print(flash_attn.__version__)"
+    steps:
+      - name: Install tools
+        shell: bash
+        run: |
+          apt-get update && apt-get install -y --no-install-recommends \
+            curl \
+            ca-certificates \
+            sudo \
+            software-properties-common \
+            wget \
+            unzip \
+            zip \
+            git \
+            build-essential \
+            gcc \
+            g++ \
+            clang \
+            ninja-build \
+            keyboard-configuration
 
-  #     - name: Get the tag version
-  #       id: extract_branch
-  #       run: echo "branch=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
+      - uses: actions/checkout@v4
 
-  #     - name: Get Release with Tag
-  #       id: get_release
-  #       uses: joutvhu/get-release@v1
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #       with:
-  #         tag_name: ${{ steps.extract_branch.outputs.branch }}
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
 
-  #     - name: Upload Release Asset
-  #       uses: actions/upload-release-asset@v1
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #       with:
-  #         upload_url: ${{ steps.get_release.outputs.upload_url }}
-  #         asset_path: flash-attention/dist/${{ env.wheel_name }}
-  #         asset_name: ${{ env.wheel_name }}
-  #         asset_content_type: application/*
+      - uses: Jimver/cuda-toolkit@master
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        with:
+          cuda: ${{ matrix.cuda-version }}
+          linux-local-args: '["--toolkit"]'
+          method: "network"
 
-  # # #########################################################
-  # # Build wheels with self-hosted runner
-  # # #########################################################
-  # build_wheels_self_hosted:
-  #   name: Build wheels and Upload
-  #   needs: create_releases
-  #   runs-on: self-hosted
-  #   container:
-  #     image: ubuntu:22.04
-  #   defaults:
-  #     run:
-  #       shell: bash
-  #   env:
-  #     DEBIAN_FRONTEND: noninteractive
-  #     TERM: xterm-256color
-  #   timeout-minutes: 1000
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       # flash-attn-version: ["2.7.4"]
-  #       # python-version: ["3.10", "3.11", "3.12"]
-  #       # torch-version: ["2.8.0.dev20250523"]
-  #       # https://developer.nvidia.com/cuda-toolkit-archive
-  #       # cuda-version: ["12.8.1"]
-  #       flash-attn-version: []
-  #       python-version: []
-  #       torch-version: []
-  #       cuda-version: []
-  #       exclude:
-  #         # torch < 2.2 does not support Python 3.12
-  #         - python-version: "3.12"
-  #           torch-version: "2.0.1"
-  #         - python-version: "3.12"
-  #           torch-version: "2.1.2"
-  #         # torch 2.0.1 does not support CUDA 12.x
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.1.1"
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.4.1"
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.6.3"
-  #         - torch-version: "2.0.1"
-  #           cuda-version: "12.8.1"
-  #         # torch 2.7.0 does not support CUDA 12.4
-  #         - torch-version: "2.7.0"
-  #           cuda-version: "12.4.1"
+      - name: Install build dependencies
+        run: |
+          pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
 
-  #   steps:
-  #     - name: Install tools
-  #       run: |
-  #         apt-get update && apt-get install -y --no-install-recommends \
-  #           curl \
-  #           ca-certificates \
-  #           sudo \
-  #           software-properties-common \
-  #           wget \
-  #           unzip \
-  #           zip \
-  #           git \
-  #           build-essential \
-  #           gcc \
-  #           g++ \
-  #           clang \
-  #           ninja-build \
-  #           keyboard-configuration
+      - name: Set CUDA and PyTorch versions and environment variables
+        run: |
+          # Each `run` step starts a fresh shell, so a plain `export` here would be gone before
+          # "Build wheels" runs; persist the settings through GITHUB_PATH / GITHUB_ENV instead.
+          printf '%s\n' /usr/local/nvidia/bin /usr/local/nvidia/lib64 >> "$GITHUB_PATH"
+          echo "LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+          printf '%s\n' MAX_JOBS=2 NVCC_THREADS=2 FLASH_ATTENTION_FORCE_BUILD=TRUE >> "$GITHUB_ENV"
 
-  #     - uses: actions/checkout@v4
+      - name: Build wheels
+        timeout-minutes: 800
+        run: |
+          chmod +x build_linux.sh
+          ./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
+          echo "wheel_name=$wheel_name" >> $GITHUB_ENV
 
-  #     - uses: actions/setup-python@v5
-  #       with:
-  #         python-version: ${{ matrix.python-version }}
+      - name: Install Test
+        run: |
+          pip install flash-attention/dist/${{ env.wheel_name }}
+          python -c "import flash_attn; print(flash_attn.__version__)"
 
-  #     - uses: Jimver/cuda-toolkit@master
-  #       env:
-  #         DEBIAN_FRONTEND: noninteractive
-  #       with:
-  #         cuda: ${{ matrix.cuda-version }}
-  #         linux-local-args: '["--toolkit"]'
-  #         method: "network"
+      - name: Get the tag version
+        id: extract_branch
+        shell: bash
+        run: echo "branch=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
 
-  #     - name: Set CUDA and PyTorch versions
-  #       run: |
-  #         echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
-  #         echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
-  #       shell: bash
+      - name: Get Release with Tag
+        id: get_release
+        uses: joutvhu/get-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ steps.extract_branch.outputs.branch }}
 
-  #     - name: Install build dependencies
-  #       run: |
-  #         sudo apt install -y ninja-build clang
-  #         pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.get_release.outputs.upload_url }}
+          asset_path: flash-attention/dist/${{ env.wheel_name }}
+          asset_name: ${{ env.wheel_name }}
+          asset_content_type: application/*
 
-  #     - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
-  #       shell: bash
-  #       run: |
-  #         export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
-  #             support_cuda_versions = { \
-  #                 '2.0': [117, 118], \
-  #                 '2.1': [118, 121], \
-  #                 '2.2': [118, 121], \
-  #                 '2.3': [118, 121], \
-  #                 '2.4': [118, 121, 124], \
-  #                 '2.5': [118, 121, 124], \
-  #                 '2.6': [118, 124, 126], \
-  #                 '2.7': [118, 126, 128], \
-  #                 '2.8': [128], \
-  #             }; \
-  #             target_cuda_versions = support_cuda_versions[env['MATRIX_TORCH_VERSION']]; \
-  #             cuda_version = int(env['MATRIX_CUDA_VERSION']); \
-  #             closest_version = min(target_cuda_versions, key=lambda x: abs(x - cuda_version)); \
-  #             print(closest_version) \
-  #         ")
-
-  #         if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
-  #           pip install --pre --force-reinstall --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
-  #         else
-  #           pip install --force-reinstall --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
-  #         fi
-
-  #         nvcc --version
-  #         python -V
-  #         python -c "import torch; print('PyTorch:', torch.__version__)"
-  #         python -c "import torch; print('CUDA:', torch.version.cuda)"
-  #         python -c "from torch.utils import cpp_extension; print(cpp_extension.CUDA_HOME)"
-
-  #     - name: Checkout flash-attn
-  #       run: |
-  #         git clone https://github.com/Dao-AILab/flash-attention.git -b "v${{ matrix.flash-attn-version }}"
-
-  #     - name: Build wheels
-  #       timeout-minutes: 800
-  #       shell: bash
-  #       run: |
-  #         export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
-  #         export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-  #         export MAX_JOBS=2
-  #         export NVCC_THREADS=4
-  #         export FLASH_ATTENTION_FORCE_BUILD=TRUE
-
-  #         cd flash-attention
-  #         python setup.py bdist_wheel --dist-dir=dist
-  #         base_wheel_name=$(basename $(ls dist/*.whl | head -n 1))
-  #         wheel_name=$(echo $base_wheel_name | sed "s/${{ matrix.flash-attn-version }}/${{ matrix.flash-attn-version }}+cu${{ env.MATRIX_CUDA_VERSION }}torch${{ env.MATRIX_TORCH_VERSION }}/")
-  #         mv dist/$base_wheel_name dist/$wheel_name
-  #         echo "wheel_name=$wheel_name" >> $GITHUB_ENV
-
-  #     - name: Install Test
-  #       run: |
-  #         pip install flash-attention/dist/${{ env.wheel_name }}
-  #         python -c "import flash_attn; print(flash_attn.__version__)"
-
-  #     - name: Get the tag version
-  #       id: extract_branch
-  #       shell: bash
-  #       run: echo "branch=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
-
-  #     - name: Get Release with Tag
-  #       id: get_release
-  #       uses: joutvhu/get-release@v1
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #       with:
-  #         tag_name: ${{ steps.extract_branch.outputs.branch }}
-
-  #     - name: Upload Release Asset
-  #       uses: actions/upload-release-asset@v1
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  #       with:
-  #         upload_url: ${{ steps.get_release.outputs.upload_url }}
-  #         asset_path: flash-attention/dist/${{ env.wheel_name }}
-  #         asset_name: ${{ env.wheel_name }}
-  #         asset_content_type: application/*
-
-  #     - name: Clean up
-  #       if: always()
-  #       run: |
-  #         sudo rm -rf /opt/hostedtoolcache/Python/${{ matrix.python-version }}*
+      - name: Clean up
+        if: always()
+        run: |
+          sudo rm -rf /opt/hostedtoolcache/Python/${{ matrix.python-version }}*
 
   build_windows_wheels:
     name: Build wheels and Upload
diff --git a/.github/workflows/test_workflow.yml b/.github/workflows/test_workflow.yml
new file mode 100644
index 0000000..6046ff8
--- /dev/null
+++ b/.github/workflows/test_workflow.yml
@@ -0,0 +1,246 @@
+name: Test workflow
+
+on:
+  workflow_dispatch:
+
+jobs:
+  # #########################################################
+  # Build wheels with GitHub hosted runner
+  # #########################################################
+  build_wheels:
+    name: Build wheels
+    runs-on: ubuntu-22.04
+    env:
+      DEBIAN_FRONTEND: noninteractive
+      TERM: xterm-256color
+    timeout-minutes: 1000
+    strategy:
+      fail-fast: false
+      matrix:
+        flash-attn-version: ["2.4.3", "2.5.9", "2.6.3"]
+        python-version: ["3.10", "3.11", "3.12"]
+        torch-version: ["2.8.0.dev20250523"]
+        # https://developer.nvidia.com/cuda-toolkit-archive
+        cuda-version: ["12.8.1"]
+        exclude:
+          # torch < 2.2 does not support Python 3.12
+          - python-version: "3.12"
+            torch-version: "2.0.1"
+          - python-version: "3.12"
+            torch-version: "2.1.2"
+          # torch 2.0.1 does not support CUDA 12.x
+          - torch-version: "2.0.1"
+            cuda-version: "12.1.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.4.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.6.3"
+          - torch-version: "2.0.1"
+            cuda-version: "12.8.1"
+          # torch 2.7.0 does not support CUDA 12.4
+          - torch-version: "2.7.0"
+            cuda-version: "12.4.1"
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Maximize build space
+        run: |
+          df -h
+          echo "-----------------------------"
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          df -h
+
+      - name: Set Swap Space
+        uses: pierotofy/set-swap-space@master
+        with:
+          swap-size-gb: 48
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - uses: Jimver/cuda-toolkit@master
+        with:
+          cuda: ${{ matrix.cuda-version }}
+          linux-local-args: '["--toolkit"]'
+          method: "network"
+
+      - name: Install build dependencies
+        run: |
+          sudo apt-get update && sudo apt-get install -y ninja-build clang  # refresh the package index first; the image's lists can be stale
+          pip install -U pip setuptools==75.8.0 wheel packaging psutil  # setuptools stays pinned; the extra unpinned entry was redundant
+
+      - name: Set CUDA and PyTorch versions and environment variables
+        run: |
+          # Each `run` step starts a fresh shell, so a plain `export` here would be gone before
+          # "Build wheels" runs; persist the settings through GITHUB_PATH / GITHUB_ENV instead.
+          printf '%s\n' /usr/local/nvidia/bin /usr/local/nvidia/lib64 >> "$GITHUB_PATH"
+          echo "LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+          printf '%s\n' MAX_JOBS=2 NVCC_THREADS=2 FLASH_ATTENTION_FORCE_BUILD=TRUE >> "$GITHUB_ENV"
+
+      - name: Build wheels
+        timeout-minutes: 800
+        run: |
+          chmod +x build_linux.sh
+          ./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
+          echo "wheel_name=$wheel_name" >> $GITHUB_ENV
+
+      - name: Install Test
+        run: |
+          pip install flash-attention/dist/${{ env.wheel_name }}
+          python -c "import flash_attn; print(flash_attn.__version__)"
+
+      - name: Get the tag version
+        id: extract_branch
+        run: echo "branch=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
+
+      - name: Get Release with Tag
+        id: get_release
+        uses: joutvhu/get-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ steps.extract_branch.outputs.branch }}
+
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.get_release.outputs.upload_url }}
+          asset_path: flash-attention/dist/${{ env.wheel_name }}
+          asset_name: ${{ env.wheel_name }}
+          asset_content_type: application/*
+
+  # #########################################################
+  # Build wheels with self-hosted runner
+  # #########################################################
+  build_wheels_self_hosted:
+    name: Build wheels
+    runs-on: self-hosted
+    container:
+      image: ubuntu:22.04
+    defaults:
+      run:
+        shell: bash
+    env:
+      DEBIAN_FRONTEND: noninteractive
+      TERM: xterm-256color
+    timeout-minutes: 1000
+    strategy:
+      fail-fast: false
+      matrix:
+        flash-attn-version: ["2.7.4"]
+        python-version: ["3.10", "3.11", "3.12"]
+        torch-version: ["2.8.0.dev20250523"]
+        # https://developer.nvidia.com/cuda-toolkit-archive
+        cuda-version: ["12.8.1"]
+        exclude:
+          # torch < 2.2 does not support Python 3.12
+          - python-version: "3.12"
+            torch-version: "2.0.1"
+          - python-version: "3.12"
+            torch-version: "2.1.2"
+          # torch 2.0.1 does not support CUDA 12.x
+          - torch-version: "2.0.1"
+            cuda-version: "12.1.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.4.1"
+          - torch-version: "2.0.1"
+            cuda-version: "12.6.3"
+          - torch-version: "2.0.1"
+            cuda-version: "12.8.1"
+          # torch 2.7.0 does not support CUDA 12.4
+          - torch-version: "2.7.0"
+            cuda-version: "12.4.1"
+
+    steps:
+      - name: Install tools
+        shell: bash
+        run: |
+          apt-get update && apt-get install -y --no-install-recommends \
+            curl \
+            ca-certificates \
+            sudo \
+            software-properties-common \
+            wget \
+            unzip \
+            zip \
+            git \
+            build-essential \
+            gcc \
+            g++ \
+            clang \
+            ninja-build \
+            keyboard-configuration
+
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - uses: Jimver/cuda-toolkit@master
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        with:
+          cuda: ${{ matrix.cuda-version }}
+          linux-local-args: '["--toolkit"]'
+          method: "network"
+
+      - name: Install build dependencies
+        run: |
+          pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
+
+      - name: Set CUDA and PyTorch versions and environment variables
+        run: |
+          # Each `run` step starts a fresh shell, so a plain `export` here would be gone before
+          # "Build wheels" runs; persist the settings through GITHUB_PATH / GITHUB_ENV instead.
+          printf '%s\n' /usr/local/nvidia/bin /usr/local/nvidia/lib64 >> "$GITHUB_PATH"
+          echo "LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
+          printf '%s\n' MAX_JOBS=2 NVCC_THREADS=2 FLASH_ATTENTION_FORCE_BUILD=TRUE >> "$GITHUB_ENV"
+
+      - name: Build wheels
+        timeout-minutes: 800
+        run: |
+          chmod +x build_linux.sh
+          ./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
+          echo "wheel_name=$wheel_name" >> $GITHUB_ENV
+
+      - name: Install Test
+        run: |
+          pip install flash-attention/dist/${{ env.wheel_name }}
+          python -c "import flash_attn; print(flash_attn.__version__)"
+
+      - name: Get the tag version
+        id: extract_branch
+        shell: bash
+        run: echo "branch=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
+
+      - name: Get Release with Tag
+        id: get_release
+        uses: joutvhu/get-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ steps.extract_branch.outputs.branch }}
+
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.get_release.outputs.upload_url }}
+          asset_path: flash-attention/dist/${{ env.wheel_name }}
+          asset_name: ${{ env.wheel_name }}
+          asset_content_type: application/*
+
+      - name: Clean up
+        if: always()
+        run: |
+          sudo rm -rf /opt/hostedtoolcache/Python/${{ matrix.python-version }}*
diff --git a/build_linux.sh b/build_linux.sh
new file mode 100755
index 0000000..2949824
--- /dev/null
+++ b/build_linux.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Positional parameters: all four are required (there are no defaults)
+FLASH_ATTN_VERSION=${1:?usage: build_linux.sh FLASH_ATTN_VERSION PYTHON_VERSION TORCH_VERSION CUDA_VERSION}
+PYTHON_VERSION=${2:?missing PYTHON_VERSION}
+TORCH_VERSION=${3:?missing TORCH_VERSION}
+CUDA_VERSION=${4:?missing CUDA_VERSION}
+
+echo "Building Flash Attention with parameters:"
+echo "  Flash-Attention: $FLASH_ATTN_VERSION"
+echo "  Python: $PYTHON_VERSION"
+echo "  PyTorch: $TORCH_VERSION"
+echo "  CUDA: $CUDA_VERSION"
+
+# Derive short versions, e.g. CUDA 12.8.1 -> 128 and PyTorch 2.8.0.dev20250523 -> 2.8
+MATRIX_CUDA_VERSION=$(echo "$CUDA_VERSION" | awk -F \. {'print $1 $2'})
+MATRIX_TORCH_VERSION=$(echo "$TORCH_VERSION" | awk -F \. {'print $1 "." $2'})
+export MATRIX_CUDA_VERSION MATRIX_TORCH_VERSION  # read via os.environ by the Python snippet below
+echo "Derived versions:"
+echo "  CUDA Matrix: $MATRIX_CUDA_VERSION"
+echo "  Torch Matrix: $MATRIX_TORCH_VERSION"
+
+# Install PyTorch from the wheel index whose CUDA build is closest to the requested toolkit
+echo "Installing PyTorch $TORCH_VERSION+cu$CUDA_VERSION..."
+TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
+    support_cuda_versions = { \
+        '2.0': [117, 118], \
+        '2.1': [118, 121], \
+        '2.2': [118, 121], \
+        '2.3': [118, 121], \
+        '2.4': [118, 121, 124], \
+        '2.5': [118, 121, 124], \
+        '2.6': [118, 124, 126], \
+        '2.7': [118, 126, 128], \
+        '2.8': [128], \
+    }; \
+    target_cuda_versions = support_cuda_versions[env['MATRIX_TORCH_VERSION']]; \
+    cuda_version = int(env['MATRIX_CUDA_VERSION']); \
+    closest_version = min(target_cuda_versions, key=lambda x: abs(x - cuda_version)); \
+    print(closest_version) \
+")
+
+if [[ $TORCH_VERSION == *"dev"* ]]; then
+  pip install --pre "torch==$TORCH_VERSION" --index-url "https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}"
+else
+  pip install --no-cache-dir "torch==$TORCH_VERSION" --index-url "https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}"
+fi
+
+# Verify installation
+echo "Verifying installations..."
+nvcc --version
+python -V
+python -c "import torch; print('PyTorch:', torch.__version__)"
+python -c "import torch; print('CUDA:', torch.version.cuda)"
+python -c "from torch.utils import cpp_extension; print(cpp_extension.CUDA_HOME)"
+
+# Checkout flash-attn
+echo "Checking out flash-attention v$FLASH_ATTN_VERSION..."
+git clone https://github.com/Dao-AILab/flash-attention.git -b "v$FLASH_ATTN_VERSION"
+
+# Build the wheel, then tag its file name with the CUDA/PyTorch variant it was built against
+echo "Building wheels..."
+cd flash-attention
+python setup.py bdist_wheel --dist-dir=dist
+base_wheel_name=$(basename "$(ls dist/*.whl | head -n 1)")
+wheel_name=$(echo "$base_wheel_name" | sed "s/$FLASH_ATTN_VERSION/$FLASH_ATTN_VERSION+cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}/")
+mv "dist/$base_wheel_name" "dist/$wheel_name"
+echo "Built wheel: $wheel_name"