Merge pull request #65 from mjun0812:feat/add-arm64

Feat/add-arm64
2026-07-01 01:37:53 -04:00 · 2025-12-05 01:16:06 +09:00
parent cdfbcd8034 d1715633db
commit 68ae61e4d2
9 changed files with 172 additions and 214 deletions
@@ -2,7 +2,7 @@
 # Build wheels with GitHub-hosted runner
 # #########################################################

-name: "[Linux x86_64] Build wheels and upload to GitHub Releases"
+name: "[Linux] Build wheels and upload to GitHub Releases"

 on:
  workflow_call:
@@ -23,11 +23,21 @@ on:
        description: "CUDA version"
        required: true
        type: string
+      runner:
+        description: "Runner type"
+        required: false
+        type: string
+        default: "ubuntu-22.04"
+      is-upload:
+        description: "Whether to upload the release asset"
+        required: false
+        type: boolean
+        default: true

 jobs:
  build_wheels:
-    name: Build wheels and Upload (Linux x86_64, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
-    runs-on: ubuntu-22.04
+    # Show the runner label instead of a fixed architecture: this reusable
+    # workflow is called for both x86_64 and ARM64 via the `runner` input.
+    name: Build wheels and Upload (Linux, ${{ inputs.runner }})
+    runs-on: ${{ inputs.runner }}
    env:
      DEBIAN_FRONTEND: noninteractive
      TERM: xterm-256color
@@ -72,6 +82,7 @@ jobs:
          python -c "import flash_attn; print(flash_attn.__version__)"

      - name: Upload Release Asset
+        if: ${{ inputs.is-upload }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
@@ -2,7 +2,7 @@
 # Build wheels with self-hosted runner
 # #########################################################

-name: "[Linux x86_64, self-hosted runner] Build wheels and upload to GitHub Releases"
+name: "[Linux x86_64, self-hosted] Build wheels and upload to GitHub Releases"

 on:
  workflow_call:
@@ -23,11 +23,21 @@ on:
        description: "CUDA version"
        required: true
        type: string
+      runner:
+        description: "Runner type"
+        required: false
+        type: string
+        default: "self-hosted"
+      is-upload:
+        description: "Whether to upload the release asset"
+        required: false
+        type: boolean
+        default: true

 jobs:
  build_wheels_self_hosted:
-    name: Build wheels and Upload (Linux x86_64, self-hosted runner, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
-    runs-on: self-hosted
+    name: Build wheels and Upload (Linux x86_64, self-hosted runner)
+    runs-on: ${{ inputs.runner }}
    container:
      image: ubuntu:22.04
    defaults:
@@ -109,6 +119,7 @@ jobs:
          python -c "import flash_attn; print(flash_attn.__version__)"

      - name: Upload Release Asset
+        if: ${{ inputs.is-upload }}
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -23,11 +23,21 @@ on:
        description: "CUDA version"
        required: true
        type: string
+      runner:
+        description: "Runner type"
+        required: false
+        type: string
+        default: "windows-2022"
+      is-upload:
+        description: "Whether to upload the release asset"
+        required: false
+        type: boolean
+        default: true

 jobs:
  build_windows_wheels:
-    name: Build wheels and Upload (Windows x86_64, GitHub hosted runner, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
-    runs-on: windows-2022
+    name: Build wheels and Upload (Windows x86_64, GitHub hosted runner)
+    runs-on: ${{ inputs.runner }}
    env:
      MAX_JOBS: 2
      NVCC_THREADS: 2
@@ -82,6 +92,7 @@ jobs:
          python -c "import flash_attn; print(flash_attn.__version__)"

      - name: Upload Release Asset
+        if: ${{ inputs.is-upload }}
        shell: pwsh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -22,10 +22,15 @@ on:
        description: "CUDA version"
        required: true
        type: string
+      is-upload:
+        description: "Whether to upload the release asset"
+        required: false
+        type: boolean
+        default: true

 jobs:
  build_wheels:
-    name: Build wheels and Upload (Windows x86_64, AWS CodeBuild, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
+    name: Build wheels and Upload (Windows x86_64, AWS CodeBuild)
    timeout-minutes: 2160
    runs-on: codebuild-flash-attention-pre-build-wheel-windows-${{ github.run_id }}-${{ github.run_attempt }}
    # Large Instance
@@ -84,6 +89,7 @@ jobs:
          python -c "import flash_attn; print(flash_attn.__version__)"

      - name: Upload Release Asset
+        if: ${{ inputs.is-upload }}
        shell: pwsh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -36,7 +36,7 @@ jobs:
  # Linux
  # #########################################################
  build_wheels_linux:
-    name: Build Linux
+    name: Build Linux x86_64
    needs: [create_releases, create_matrix]
    if: ${{ fromjson(needs.create_matrix.outputs.matrix).linux }}
    strategy:
@@ -55,6 +55,27 @@ jobs:
      cuda-version: ${{ matrix.cuda-version }}
    secrets: inherit

+  # Linux ARM64 wheels: calls the same reusable Linux workflow as the x86_64
+  # job and overrides only the runner label.
+  build_wheels_linux_arm64:
+    name: Build Linux ARM64
+    needs: [create_releases, create_matrix]
+    # Runs only when the generated matrix has a truthy `linux_arm64` entry.
+    if: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64 }}
+    strategy:
+      fail-fast: false
+      matrix:
+        flash-attn-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.flash-attn-version }}
+        python-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.python-version }}
+        torch-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.torch-version }}
+        cuda-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.cuda-version }}
+        exclude: ${{ fromjson(needs.create_matrix.outputs.matrix).exclude }}
+    uses: ./.github/workflows/_build_linux.yml
+    with:
+      flash-attn-version: ${{ matrix.flash-attn-version }}
+      python-version: ${{ matrix.python-version }}
+      torch-version: ${{ matrix.torch-version }}
+      cuda-version: ${{ matrix.cuda-version }}
+      # GitHub-hosted arm64 runner label; "ubuntu-22.04-arm64" is not a
+      # hosted label and would leave the job waiting for a runner.
+      runner: "ubuntu-22.04-arm"
+    secrets: inherit
+
  build_wheels_linux_self_hosted:
    name: Build Linux (self-hosted)
    needs: [create_releases, create_matrix]
@@ -8,61 +8,21 @@ jobs:
  # Build wheels with self-hosted runner
  # #########################################################
  build_wheels_self_hosted:
-    name: Build wheels and Upload (Linux arm64)
-    runs-on: ubuntu-22.04-arm
-    env:
-      DEBIAN_FRONTEND: noninteractive
-      TERM: xterm-256color
+    name: Build wheels and Upload (Linux ARM64, self-hosted runner)
+    uses: ./.github/workflows/_build_linux.yml
    strategy:
      fail-fast: false
      matrix:
-        flash-attn-version: ["2.8.0"]
-        python-version: ["3.11"]
-        torch-version: ["2.7.1"]
+        flash-attn-version: ["2.8.3"]
+        python-version: ["3.13"]
+        torch-version: ["2.9.1"]
        # https://developer.nvidia.com/cuda-toolkit-archive
-        cuda-version: ["12.8.1"]
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Maximize build space
-        run: |
-          df -h
-          echo "-----------------------------"
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /opt/hostedtoolcache/CodeQL
-          df -h
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - uses: mjun0812/setup-cuda@v1
-        with:
-          version: "${{ matrix.cuda-version }}"
-
-      - name: Install build dependencies
-        shell: bash
-        run: |
-          sudo apt install -y ninja-build clang time
-          pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
-
-      - name: Build wheels
-        id: build_wheels
-        shell: bash
-        env:
-          MAX_JOBS: 2
-          NVCC_THREADS: 1
-        run: |
-          chmod +x build_linux.sh
-          ./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
-          wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
-          echo "wheel_name=$wheel_name" >> $GITHUB_ENV
-
-      - name: Install Test
-        shell: bash
-        run: |
-          pip uninstall -y flash-attn > /dev/null 2>&1
-          pip install --no-cache-dir flash-attention/dist/${{ steps.build_wheels.outputs.WHEEL_NAME }}
-          python -c "import flash_attn; print(flash_attn.__version__)"
+        cuda-version: ["13.0.1"]
+        # GitHub-hosted arm64 runner label (same label the previous inline job used).
+        runner: ["ubuntu-22.04-arm"]
+    with:
+      flash-attn-version: ${{ matrix.flash-attn-version }}
+      python-version: ${{ matrix.python-version }}
+      torch-version: ${{ matrix.torch-version }}
+      cuda-version: ${{ matrix.cuda-version }}
+      # Test build only: skip the release upload step.
+      is-upload: false
+      # Pass the runner label from the matrix (not the CUDA version).
+      runner: ${{ matrix.runner }}
@@ -9,97 +9,20 @@ jobs:
  # #########################################################
  build_wheels_self_hosted:
    name: Build wheels and Upload (Linux x86_64, self-hosted runner)
-    runs-on: self-hosted
-    container:
-      image: ubuntu:22.04
-    defaults:
-      run:
-        shell: bash
-    env:
-      DEBIAN_FRONTEND: noninteractive
-      TERM: xterm-256color
-    timeout-minutes: 2000
+    uses: ./.github/workflows/_build_linux_self_host.yml
    strategy:
      fail-fast: false
      matrix:
-        flash-attn-version: ["2.8.0"]
-        python-version: ["3.11", "3.11"]
-        torch-version: ["2.7.1"]
+        flash-attn-version: ["2.8.3"]
+        # One entry per version: the `runner` axis already yields one job per
+        # runner, so a repeated version would only create duplicate jobs.
+        python-version: ["3.13"]
+        torch-version: ["2.9.1"]
        # https://developer.nvidia.com/cuda-toolkit-archive
-        cuda-version: ["12.8.1"]
-    steps:
-      - name: Install tools
-        shell: bash
-        run: |
-          apt-get update && apt-get install -y --no-install-recommends \
-            curl \
-            ca-certificates \
-            sudo \
-            software-properties-common \
-            wget \
-            unzip \
-            zip \
-            git \
-            build-essential \
-            gcc \
-            g++ \
-            clang \
-            ninja-build \
-            keyboard-configuration \
-            time
-
-      - name: Install gh
-        shell: bash
-        run: |
-          sudo mkdir -p -m 755 /etc/apt/keyrings
-          out=$(mktemp)
-          wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg
-          cat $out | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null
-          sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg
-          echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null
-          sudo apt update
-          sudo apt install gh -y
-
-      - uses: actions/checkout@v4
-
-      - name: Configure Git safe directory
-        shell: bash
-        run: |
-          git config --global --add safe.directory $(pwd)
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - uses: mjun0812/setup-cuda@v1
-        with:
-          version: "${{ matrix.cuda-version }}"
-
-      - name: Install build dependencies
-        shell: bash
-        run: |
-          sudo apt install -y ninja-build clang
-          pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
-
-      - name: Build wheels
-        timeout-minutes: 1200
-        id: build_wheels
-        shell: bash
-        run: |
-          chmod +x build_linux.sh
-          ./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
-          wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
-          echo "WHEEL_NAME=$wheel_name" >> $GITHUB_OUTPUT
-
-      - name: Install Test
-        shell: bash
-        run: |
-          pip uninstall -y flash-attn > /dev/null 2>&1
-          pip install --no-cache-dir flash-attention/dist/${{ steps.build_wheels.outputs.WHEEL_NAME }}
-          python -c "import flash_attn; print(flash_attn.__version__)"
-
-      - name: Clean up
-        shell: bash
-        if: always()
-        run: |
-          rm -rf /opt/hostedtoolcache/Python
+        cuda-version: ["13.0.1"]
+        runner: ["self-hosted", "openci-runner-beta"]
+    with:
+      flash-attn-version: ${{ matrix.flash-attn-version }}
+      python-version: ${{ matrix.python-version }}
+      torch-version: ${{ matrix.torch-version }}
+      cuda-version: ${{ matrix.cuda-version }}
+      # Test build only: skip the release upload step.
+      is-upload: false
+      # Pass the runner label from the matrix (not the CUDA version).
+      runner: ${{ matrix.runner }}
@@ -51,6 +51,58 @@ See [./docs/packages.md](./docs/packages.md) for the full list of available pack

 History of this repository is available [here](./docs/release_history.md).

+## Citation
+
+If you use this repository in your research and find it helpful, please cite it as follows:
+
+```bibtex
+@misc{flash-attention-prebuild-wheels,
+ author = {Morioka, Junya},
+ year = {2025},
+ title = {mjun0812/flash-attention-prebuild-wheels},
+ url = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
+ howpublished = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
+}
+```
+
+## Star History and Download Statistics
+
+<table>
+  <tr>
+    <td>
+      <a href="https://www.star-history.com/#mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left">
+        <picture>
+          <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&theme=dark&legend=top-left" />
+          <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" />
+          <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" height="300"/>
+        </picture>
+      </a>
+    </td>
+    <td>
+      <img alt="Download Statistics" src="./docs/data/download_graph.png" height="300"/>
+    </td>
+  </tr>
+</table>
+
+## Original Repository
+
+[repo](https://github.com/Dao-AILab/flash-attention)
+
+```bibtex
+@inproceedings{dao2022flashattention,
+  title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
+  author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
+  booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
+  year={2022}
+}
+@inproceedings{dao2023flashattention2,
+  title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
+  author={Dao, Tri},
+  booktitle={International Conference on Learning Representations (ICLR)},
+  year={2024}
+}
+```
+
 ## Self build

 If you cannot find the version you are looking for, you can fork this repository and create a wheel on GitHub Actions.
@@ -121,55 +173,3 @@ gh api \
  -X POST \
  /repos/[OWNER]/[REPOSITORY]/actions/runners/registration-token
 ```
-
-## Citation
-
-If you use this repository in your research and find it helpful, please cite the following paper!
-
-```bibtex
-@misc{flash-attention-prebuild-wheels,
- author = {Morioka, Junya},
- year = {2025},
- title = {mjun0812/flash-attention-prebuild-wheels},
- url = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
- howpublished = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
-}
-```
-
-## Star History and Download Statistics
-
-<table>
-  <tr>
-    <td>
-      <a href="https://www.star-history.com/#mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left">
-        <picture>
-          <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&theme=dark&legend=top-left" />
-          <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" />
-          <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" height="300"/>
-        </picture>
-      </a>
-    </td>
-    <td>
-      <img alt="Download Statistics" src="./docs/data/download_graph.png" height="300"/>
-    </td>
-  </tr>
-</table>
-
-## Original Repository
-
-[repo](https://github.com/Dao-AILab/flash-attention)
-
-```bibtex
-@inproceedings{dao2022flashattention,
-  title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
-  author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
-  booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
-  year={2022}
-}
-@inproceedings{dao2023flashattention2,
-  title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
-  author={Dao, Tri},
-  booktitle={International Conference on Learning Representations (ICLR)},
-  year={2024}
-}
-```
@@ -19,6 +19,12 @@ EXCLUDE = [
    {"torch-version": "2.7.1", "cuda-version": "12.9.1"},
    # torch >= 2.9 does not support Python 3.9
    {"torch-version": "2.9.1", "python-version": "3.9"},
+    # torch < 2.9 does not support CUDA 13.0
+    {"torch-version": "2.5.1", "cuda-version": "13.0.1"},
+    {"torch-version": "2.6.0", "cuda-version": "13.0.1"},
+    {"torch-version": "2.7.1", "cuda-version": "13.0.1"},
+    {"torch-version": "2.8.1", "cuda-version": "13.0.1"},
+    {"torch-version": "2.8.0", "cuda-version": "13.0.1"},
 ]

 LINUX_MATRIX = {
@@ -31,7 +37,7 @@ LINUX_MATRIX = {
        "3.10",
        "3.11",
        "3.12",
-        # "3.13"
+        "3.13",
    ],
    "torch-version": [
        "2.5.1",
@@ -42,13 +48,15 @@ LINUX_MATRIX = {
    ],
    "cuda-version": [
        "12.4.1",
-        "12.6.3",
+        # "12.6.3",
        "12.8.1",
        # "12.9.1",
        "13.0.2",
    ],
 }

+LINUX_ARM64_MATRIX = LINUX_MATRIX
+
 LINUX_SELF_HOSTED_MATRIX = {
    "flash-attn-version": ["2.7.4"],
    "python-version": ["3.10", "3.11", "3.12", "3.13"],
@@ -96,14 +104,21 @@ def main():
    print(
        json.dumps(
            {
-                "linux": LINUX_MATRIX,
-                # "linux": False,
-                # "linux_self_hosted": LINUX_SELF_HOSTED_MATRIX,
+                "linux": False,
+                # "linux": LINUX_MATRIX,
+                #
+                # "linux_arm64": False,
+                "linux_arm64": LINUX_ARM64_MATRIX,
+                #
                "linux_self_hosted": False,
-                "windows": WINDOWS_MATRIX,
-                # "windows": False,
-                # "windows_code_build": WINDOWS_CODEBUILD_MATRIX,
+                # "linux_self_hosted": LINUX_SELF_HOSTED_MATRIX,
+                #
+                "windows": False,
+                # "windows": WINDOWS_MATRIX,
+                #
                "windows_code_build": False,
+                # "windows_code_build": WINDOWS_CODEBUILD_MATRIX,
+                #
                "exclude": EXCLUDE,
            }
        )