Merge pull request #65 from mjun0812:feat/add-arm64

Feat/add-arm64
This commit is contained in:
Junya Morioka
2025-12-05 01:16:06 +09:00
committed by GitHub
9 changed files with 172 additions and 214 deletions
+14 -3
View File
@@ -2,7 +2,7 @@
# Build wheels with GitHub-hosted runner
# #########################################################
name: "[Linux x86_64] Build wheels and upload to GitHub Releases"
name: "[Linux] Build wheels and upload to GitHub Releases"
on:
workflow_call:
@@ -23,11 +23,21 @@ on:
description: "CUDA version"
required: true
type: string
runner:
description: "Runner type"
required: false
type: string
default: "ubuntu-22.04"
is-upload:
description: "Whether to upload the release asset"
required: false
type: boolean
default: true
jobs:
build_wheels:
name: Build wheels and Upload (Linux x86_64, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
runs-on: ubuntu-22.04
name: Build wheels and Upload (Linux x86_64, GitHub hosted runner)
runs-on: ${{ inputs.runner }}
env:
DEBIAN_FRONTEND: noninteractive
TERM: xterm-256color
@@ -72,6 +82,7 @@ jobs:
python -c "import flash_attn; print(flash_attn.__version__)"
- name: Upload Release Asset
if: ${{ inputs.is-upload }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
+14 -3
View File
@@ -2,7 +2,7 @@
# Build wheels with self-hosted runner
# #########################################################
name: "[Linux x86_64, self-hosted runner] Build wheels and upload to GitHub Releases"
name: "[Linux x86_64, self-hosted] Build wheels and upload to GitHub Releases"
on:
workflow_call:
@@ -23,11 +23,21 @@ on:
description: "CUDA version"
required: true
type: string
runner:
description: "Runner type"
required: false
type: string
default: "self-hosted"
is-upload:
description: "Whether to upload the release asset"
required: false
type: boolean
default: true
jobs:
build_wheels_self_hosted:
name: Build wheels and Upload (Linux x86_64, self-hosted runner, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
runs-on: self-hosted
name: Build wheels and Upload (Linux x86_64, self-hosted runner)
runs-on: ${{ inputs.runner }}
container:
image: ubuntu:22.04
defaults:
@@ -109,6 +119,7 @@ jobs:
python -c "import flash_attn; print(flash_attn.__version__)"
- name: Upload Release Asset
if: ${{ inputs.is-upload }}
shell: bash
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+13 -2
View File
@@ -23,11 +23,21 @@ on:
description: "CUDA version"
required: true
type: string
runner:
description: "Runner type"
required: false
type: string
default: "windows-2022"
is-upload:
description: "Whether to upload the release asset"
required: false
type: boolean
default: true
jobs:
build_windows_wheels:
name: Build wheels and Upload (Windows x86_64, GitHub hosted runner, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
runs-on: windows-2022
name: Build wheels and Upload (Windows x86_64, GitHub hosted runner)
runs-on: ${{ inputs.runner }}
env:
MAX_JOBS: 2
NVCC_THREADS: 2
@@ -82,6 +92,7 @@ jobs:
python -c "import flash_attn; print(flash_attn.__version__)"
- name: Upload Release Asset
if: ${{ inputs.is-upload }}
shell: pwsh
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -22,10 +22,15 @@ on:
description: "CUDA version"
required: true
type: string
is-upload:
description: "Whether to upload the release asset"
required: false
type: boolean
default: true
jobs:
build_wheels:
name: Build wheels and Upload (Windows x86_64, AWS CodeBuild, ${{ inputs.flash-attn-version }}, ${{ inputs.python-version }}, ${{ inputs.torch-version }}, ${{ inputs.cuda-version }})
name: Build wheels and Upload (Windows x86_64, AWS CodeBuild)
timeout-minutes: 2160
runs-on: codebuild-flash-attention-pre-build-wheel-windows-${{ github.run_id }}-${{ github.run_attempt }}
# Large Instance
@@ -84,6 +89,7 @@ jobs:
python -c "import flash_attn; print(flash_attn.__version__)"
- name: Upload Release Asset
if: ${{ inputs.is-upload }}
shell: pwsh
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+22 -1
View File
@@ -36,7 +36,7 @@ jobs:
# Linux
# #########################################################
build_wheels_linux:
name: Build Linux
name: Build Linux x86_64
needs: [create_releases, create_matrix]
if: ${{ fromjson(needs.create_matrix.outputs.matrix).linux }}
strategy:
@@ -55,6 +55,27 @@ jobs:
cuda-version: ${{ matrix.cuda-version }}
secrets: inherit
# Build Linux ARM64 wheels by calling the shared Linux build workflow with an
# arm64 runner label. The job is skipped when the generated matrix carries a
# falsy `linux_arm64` entry.
build_wheels_linux_arm64:
  name: Build Linux ARM64
  needs: [create_releases, create_matrix]
  if: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64 }}
  strategy:
    fail-fast: false
    matrix:
      flash-attn-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.flash-attn-version }}
      python-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.python-version }}
      torch-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.torch-version }}
      cuda-version: ${{ fromjson(needs.create_matrix.outputs.matrix).linux_arm64.cuda-version }}
      # The exclude list is shared by every platform matrix.
      exclude: ${{ fromjson(needs.create_matrix.outputs.matrix).exclude }}
  uses: ./.github/workflows/_build_linux.yml
  with:
    flash-attn-version: ${{ matrix.flash-attn-version }}
    python-version: ${{ matrix.python-version }}
    torch-version: ${{ matrix.torch-version }}
    cuda-version: ${{ matrix.cuda-version }}
    # GitHub-hosted arm64 label is `ubuntu-22.04-arm`; an unknown label leaves
    # the job queued with no runner to pick it up.
    runner: "ubuntu-22.04-arm"
  secrets: inherit
build_wheels_linux_self_hosted:
name: Build Linux (self-hosted)
needs: [create_releases, create_matrix]
+14 -54
View File
@@ -8,61 +8,21 @@ jobs:
# Build wheels with self-hosted runner
# #########################################################
build_wheels_self_hosted:
name: Build wheels and Upload (Linux arm64)
runs-on: ubuntu-22.04-arm
env:
DEBIAN_FRONTEND: noninteractive
TERM: xterm-256color
name: Build wheels and Upload (Linux ARM64, self-hosted runner)
uses: ./.github/workflows/_build_linux.yml
strategy:
fail-fast: false
matrix:
flash-attn-version: ["2.8.0"]
python-version: ["3.11"]
torch-version: ["2.7.1"]
flash-attn-version: ["2.8.3"]
python-version: ["3.13"]
torch-version: ["2.9.1"]
# https://developer.nvidia.com/cuda-toolkit-archive
cuda-version: ["12.8.1"]
steps:
- uses: actions/checkout@v4
- name: Maximize build space
run: |
df -h
echo "-----------------------------"
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
df -h
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- uses: mjun0812/setup-cuda@v1
with:
version: "${{ matrix.cuda-version }}"
- name: Install build dependencies
shell: bash
run: |
sudo apt install -y ninja-build clang time
pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
- name: Build wheels
id: build_wheels
shell: bash
env:
MAX_JOBS: 2
NVCC_THREADS: 1
run: |
chmod +x build_linux.sh
./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
echo "wheel_name=$wheel_name" >> $GITHUB_ENV
- name: Install Test
shell: bash
run: |
pip uninstall -y flash-attn > /dev/null 2>&1
pip install --no-cache-dir flash-attention/dist/${{ steps.build_wheels.outputs.WHEEL_NAME }}
python -c "import flash_attn; print(flash_attn.__version__)"
cuda-version: ["13.0.1"]
runner: ["ubuntu-22.04-arm64"]
# Inputs forwarded to the reusable Linux build workflow. Release upload is
# turned off for this run.
with:
  flash-attn-version: ${{ matrix.flash-attn-version }}
  python-version: ${{ matrix.python-version }}
  torch-version: ${{ matrix.torch-version }}
  cuda-version: ${{ matrix.cuda-version }}
  is-upload: false
  # Must be the runner label from the matrix `runner` axis, not the CUDA
  # version: the called workflow feeds this value to `runs-on`.
  runner: ${{ matrix.runner }}
+13 -90
View File
@@ -9,97 +9,20 @@ jobs:
# #########################################################
build_wheels_self_hosted:
name: Build wheels and Upload (Linux x86_64, self-hosted runner)
runs-on: self-hosted
container:
image: ubuntu:22.04
defaults:
run:
shell: bash
env:
DEBIAN_FRONTEND: noninteractive
TERM: xterm-256color
timeout-minutes: 2000
uses: ./.github/workflows/_build_linux_self_host.yml
strategy:
fail-fast: false
matrix:
flash-attn-version: ["2.8.0"]
python-version: ["3.11", "3.11"]
torch-version: ["2.7.1"]
flash-attn-version: ["2.8.3"]
python-version: ["3.13", "3.13"]
torch-version: ["2.9.1"]
# https://developer.nvidia.com/cuda-toolkit-archive
cuda-version: ["12.8.1"]
steps:
- name: Install tools
shell: bash
run: |
apt-get update && apt-get install -y --no-install-recommends \
curl \
ca-certificates \
sudo \
software-properties-common \
wget \
unzip \
zip \
git \
build-essential \
gcc \
g++ \
clang \
ninja-build \
keyboard-configuration \
time
- name: Install gh
shell: bash
run: |
sudo mkdir -p -m 755 /etc/apt/keyrings
out=$(mktemp)
wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg
cat $out | sudo tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null
sudo chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null
sudo apt update
sudo apt install gh -y
- uses: actions/checkout@v4
- name: Configure Git safe directory
shell: bash
run: |
git config --global --add safe.directory $(pwd)
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- uses: mjun0812/setup-cuda@v1
with:
version: "${{ matrix.cuda-version }}"
- name: Install build dependencies
shell: bash
run: |
sudo apt install -y ninja-build clang
pip install -U pip setuptools==75.8.0 wheel setuptools packaging psutil
- name: Build wheels
timeout-minutes: 1200
id: build_wheels
shell: bash
run: |
chmod +x build_linux.sh
./build_linux.sh ${{ matrix.flash-attn-version }} ${{ matrix.python-version }} ${{ matrix.torch-version }} ${{ matrix.cuda-version }}
wheel_name=$(basename $(ls flash-attention/dist/*.whl | head -n 1))
echo "WHEEL_NAME=$wheel_name" >> $GITHUB_OUTPUT
- name: Install Test
shell: bash
run: |
pip uninstall -y flash-attn > /dev/null 2>&1
pip install --no-cache-dir flash-attention/dist/${{ steps.build_wheels.outputs.WHEEL_NAME }}
python -c "import flash_attn; print(flash_attn.__version__)"
- name: Clean up
shell: bash
if: always()
run: |
rm -rf /opt/hostedtoolcache/Python
cuda-version: ["13.0.1"]
runner: ["self-hosted", "openci-runner-beta"]
# Inputs forwarded to the reusable self-hosted build workflow. Release upload
# is turned off for this run.
with:
  flash-attn-version: ${{ matrix.flash-attn-version }}
  python-version: ${{ matrix.python-version }}
  torch-version: ${{ matrix.torch-version }}
  cuda-version: ${{ matrix.cuda-version }}
  is-upload: false
  # Must be the runner label from the matrix `runner` axis, not the CUDA
  # version: the called workflow feeds this value to `runs-on`.
  runner: ${{ matrix.runner }}
+52 -52
View File
@@ -51,6 +51,58 @@ See [./docs/packages.md](./docs/packages.md) for the full list of available pack
History of this repository is available [here](./docs/release_history.md).
## Citation
If you use this repository in your research and find it helpful, please cite it as follows:
```bibtex
@misc{flash-attention-prebuild-wheels,
author = {Morioka, Junya},
year = {2025},
title = {mjun0812/flash-attention-prebuild-wheels},
url = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
howpublished = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
}
```
## Star History and Download Statistics
<table>
<tr>
<td>
<a href="https://www.star-history.com/#mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&theme=dark&legend=top-left" />
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" height="300"/>
</picture>
</a>
</td>
<td>
<img alt="Download Statistics" src="./docs/data/download_graph.png" height="300"/>
</td>
</tr>
</table>
## Original Repository
[repo](https://github.com/Dao-AILab/flash-attention)
```bibtex
@inproceedings{dao2022flashattention,
title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022}
}
@inproceedings{dao2023flashattention2,
title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
author={Dao, Tri},
booktitle={International Conference on Learning Representations (ICLR)},
year={2024}
}
```
## Self build
If you cannot find the version you are looking for, you can fork this repository and create a wheel on GitHub Actions.
@@ -121,55 +173,3 @@ gh api \
-X POST \
/repos/[OWNER]/[REPOSITORY]/actions/runners/registration-token
```
## Citation
If you use this repository in your research and find it helpful, please cite the following paper!
```bibtex
@misc{flash-attention-prebuild-wheels,
author = {Morioka, Junya},
year = {2025},
title = {mjun0812/flash-attention-prebuild-wheels},
url = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
howpublished = {https://github.com/mjun0812/flash-attention-prebuild-wheels},
}
```
## Star History and Download Statistics
<table>
<tr>
<td>
<a href="https://www.star-history.com/#mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&theme=dark&legend=top-left" />
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=mjun0812/flash-attention-prebuild-wheels&type=date&legend=top-left" height="300"/>
</picture>
</a>
</td>
<td>
<img alt="Download Statistics" src="./docs/data/download_graph.png" height="300"/>
</td>
</tr>
</table>
## Original Repository
[repo](https://github.com/Dao-AILab/flash-attention)
```bibtex
@inproceedings{dao2022flashattention,
title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
year={2022}
}
@inproceedings{dao2023flashattention2,
title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
author={Dao, Tri},
booktitle={International Conference on Learning Representations (ICLR)},
year={2024}
}
```
+23 -8
View File
@@ -19,6 +19,12 @@ EXCLUDE = [
{"torch-version": "2.7.1", "cuda-version": "12.9.1"},
# torch >= 2.9 does not support Python 3.9
{"torch-version": "2.9.1", "python-version": "3.9"},
# torch < 2.9 does not support CUDA 13.0
{"torch-version": "2.5.1", "cuda-version": "13.0.1"},
{"torch-version": "2.6.0", "cuda-version": "13.0.1"},
{"torch-version": "2.7.1", "cuda-version": "13.0.1"},
{"torch-version": "2.8.1", "cuda-version": "13.0.1"},
{"torch-version": "2.8.0", "cuda-version": "13.0.1"},
]
LINUX_MATRIX = {
@@ -31,7 +37,7 @@ LINUX_MATRIX = {
"3.10",
"3.11",
"3.12",
# "3.13"
"3.13",
],
"torch-version": [
"2.5.1",
@@ -42,13 +48,15 @@ LINUX_MATRIX = {
],
"cuda-version": [
"12.4.1",
"12.6.3",
# "12.6.3",
"12.8.1",
# "12.9.1",
"13.0.2",
],
}
LINUX_ARM64_MATRIX = LINUX_MATRIX
LINUX_SELF_HOSTED_MATRIX = {
"flash-attn-version": ["2.7.4"],
"python-version": ["3.10", "3.11", "3.12", "3.13"],
@@ -96,14 +104,21 @@ def main():
print(
json.dumps(
{
"linux": LINUX_MATRIX,
# "linux": False,
# "linux_self_hosted": LINUX_SELF_HOSTED_MATRIX,
"linux": False,
# "linux": LINUX_MATRIX,
#
# "linux_arm64": False,
"linux_arm64": LINUX_ARM64_MATRIX,
#
"linux_self_hosted": False,
"windows": WINDOWS_MATRIX,
# "windows": False,
# "windows_code_build": WINDOWS_CODEBUILD_MATRIX,
# "linux_self_hosted": LINUX_SELF_HOSTED_MATRIX,
#
"windows": False,
# "windows": WINDOWS_MATRIX,
#
"windows_code_build": False,
# "windows_code_build": WINDOWS_CODEBUILD_MATRIX,
#
"exclude": EXCLUDE,
}
)