diff --git a/.github/workflows/test-self-hosted-linux.yml b/.github/workflows/test-self-hosted-linux.yml index d3a8d65..6ad3410 100644 --- a/.github/workflows/test-self-hosted-linux.yml +++ b/.github/workflows/test-self-hosted-linux.yml @@ -23,7 +23,7 @@ jobs: fail-fast: false matrix: flash-attn-version: ["2.8.0"] - python-version: ["3.11"] + python-version: ["3.11", "3.12"] torch-version: ["2.7.1"] # https://developer.nvidia.com/cuda-toolkit-archive cuda-version: ["12.8.1"] @@ -92,6 +92,7 @@ jobs: - name: Build wheels timeout-minutes: 1200 + id: build_wheels shell: bash env: MAX_JOBS: 4 @@ -105,7 +106,8 @@ jobs: - name: Install Test shell: bash run: | - pip install --no-cache-dir --force-reinstall flash-attention/dist/${{ env.wheel_name }} + pip uninstall -y flash-attn > /dev/null 2>&1 + pip install --no-cache-dir flash-attention/dist/${{ steps.build_wheels.outputs.WHEEL_NAME }} python -c "import flash_attn; print(flash_attn.__version__)" - name: Clean up @@ -113,4 +115,3 @@ jobs: if: always() run: | rm -rf /opt/hostedtoolcache/Python - rm -rf ~/.cache/pip diff --git a/README.md b/README.md index febb2ca..fa865e1 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ The built packages are available on the [release page](https://github.com/mjun08 **This repository uses a self-hosted runner and AWS CodeBuild for building the wheels. If you find this project helpful, please consider sponsoring to help maintain the infrastructure!** [![github-sponsor](https://img.shields.io/badge/sponsor-30363D?style=for-the-badge&logo=GitHub-Sponsors&logoColor=#white)](https://github.com/sponsors/mjun0812) - [![buy-me-a-coffee](https://img.shields.io/badge/Buy_Me_A_Coffee-FFDD00?style=for-the-badge&logo=buy-me-a-coffee&logoColor=black)](https://buymeacoffee.com/mjun0812) ## Install @@ -57,8 +56,8 @@ History of this repository is available [here](./docs/release_history.md). 
If you cannot find the version you are looking for, you can fork this repository and create a wheel on GitHub Actions. 1. Fork this repository -2. Edit workflow file [`.github/workflows/build.yml`](https://github.com/mjun0812/flash-attention-prebuild-wheels/blob/main/.github/workflows/build.yml) to set the version you want to build. -3. Add tag `v*.*.*` to trigger the build workflow. +2. Edit the Python script [`create_matrix.py`](https://github.com/mjun0812/flash-attention-prebuild-wheels/blob/main/create_matrix.py) to set the version you want to build. +3. Add a tag matching `v*.*.*` to trigger the build workflow, e.g. `git tag v1.0.0 && git push --tags`. Please note that depending on the combination of versions, it may not be possible to build. @@ -76,8 +75,13 @@ cp env.template env Edit `env` file to set the environment variables. ```bash -# Edit env +# GitHub Personal Access Token used to register the runner PERSONAL_ACCESS_TOKEN=[Github Personal Access Token] +# or a one-time registration token for the GitHub Actions runner +REGISTRY_TOKEN=[Runner Registry Token] + +# Optional +RUNNER_LABELS=Linux,self-hosted ``` Edit compose.yml file if you use repository folked from this repository. @@ -86,16 +90,19 @@ Edit compose.yml file if you use repository folked from this repository. services: runner: privileged: true + restart: always + env_file: + - .env + environment: + REPOSITORY_URL: https://github.com/[OWNER]/[REPOSITORY] + RUNNER_NAME: self-hosted-runner + RUNNER_GROUP: default + TARGET_ARCH: x64 build: context: . 
dockerfile: Dockerfile args: - REPOSITORY_URL: [Target Repository URL] - PERSONAL_ACCESS_TOKEN: $PERSONAL_ACCESS_TOKEN - GH_RUNNER_VERSION: 2.324.0 - RUNNER_NAME: self-hosted-runner - RUNNER_GROUP: default - RUNNER_LABELS: self-hosted + GH_RUNNER_VERSION: 2.329.0 TARGET_ARCH: x64 ``` @@ -107,21 +114,28 @@ docker compose build docker compose up -d ``` +### Getting a One-Time Registration Token for the GitHub Actions Runner + +```bash +gh api \ + -X POST \ + /repos/[OWNER]/[REPOSITORY]/actions/runners/registration-token +``` + +## Citation + +If you use this repository in your research and find it helpful, please cite it as follows: + +```bibtex +@misc{flash-attention-prebuild-wheels, + author = {Morioka, Junya}, + year = {2025}, + title = {mjun0812/flash-attention-prebuild-wheels}, + url = {https://github.com/mjun0812/flash-attention-prebuild-wheels}, + howpublished = {https://github.com/mjun0812/flash-attention-prebuild-wheels}, +} +``` + ## Original Repository [repo](https://github.com/Dao-AILab/flash-attention) - -```bibtex -@inproceedings{dao2022flashattention, - title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness}, - author={Dao, Tri and Fu, Daniel Y. 
and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher}, - booktitle={Advances in Neural Information Processing Systems (NeurIPS)}, - year={2022} -} -@inproceedings{dao2023flashattention2, - title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning}, - author={Dao, Tri}, - booktitle={International Conference on Learning Representations (ICLR)}, - year={2024} -} -``` diff --git a/self-hosted-runner/Dockerfile b/self-hosted-runner/Dockerfile index f231690..f46da23 100644 --- a/self-hosted-runner/Dockerfile +++ b/self-hosted-runner/Dockerfile @@ -1,11 +1,6 @@ FROM ubuntu:24.04 -ARG REPOSITORY_URL -ARG PERSONAL_ACCESS_TOKEN -ARG GH_RUNNER_VERSION="2.324.0" -ARG RUNNER_NAME="self-hosted-github-actions-runner" -ARG RUNNER_GROUP="default" -ARG RUNNER_LABELS="self-hosted,Linux" +ARG GH_RUNNER_VERSION="2.329.0" ARG TARGET_ARCH="x64" ENV DEBIAN_FRONTEND=noninteractive \ @@ -64,15 +59,4 @@ WORKDIR /home/ubuntu RUN curl -fsSL -o actions-runner.tar.gz -L "https://github.com/actions/runner/releases/download/v${GH_RUNNER_VERSION}/actions-runner-linux-${TARGET_ARCH}-${GH_RUNNER_VERSION}.tar.gz" \ && tar xf actions-runner.tar.gz \ && rm actions-runner.tar.gz \ - && sudo ./bin/installdependencies.sh \ - && ./config.sh \ - --unattended \ - --url $REPOSITORY_URL \ - --pat $PERSONAL_ACCESS_TOKEN \ - --name $RUNNER_NAME \ - --runnergroup $RUNNER_GROUP \ - --labels "${RUNNER_LABELS},${TARGET_ARCH}" \ - --work /home/ubuntu/actions-runner \ - --replace - -CMD ["./run.sh"] + && sudo ./bin/installdependencies.sh diff --git a/self-hosted-runner/compose.yml b/self-hosted-runner/compose.yml index c4e92ab..0e03fd4 100644 --- a/self-hosted-runner/compose.yml +++ b/self-hosted-runner/compose.yml @@ -2,14 +2,16 @@ services: runner: privileged: true restart: always + env_file: + - .env + environment: + REPOSITORY_URL: https://github.com/mjun0812/flash-attention-prebuild-wheels + RUNNER_NAME: self-hosted-runner + RUNNER_GROUP: default + TARGET_ARCH: x64 build: 
context: . dockerfile: Dockerfile args: - REPOSITORY_URL: https://github.com/mjun0812/flash-attention-prebuild-wheels - PERSONAL_ACCESS_TOKEN: $PERSONAL_ACCESS_TOKEN - GH_RUNNER_VERSION: 2.327.1 - RUNNER_NAME: self-hosted-runner - RUNNER_GROUP: default - RUNNER_LABELS: self-hosted + GH_RUNNER_VERSION: 2.329.0 TARGET_ARCH: x64 diff --git a/self-hosted-runner/entrypoint.sh b/self-hosted-runner/entrypoint.sh index d56b933..1a04ffe 100644 --- a/self-hosted-runner/entrypoint.sh +++ b/self-hosted-runner/entrypoint.sh @@ -1,7 +1,30 @@ #!/bin/bash -id # Start docker daemon sudo service docker start -exec "$@" \ No newline at end of file +if [ -n "$PERSONAL_ACCESS_TOKEN" ]; then + echo "Using personal access token"; + ./config.sh \ + --unattended \ + --url $REPOSITORY_URL \ + --pat "$PERSONAL_ACCESS_TOKEN" \ + --name $RUNNER_NAME \ + --runnergroup $RUNNER_GROUP \ + --labels "${RUNNER_LABELS},${TARGET_ARCH}" \ + --work /home/ubuntu/actions-runner \ + --replace; +else + echo "Using registry token"; + ./config.sh \ + --unattended \ + --url $REPOSITORY_URL \ + --token "$REGISTRY_TOKEN" \ + --name $RUNNER_NAME \ + --runnergroup $RUNNER_GROUP \ + --labels "${RUNNER_LABELS},${TARGET_ARCH}" \ + --work /home/ubuntu/actions-runner \ + --replace; +fi + +exec "./run.sh" diff --git a/self-hosted-runner/env.template b/self-hosted-runner/env.template index cf1aca7..923ae88 100644 --- a/self-hosted-runner/env.template +++ b/self-hosted-runner/env.template @@ -1 +1,3 @@ PERSONAL_ACCESS_TOKEN= +REGISTRY_TOKEN= +RUNNER_LABELS=Linux,self-hosted \ No newline at end of file