Migrate Python usage to uv workspace (#20414)

I was having trouble getting benchmarks to gen data.

## Summary

- Replace three independent `requirements.txt` files with a uv workspace (`benchmarks`, `dev`, `docs` projects)
- Single `uv.lock` lockfile for reproducible dependency resolution
- Simplify `bench.sh` by removing all ad-hoc venv/pip logic in favor of `uv run`

## Test plan

- [ ] `uv sync` resolves all deps from repo root
- [ ] `uv run --project benchmarks python3 benchmarks/compare.py` works
- [ ] `uv run --project docs sphinx-build docs/source docs/build` builds docs
- [ ] Run a benchmark from `bench.sh` that uses Python (e.g., h2o data gen or compare flow)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-30 21:27:59 -04:00 · 2026-02-20 16:29:56 +00:00
parent ace9cd44b7
commit 1ee782f783
18 changed files with 1199 additions and 224 deletions
@@ -40,17 +40,11 @@ jobs:
          ref: asf-site
          path: asf-site

-      - name: Setup Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: "3.12"
+      - name: Setup uv
+        uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb  # v6.1.0

      - name: Install dependencies
-        run: |
-          set -x
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r docs/requirements.txt
+        run: uv sync --package datafusion-docs
      - name: Install dependency graph tooling
        run: |
          set -x
@@ -61,9 +55,8 @@ jobs:
      - name: Build docs
        run: |
          set -x
-          source venv/bin/activate
          cd docs
-          ./build.sh
+          uv run --package datafusion-docs ./build.sh

      - name: Copy & push the generated HTML
        run: |
@@ -44,16 +44,10 @@ jobs:
        with:
          submodules: true
          fetch-depth: 1
-      - name: Setup Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
-        with:
-          python-version: "3.12"
+      - name: Setup uv
+        uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb  # v6.1.0
      - name: Install doc dependencies
-        run: |
-          set -x
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r docs/requirements.txt
+        run: uv sync --package datafusion-docs
      - name: Install dependency graph tooling
        run: |
          set -x
@@ -63,6 +57,5 @@ jobs:
      - name: Build docs html and check for warnings
        run: |
          set -x
-          source venv/bin/activate
          cd docs
-          ./build.sh # fails on errors
+          uv run --package datafusion-docs ./build.sh # fails on errors
@@ -42,7 +42,6 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
 DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
 CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
 PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
-VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}

 usage() {
    echo "
@@ -53,7 +52,6 @@ $0 data [benchmark]
 $0 run [benchmark] [query]
 $0 compare <branch1> <branch2>
 $0 compare_detail <branch1> <branch2>
-$0 venv

 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 Examples:
@@ -71,7 +69,6 @@ data:            Generates or downloads data needed for benchmarking
 run:             Runs the named benchmark
 compare:         Compares fastest results from benchmark runs
 compare_detail:  Compares minimum, average (±stddev), and maximum results from benchmark runs
-venv:            Creates new venv (unless already exists) and installs compare's requirements into it

 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 Benchmarks
@@ -144,7 +141,6 @@ CARGO_COMMAND       command that runs the benchmark binary
 DATAFUSION_DIR      directory to use (default $DATAFUSION_DIR)
 RESULTS_NAME        folder where the benchmark files are stored
 PREFER_HASH_JOIN    Prefer hash join algorithm (default true)
-VENV_PATH           Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
 DATAFUSION_*        Set the given datafusion configuration
 "
    exit 1
@@ -542,9 +538,6 @@ main() {
        compare_detail)
            compare_benchmarks "$ARG2" "$ARG3" "--detailed"
            ;;
-        venv)
-            setup_venv
-            ;;
        "")
            usage
            ;;
@@ -708,7 +701,7 @@ run_compile_profile() {
    local data_path="${DATA_DIR}/tpch_sf1"

    echo "Running compile profile benchmark..."
-    local cmd=(python3 "${runner}" --data "${data_path}")
+    local cmd=(uv run python3 "${runner}" --data "${data_path}")
    if [ ${#profiles[@]} -gt 0 ]; then
        cmd+=(--profiles "${profiles[@]}")
    fi
@@ -923,75 +916,13 @@ data_h2o() {
    SIZE=${1:-"SMALL"}
    DATA_FORMAT=${2:-"CSV"}

-    # Function to compare Python versions
-    version_ge() {
-        [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
-    }
-
-    export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
-
-    # Find the highest available Python version (3.10 or higher)
-    REQUIRED_VERSION="3.10"
-    PYTHON_CMD=$(command -v python3 || true)
-
-    if [ -n "$PYTHON_CMD" ]; then
-        PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-        if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-            echo "Found Python version $PYTHON_VERSION, which is suitable."
-        else
-            echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
-            PYTHON_CMD=""
-        fi
-    fi
-
-   # Search for suitable Python versions if the default is unsuitable
-   if [ -z "$PYTHON_CMD" ]; then
-       # Loop through all available Python3 commands on the system
-       for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
-           if command -v "$CMD" &> /dev/null; then
-               PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-               if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-                   PYTHON_CMD="$CMD"
-                   echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
-                   break
-               fi
-           fi
-       done
-   fi
-
-    # If no suitable Python version found, exit with an error
-    if [ -z "$PYTHON_CMD" ]; then
-        echo "Python 3.10 or higher is required. Please install it."
-        return 1
-    fi
-
-    echo "Using Python command: $PYTHON_CMD"
-
-    # Install falsa and other dependencies
-    echo "Installing falsa..."
-
-    # Set virtual environment directory
-    VIRTUAL_ENV="${PWD}/venv"
-
-    # Create a virtual environment using the detected Python command
-    $PYTHON_CMD -m venv "$VIRTUAL_ENV"
-
-    # Activate the virtual environment and install dependencies
-    source "$VIRTUAL_ENV/bin/activate"
-
-    # Ensure 'falsa' is installed (avoid unnecessary reinstall)
-    pip install --quiet --upgrade falsa
-
    # Create directory if it doesn't exist
    H2O_DIR="${DATA_DIR}/h2o"
    mkdir -p "${H2O_DIR}"

    # Generate h2o test data
    echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
-    falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
-
-    # Deactivate virtual environment after completion
-    deactivate
+    uv run falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
 }

 data_h2o_join() {
@@ -999,75 +930,13 @@ data_h2o_join() {
    SIZE=${1:-"SMALL"}
    DATA_FORMAT=${2:-"CSV"}

-    # Function to compare Python versions
-    version_ge() {
-        [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
-    }
-
-    export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
-
-    # Find the highest available Python version (3.10 or higher)
-    REQUIRED_VERSION="3.10"
-    PYTHON_CMD=$(command -v python3 || true)
-
-    if [ -n "$PYTHON_CMD" ]; then
-        PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-        if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-            echo "Found Python version $PYTHON_VERSION, which is suitable."
-        else
-            echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
-            PYTHON_CMD=""
-        fi
-    fi
-
-   # Search for suitable Python versions if the default is unsuitable
-   if [ -z "$PYTHON_CMD" ]; then
-       # Loop through all available Python3 commands on the system
-       for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
-           if command -v "$CMD" &> /dev/null; then
-               PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-               if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
-                   PYTHON_CMD="$CMD"
-                   echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
-                   break
-               fi
-           fi
-       done
-   fi
-
-    # If no suitable Python version found, exit with an error
-    if [ -z "$PYTHON_CMD" ]; then
-        echo "Python 3.10 or higher is required. Please install it."
-        return 1
-    fi
-
-    echo "Using Python command: $PYTHON_CMD"
-
-    # Install falsa and other dependencies
-    echo "Installing falsa..."
-
-    # Set virtual environment directory
-    VIRTUAL_ENV="${PWD}/venv"
-
-    # Create a virtual environment using the detected Python command
-    $PYTHON_CMD -m venv "$VIRTUAL_ENV"
-
-    # Activate the virtual environment and install dependencies
-    source "$VIRTUAL_ENV/bin/activate"
-
-    # Ensure 'falsa' is installed (avoid unnecessary reinstall)
-    pip install --quiet --upgrade falsa
-
    # Create directory if it doesn't exist
    H2O_DIR="${DATA_DIR}/h2o"
    mkdir -p "${H2O_DIR}"

    # Generate h2o test data
    echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
-    falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
-
-    # Deactivate virtual environment after completion
-    deactivate
+    uv run falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
 }

 # Runner for h2o groupby benchmark
@@ -1269,7 +1138,7 @@ compare_benchmarks() {
            echo "--------------------"
            echo "Benchmark ${BENCH}"
            echo "--------------------"
-            PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
+            uv run python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
        else
            echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
        fi
@@ -1384,10 +1253,6 @@ run_clickbench_sorted() {
        ${QUERY_ARG}
 }

-setup_venv() {
-    python3 -m venv "$VIRTUAL_ENV"
-    PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
-}

 # And start the process up
 main
@@ -0,0 +1,6 @@
+[project]
+name = "datafusion-benchmarks"
+version = "0.1.0"
+requires-python = ">=3.11"
+# typing_extensions is an undeclared dependency of falsa
+dependencies = ["rich", "falsa", "typing_extensions"]
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-rich
@@ -0,0 +1,5 @@
+[project]
+name = "datafusion-dev"
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = ["tomlkit", "PyGithub", "requests"]
@@ -178,10 +178,10 @@ We maintain a [changelog] so our users know what has been changed between releas

 The changelog is generated using a Python script.

-To run the script, you will need a GitHub Personal Access Token (described in the prerequisites section) and the `PyGitHub` library. First install the `PyGitHub` dependency via `pip`:
+To run the script, you will need a GitHub Personal Access Token (described in the prerequisites section) and the `PyGithub` library. First install the dev dependencies (which include `PyGithub`) via `uv`:

 ```shell
-pip3 install PyGitHub
+uv sync
 ```

 To generate the changelog, set the `GITHUB_TOKEN` environment variable and then run `./dev/release/generate-changelog.py`
@@ -199,7 +199,7 @@ to generate a change log of all changes between the `50.3.0` tag and `branch-51`

 ```shell
 export GITHUB_TOKEN=<your-token-here>
-./dev/release/generate-changelog.py 50.3.0 branch-51 51.0.0 > dev/changelog/51.0.0.md
+uv run ./dev/release/generate-changelog.py 50.3.0 branch-51 51.0.0 > dev/changelog/51.0.0.md
 ```

 This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for
@@ -1,2 +0,0 @@
-tomlkit
-PyGitHub
@@ -19,7 +19,7 @@
 # Script that updates the arrow dependencies in datafusion locally
 #
 # installation:
-# pip install tomlkit requests
+# uv sync
 #
 # pin all arrow crates deps to a specific version:
 #
@@ -19,7 +19,7 @@
 # Script that updates versions for datafusion crates, locally
 #
 # dependencies:
-# pip install tomlkit
+# uv sync

 import re
 import argparse
@@ -25,19 +25,12 @@ https://datafusion.apache.org/ as part of the release process.

 ## Dependencies

-It's recommended to install build dependencies and build the documentation
-inside a Python virtualenv.
+Install build dependencies and build the documentation using
+[uv](https://docs.astral.sh/uv/):

 ```sh
-python3 -m venv venv
-pip install -r requirements.txt
-```
-
-If using [uv](https://docs.astral.sh/uv/) the script can be run like so without
-needing to create a virtual environment:
-
-```sh
-uv run --with-requirements requirements.txt bash build.sh
+uv sync
+uv run bash build.sh
 ```

 The docs build regenerates the workspace dependency graph via
@@ -0,0 +1,13 @@
+[project]
+name = "datafusion-docs"
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+  "sphinx>=9,<10",
+  "sphinx-reredirects>=1.1,<2",
+  "pydata-sphinx-theme>=0.16,<1",
+  "myst-parser>=5,<6",
+  "maturin>=1.11,<2",
+  "jinja2>=3.1,<4",
+  "setuptools>=82,<83",
+]
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-sphinx==9.1.0
-sphinx-reredirects==1.1.0
-pydata-sphinx-theme==0.16.1
-myst-parser==5.0.0
-maturin==1.12.2
-jinja2==3.1.6
-setuptools==82.0.0
@@ -26,7 +26,7 @@ For example:

 ```toml
 [dependencies]
-datafusion = "52.0.0"
+datafusion = "52.1.0"
 ```

 While DataFusion is distributed via [crates.io] as a convenience, the
@@ -156,7 +156,7 @@ By default, Datafusion returns errors as a plain text message. You can enable mo
 such as backtraces by enabling the `backtrace` feature to your `Cargo.toml` file like this:

 ```toml
-datafusion = { version = "52.0.0", features = ["backtrace"]}
+datafusion = { version = "52.1.0", features = ["backtrace"]}
 ```

 Set environment [variables](https://doc.rust-lang.org/std/backtrace/index.html#environment-variables)
@@ -29,7 +29,7 @@ Find latest available Datafusion version on [DataFusion's
 crates.io] page. Add the dependency to your `Cargo.toml` file:

 ```toml
-datafusion = "52.0.0"
+datafusion = "52.1.0"
 tokio = { version = "1.0", features = ["rt-multi-thread"] }
 ```

@@ -0,0 +1,2 @@
+[tool.uv.workspace]
+members = ["benchmarks", "dev", "docs"]