Migrate Python usage to uv workspace (#20414)

I was having trouble getting the benchmarks to generate data.

## Summary
- Replace three independent `requirements.txt` files with a uv workspace
(`benchmarks`, `dev`, `docs` projects)
- Single `uv.lock` lockfile for reproducible dependency resolution
- Simplify `bench.sh` by removing all ad-hoc venv/pip logic in favor of
`uv run`

## Test plan
- [ ] `uv sync` resolves all deps from repo root
- [ ] `uv run --project benchmarks python3 benchmarks/compare.py` works
- [ ] `uv run --project docs sphinx-build docs/source docs/build` builds
docs
- [ ] Run a benchmark from `bench.sh` that uses Python (e.g., h2o data
generation or the compare flow)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Adrian Garcia Badaracco
2026-02-20 16:29:56 +00:00
committed by GitHub
parent ace9cd44b7
commit 1ee782f783
18 changed files with 1199 additions and 224 deletions
+4 -11
View File
@@ -40,17 +40,11 @@ jobs:
ref: asf-site
path: asf-site
- name: Setup Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.12"
- name: Setup uv
uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb # v6.1.0
- name: Install dependencies
run: |
set -x
python3 -m venv venv
source venv/bin/activate
pip install -r docs/requirements.txt
run: uv sync --package datafusion-docs
- name: Install dependency graph tooling
run: |
set -x
@@ -61,9 +55,8 @@ jobs:
- name: Build docs
run: |
set -x
source venv/bin/activate
cd docs
./build.sh
uv run --package datafusion-docs ./build.sh
- name: Copy & push the generated HTML
run: |
+4 -11
View File
@@ -44,16 +44,10 @@ jobs:
with:
submodules: true
fetch-depth: 1
- name: Setup Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: "3.12"
- name: Setup uv
uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb # v6.1.0
- name: Install doc dependencies
run: |
set -x
python3 -m venv venv
source venv/bin/activate
pip install -r docs/requirements.txt
run: uv sync --package datafusion-docs
- name: Install dependency graph tooling
run: |
set -x
@@ -63,6 +57,5 @@ jobs:
- name: Build docs html and check for warnings
run: |
set -x
source venv/bin/activate
cd docs
./build.sh # fails on errors
uv run --package datafusion-docs ./build.sh # fails on errors
+4 -139
View File
@@ -42,7 +42,6 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}
usage() {
echo "
@@ -53,7 +52,6 @@ $0 data [benchmark]
$0 run [benchmark] [query]
$0 compare <branch1> <branch2>
$0 compare_detail <branch1> <branch2>
$0 venv
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Examples:
@@ -71,7 +69,6 @@ data: Generates or downloads data needed for benchmarking
run: Runs the named benchmark
compare: Compares fastest results from benchmark runs
compare_detail: Compares minimum, average (±stddev), and maximum results from benchmark runs
venv: Creates new venv (unless already exists) and installs compare's requirements into it
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Benchmarks
@@ -144,7 +141,6 @@ CARGO_COMMAND command that runs the benchmark binary
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
RESULTS_NAME folder where the benchmark files are stored
PREFER_HASH_JOIN Prefer hash join algorithm (default true)
VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
DATAFUSION_* Set the given datafusion configuration
"
exit 1
@@ -542,9 +538,6 @@ main() {
compare_detail)
compare_benchmarks "$ARG2" "$ARG3" "--detailed"
;;
venv)
setup_venv
;;
"")
usage
;;
@@ -708,7 +701,7 @@ run_compile_profile() {
local data_path="${DATA_DIR}/tpch_sf1"
echo "Running compile profile benchmark..."
local cmd=(python3 "${runner}" --data "${data_path}")
local cmd=(uv run python3 "${runner}" --data "${data_path}")
if [ ${#profiles[@]} -gt 0 ]; then
cmd+=(--profiles "${profiles[@]}")
fi
@@ -923,75 +916,13 @@ data_h2o() {
SIZE=${1:-"SMALL"}
DATA_FORMAT=${2:-"CSV"}
# Function to compare Python versions
version_ge() {
  # Succeeds (status 0) when version string $1 is greater than or equal to
  # version string $2, using version-aware ordering from `sort -V`.
  # The smaller of the two versions sorts first; $1 >= $2 exactly when that
  # smallest entry is $2.
  local lowest
  lowest=$(printf '%s\n' "$1" "$2" | sort -V | head -n1)
  [ "$lowest" = "$2" ]
}
export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
# Find the highest available Python version (3.10 or higher)
REQUIRED_VERSION="3.10"
PYTHON_CMD=$(command -v python3 || true)
if [ -n "$PYTHON_CMD" ]; then
PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
echo "Found Python version $PYTHON_VERSION, which is suitable."
else
echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
PYTHON_CMD=""
fi
fi
# Search for suitable Python versions if the default is unsuitable
if [ -z "$PYTHON_CMD" ]; then
# Loop through all available Python3 commands on the system
for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
if command -v "$CMD" &> /dev/null; then
PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
PYTHON_CMD="$CMD"
echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
break
fi
fi
done
fi
# If no suitable Python version found, exit with an error
if [ -z "$PYTHON_CMD" ]; then
echo "Python 3.10 or higher is required. Please install it."
return 1
fi
echo "Using Python command: $PYTHON_CMD"
# Install falsa and other dependencies
echo "Installing falsa..."
# Set virtual environment directory
VIRTUAL_ENV="${PWD}/venv"
# Create a virtual environment using the detected Python command
$PYTHON_CMD -m venv "$VIRTUAL_ENV"
# Activate the virtual environment and install dependencies
source "$VIRTUAL_ENV/bin/activate"
# Ensure 'falsa' is installed (avoid unnecessary reinstall)
pip install --quiet --upgrade falsa
# Create directory if it doesn't exist
H2O_DIR="${DATA_DIR}/h2o"
mkdir -p "${H2O_DIR}"
# Generate h2o test data
echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
# Deactivate virtual environment after completion
deactivate
uv run falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
}
data_h2o_join() {
@@ -999,75 +930,13 @@ data_h2o_join() {
SIZE=${1:-"SMALL"}
DATA_FORMAT=${2:-"CSV"}
# Function to compare Python versions
version_ge() {
  # Returns success when version $1 >= version $2.
  # Both versions are sorted with `sort -V` (version ordering); if the first
  # line of the sorted output equals $2, then $2 is the minimum and $1 is at
  # least as new.
  local smallest
  smallest=$(printf '%s\n' "$1" "$2" | sort -V | head -n1)
  [ "$smallest" = "$2" ]
}
export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
# Find the highest available Python version (3.10 or higher)
REQUIRED_VERSION="3.10"
PYTHON_CMD=$(command -v python3 || true)
if [ -n "$PYTHON_CMD" ]; then
PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
echo "Found Python version $PYTHON_VERSION, which is suitable."
else
echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
PYTHON_CMD=""
fi
fi
# Search for suitable Python versions if the default is unsuitable
if [ -z "$PYTHON_CMD" ]; then
# Loop through all available Python3 commands on the system
for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
if command -v "$CMD" &> /dev/null; then
PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
PYTHON_CMD="$CMD"
echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
break
fi
fi
done
fi
# If no suitable Python version found, exit with an error
if [ -z "$PYTHON_CMD" ]; then
echo "Python 3.10 or higher is required. Please install it."
return 1
fi
echo "Using Python command: $PYTHON_CMD"
# Install falsa and other dependencies
echo "Installing falsa..."
# Set virtual environment directory
VIRTUAL_ENV="${PWD}/venv"
# Create a virtual environment using the detected Python command
$PYTHON_CMD -m venv "$VIRTUAL_ENV"
# Activate the virtual environment and install dependencies
source "$VIRTUAL_ENV/bin/activate"
# Ensure 'falsa' is installed (avoid unnecessary reinstall)
pip install --quiet --upgrade falsa
# Create directory if it doesn't exist
H2O_DIR="${DATA_DIR}/h2o"
mkdir -p "${H2O_DIR}"
# Generate h2o test data
echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
# Deactivate virtual environment after completion
deactivate
uv run falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
}
# Runner for h2o groupby benchmark
@@ -1269,7 +1138,7 @@ compare_benchmarks() {
echo "--------------------"
echo "Benchmark ${BENCH}"
echo "--------------------"
PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
uv run python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
else
echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
fi
@@ -1384,10 +1253,6 @@ run_clickbench_sorted() {
${QUERY_ARG}
}
#######################################
# Creates a Python virtual environment at $VIRTUAL_ENV and installs the
# packages listed in requirements.txt into it.
# Globals:   VIRTUAL_ENV (read) - directory of the virtual environment
# Arguments: none
# Returns:   status of the failing step, or 0 when both steps succeed
# Note:      requirements.txt is resolved relative to the current working
#            directory, as in the original implementation.
#######################################
setup_venv() {
  # Stop if the venv cannot be created: otherwise "$VIRTUAL_ENV/bin" would not
  # exist, the PATH lookup below would fall through to the system python3, and
  # pip would install into the wrong environment.
  python3 -m venv "$VIRTUAL_ENV" || return $?
  # Prepend the venv's bin directory so that "python3" resolves to the venv
  # interpreter for this single command only.
  PATH="$VIRTUAL_ENV/bin:$PATH" python3 -m pip install -r requirements.txt
}
# And start the process up
main
+6
View File
@@ -0,0 +1,6 @@
[project]
name = "datafusion-benchmarks"
version = "0.1.0"
requires-python = ">=3.11"
# typing_extensions is an undeclared dependency of falsa
dependencies = ["rich", "falsa", "typing_extensions"]
-18
View File
@@ -1,18 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
rich
+5
View File
@@ -0,0 +1,5 @@
[project]
name = "datafusion-dev"
version = "0.1.0"
requires-python = ">=3.11"
dependencies = ["tomlkit", "PyGithub", "requests"]
+3 -3
View File
@@ -178,10 +178,10 @@ We maintain a [changelog] so our users know what has been changed between releas
The changelog is generated using a Python script.
To run the script, you will need a GitHub Personal Access Token (described in the prerequisites section) and the `PyGitHub` library. First install the `PyGitHub` dependency via `pip`:
To run the script, you will need a GitHub Personal Access Token (described in the prerequisites section) and the `PyGitHub` library. First install the dev dependencies via `uv`:
```shell
pip3 install PyGitHub
uv sync
```
To generate the changelog, set the `GITHUB_TOKEN` environment variable and then run `./dev/release/generate-changelog.py`
@@ -199,7 +199,7 @@ to generate a change log of all changes between the `50.3.0` tag and `branch-51`
```shell
export GITHUB_TOKEN=<your-token-here>
./dev/release/generate-changelog.py 50.3.0 branch-51 51.0.0 > dev/changelog/51.0.0.md
uv run ./dev/release/generate-changelog.py 50.3.0 branch-51 51.0.0 > dev/changelog/51.0.0.md
```
This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for
-2
View File
@@ -1,2 +0,0 @@
tomlkit
PyGitHub
+1 -1
View File
@@ -19,7 +19,7 @@
# Script that updates the arrow dependencies in datafusion locally
#
# installation:
# pip install tomlkit requests
# uv sync
#
# pin all arrow crates deps to a specific version:
#
+1 -1
View File
@@ -19,7 +19,7 @@
# Script that updates versions for datafusion crates, locally
#
# dependencies:
# pip install tomlkit
# uv sync
import re
import argparse
+4 -11
View File
@@ -25,19 +25,12 @@ https://datafusion.apache.org/ as part of the release process.
## Dependencies
It's recommended to install build dependencies and build the documentation
inside a Python virtualenv.
Install build dependencies and build the documentation using
[uv](https://docs.astral.sh/uv/):
```sh
python3 -m venv venv
pip install -r requirements.txt
```
If using [uv](https://docs.astral.sh/uv/) the script can be run like so without
needing to create a virtual environment:
```sh
uv run --with-requirements requirements.txt bash build.sh
uv sync
uv run bash build.sh
```
The docs build regenerates the workspace dependency graph via
+13
View File
@@ -0,0 +1,13 @@
[project]
name = "datafusion-docs"
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"sphinx>=9,<10",
"sphinx-reredirects>=1.1,<2",
"pydata-sphinx-theme>=0.16,<1",
"myst-parser>=5,<6",
"maturin>=1.11,<2",
"jinja2>=3.1,<4",
"setuptools>=82,<83",
]
-24
View File
@@ -1,24 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
sphinx==9.1.0
sphinx-reredirects==1.1.0
pydata-sphinx-theme==0.16.1
myst-parser==5.0.0
maturin==1.12.2
jinja2==3.1.6
setuptools==82.0.0
+1 -1
View File
@@ -26,7 +26,7 @@ For example:
```toml
[dependencies]
datafusion = "52.0.0"
datafusion = "52.1.0"
```
While DataFusion is distributed via [crates.io] as a convenience, the
@@ -156,7 +156,7 @@ By default, Datafusion returns errors as a plain text message. You can enable mo
such as backtraces by enabling the `backtrace` feature to your `Cargo.toml` file like this:
```toml
datafusion = { version = "52.0.0", features = ["backtrace"]}
datafusion = { version = "52.1.0", features = ["backtrace"]}
```
Set environment [variables](https://doc.rust-lang.org/std/backtrace/index.html#environment-variables)
+1 -1
View File
@@ -29,7 +29,7 @@ Find latest available Datafusion version on [DataFusion's
crates.io] page. Add the dependency to your `Cargo.toml` file:
```toml
datafusion = "52.0.0"
datafusion = "52.1.0"
tokio = { version = "1.0", features = ["rt-multi-thread"] }
```
+2
View File
@@ -0,0 +1,2 @@
[tool.uv.workspace]
members = ["benchmarks", "dev", "docs"]
Generated
+1149
View File
File diff suppressed because it is too large Load Diff