mirror of
https://github.com/langchain-ai/datafusion.git
synced 2026-06-30 21:27:59 -04:00
Migrate Python usage to uv workspace (#20414)
I was having trouble getting benchmarks to generate data.

## Summary

- Replace three independent `requirements.txt` files with a uv workspace (`benchmarks`, `dev`, `docs` projects)
- Single `uv.lock` lockfile for reproducible dependency resolution
- Simplify `bench.sh` by removing all ad-hoc venv/pip logic in favor of `uv run`

## Test plan

- [ ] `uv sync` resolves all deps from repo root
- [ ] `uv run --project benchmarks python3 benchmarks/compare.py` works
- [ ] `uv run --project docs sphinx-build docs/source docs/build` builds docs
- [ ] Run a benchmark from `bench.sh` that uses Python (e.g., h2o data gen or compare flow)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
ace9cd44b7
commit
1ee782f783
@@ -40,17 +40,11 @@ jobs:
|
||||
ref: asf-site
|
||||
path: asf-site
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
- name: Setup uv
|
||||
uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb # v6.1.0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
set -x
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r docs/requirements.txt
|
||||
run: uv sync --package datafusion-docs
|
||||
- name: Install dependency graph tooling
|
||||
run: |
|
||||
set -x
|
||||
@@ -61,9 +55,8 @@ jobs:
|
||||
- name: Build docs
|
||||
run: |
|
||||
set -x
|
||||
source venv/bin/activate
|
||||
cd docs
|
||||
./build.sh
|
||||
uv run --package datafusion-docs ./build.sh
|
||||
|
||||
- name: Copy & push the generated HTML
|
||||
run: |
|
||||
|
||||
@@ -44,16 +44,10 @@ jobs:
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
- name: Setup uv
|
||||
uses: astral-sh/setup-uv@f0ec1fc3b38f5e7cd731bb6ce540c5af426746bb # v6.1.0
|
||||
- name: Install doc dependencies
|
||||
run: |
|
||||
set -x
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r docs/requirements.txt
|
||||
run: uv sync --package datafusion-docs
|
||||
- name: Install dependency graph tooling
|
||||
run: |
|
||||
set -x
|
||||
@@ -63,6 +57,5 @@ jobs:
|
||||
- name: Build docs html and check for warnings
|
||||
run: |
|
||||
set -x
|
||||
source venv/bin/activate
|
||||
cd docs
|
||||
./build.sh # fails on errors
|
||||
uv run --package datafusion-docs ./build.sh # fails on errors
|
||||
|
||||
+4
-139
@@ -42,7 +42,6 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
|
||||
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
|
||||
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
|
||||
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
|
||||
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}
|
||||
|
||||
usage() {
|
||||
echo "
|
||||
@@ -53,7 +52,6 @@ $0 data [benchmark]
|
||||
$0 run [benchmark] [query]
|
||||
$0 compare <branch1> <branch2>
|
||||
$0 compare_detail <branch1> <branch2>
|
||||
$0 venv
|
||||
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||
Examples:
|
||||
@@ -71,7 +69,6 @@ data: Generates or downloads data needed for benchmarking
|
||||
run: Runs the named benchmark
|
||||
compare: Compares fastest results from benchmark runs
|
||||
compare_detail: Compares minimum, average (±stddev), and maximum results from benchmark runs
|
||||
venv: Creates new venv (unless already exists) and installs compare's requirements into it
|
||||
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||
Benchmarks
|
||||
@@ -144,7 +141,6 @@ CARGO_COMMAND command that runs the benchmark binary
|
||||
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
|
||||
RESULTS_NAME folder where the benchmark files are stored
|
||||
PREFER_HASH_JOIN Prefer hash join algorithm (default true)
|
||||
VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
|
||||
DATAFUSION_* Set the given datafusion configuration
|
||||
"
|
||||
exit 1
|
||||
@@ -542,9 +538,6 @@ main() {
|
||||
compare_detail)
|
||||
compare_benchmarks "$ARG2" "$ARG3" "--detailed"
|
||||
;;
|
||||
venv)
|
||||
setup_venv
|
||||
;;
|
||||
"")
|
||||
usage
|
||||
;;
|
||||
@@ -708,7 +701,7 @@ run_compile_profile() {
|
||||
local data_path="${DATA_DIR}/tpch_sf1"
|
||||
|
||||
echo "Running compile profile benchmark..."
|
||||
local cmd=(python3 "${runner}" --data "${data_path}")
|
||||
local cmd=(uv run python3 "${runner}" --data "${data_path}")
|
||||
if [ ${#profiles[@]} -gt 0 ]; then
|
||||
cmd+=(--profiles "${profiles[@]}")
|
||||
fi
|
||||
@@ -923,75 +916,13 @@ data_h2o() {
|
||||
SIZE=${1:-"SMALL"}
|
||||
DATA_FORMAT=${2:-"CSV"}
|
||||
|
||||
# Function to compare Python versions
|
||||
version_ge() {
|
||||
[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
|
||||
}
|
||||
|
||||
export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
|
||||
|
||||
# Find the highest available Python version (3.10 or higher)
|
||||
REQUIRED_VERSION="3.10"
|
||||
PYTHON_CMD=$(command -v python3 || true)
|
||||
|
||||
if [ -n "$PYTHON_CMD" ]; then
|
||||
PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
|
||||
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
|
||||
echo "Found Python version $PYTHON_VERSION, which is suitable."
|
||||
else
|
||||
echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
|
||||
PYTHON_CMD=""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Search for suitable Python versions if the default is unsuitable
|
||||
if [ -z "$PYTHON_CMD" ]; then
|
||||
# Loop through all available Python3 commands on the system
|
||||
for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
|
||||
if command -v "$CMD" &> /dev/null; then
|
||||
PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
|
||||
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
|
||||
PYTHON_CMD="$CMD"
|
||||
echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
|
||||
break
|
||||
fi
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# If no suitable Python version found, exit with an error
|
||||
if [ -z "$PYTHON_CMD" ]; then
|
||||
echo "Python 3.10 or higher is required. Please install it."
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "Using Python command: $PYTHON_CMD"
|
||||
|
||||
# Install falsa and other dependencies
|
||||
echo "Installing falsa..."
|
||||
|
||||
# Set virtual environment directory
|
||||
VIRTUAL_ENV="${PWD}/venv"
|
||||
|
||||
# Create a virtual environment using the detected Python command
|
||||
$PYTHON_CMD -m venv "$VIRTUAL_ENV"
|
||||
|
||||
# Activate the virtual environment and install dependencies
|
||||
source "$VIRTUAL_ENV/bin/activate"
|
||||
|
||||
# Ensure 'falsa' is installed (avoid unnecessary reinstall)
|
||||
pip install --quiet --upgrade falsa
|
||||
|
||||
# Create directory if it doesn't exist
|
||||
H2O_DIR="${DATA_DIR}/h2o"
|
||||
mkdir -p "${H2O_DIR}"
|
||||
|
||||
# Generate h2o test data
|
||||
echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
|
||||
falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
|
||||
|
||||
# Deactivate virtual environment after completion
|
||||
deactivate
|
||||
uv run falsa groupby --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
|
||||
}
|
||||
|
||||
data_h2o_join() {
|
||||
@@ -999,75 +930,13 @@ data_h2o_join() {
|
||||
SIZE=${1:-"SMALL"}
|
||||
DATA_FORMAT=${2:-"CSV"}
|
||||
|
||||
# Function to compare Python versions
|
||||
version_ge() {
|
||||
[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" = "$2" ]
|
||||
}
|
||||
|
||||
export PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
|
||||
|
||||
# Find the highest available Python version (3.10 or higher)
|
||||
REQUIRED_VERSION="3.10"
|
||||
PYTHON_CMD=$(command -v python3 || true)
|
||||
|
||||
if [ -n "$PYTHON_CMD" ]; then
|
||||
PYTHON_VERSION=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
|
||||
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
|
||||
echo "Found Python version $PYTHON_VERSION, which is suitable."
|
||||
else
|
||||
echo "Python version $PYTHON_VERSION found, but version $REQUIRED_VERSION or higher is required."
|
||||
PYTHON_CMD=""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Search for suitable Python versions if the default is unsuitable
|
||||
if [ -z "$PYTHON_CMD" ]; then
|
||||
# Loop through all available Python3 commands on the system
|
||||
for CMD in $(compgen -c | grep -E '^python3(\.[0-9]+)?$'); do
|
||||
if command -v "$CMD" &> /dev/null; then
|
||||
PYTHON_VERSION=$($CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
|
||||
if version_ge "$PYTHON_VERSION" "$REQUIRED_VERSION"; then
|
||||
PYTHON_CMD="$CMD"
|
||||
echo "Found suitable Python version: $PYTHON_VERSION ($CMD)"
|
||||
break
|
||||
fi
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# If no suitable Python version found, exit with an error
|
||||
if [ -z "$PYTHON_CMD" ]; then
|
||||
echo "Python 3.10 or higher is required. Please install it."
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "Using Python command: $PYTHON_CMD"
|
||||
|
||||
# Install falsa and other dependencies
|
||||
echo "Installing falsa..."
|
||||
|
||||
# Set virtual environment directory
|
||||
VIRTUAL_ENV="${PWD}/venv"
|
||||
|
||||
# Create a virtual environment using the detected Python command
|
||||
$PYTHON_CMD -m venv "$VIRTUAL_ENV"
|
||||
|
||||
# Activate the virtual environment and install dependencies
|
||||
source "$VIRTUAL_ENV/bin/activate"
|
||||
|
||||
# Ensure 'falsa' is installed (avoid unnecessary reinstall)
|
||||
pip install --quiet --upgrade falsa
|
||||
|
||||
# Create directory if it doesn't exist
|
||||
H2O_DIR="${DATA_DIR}/h2o"
|
||||
mkdir -p "${H2O_DIR}"
|
||||
|
||||
# Generate h2o test data
|
||||
echo "Generating h2o test data in ${H2O_DIR} with size=${SIZE} and format=${DATA_FORMAT}"
|
||||
falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
|
||||
|
||||
# Deactivate virtual environment after completion
|
||||
deactivate
|
||||
uv run falsa join --path-prefix="${H2O_DIR}" --size "${SIZE}" --data-format "${DATA_FORMAT}"
|
||||
}
|
||||
|
||||
# Runner for h2o groupby benchmark
|
||||
@@ -1269,7 +1138,7 @@ compare_benchmarks() {
|
||||
echo "--------------------"
|
||||
echo "Benchmark ${BENCH}"
|
||||
echo "--------------------"
|
||||
PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
|
||||
uv run python3 "${SCRIPT_DIR}"/compare.py $OPTS "${RESULTS_FILE1}" "${RESULTS_FILE2}"
|
||||
else
|
||||
echo "Note: Skipping ${RESULTS_FILE1} as ${RESULTS_FILE2} does not exist"
|
||||
fi
|
||||
@@ -1384,10 +1253,6 @@ run_clickbench_sorted() {
|
||||
${QUERY_ARG}
|
||||
}
|
||||
|
||||
setup_venv() {
|
||||
python3 -m venv "$VIRTUAL_ENV"
|
||||
PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
|
||||
}
|
||||
|
||||
# And start the process up
|
||||
main
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
[project]
|
||||
name = "datafusion-benchmarks"
|
||||
version = "0.1.0"
|
||||
requires-python = ">=3.11"
|
||||
# typing_extensions is an undeclared dependency of falsa
|
||||
dependencies = ["rich", "falsa", "typing_extensions"]
|
||||
@@ -1,18 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
rich
|
||||
@@ -0,0 +1,5 @@
|
||||
[project]
|
||||
name = "datafusion-dev"
|
||||
version = "0.1.0"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = ["tomlkit", "PyGithub", "requests"]
|
||||
@@ -178,10 +178,10 @@ We maintain a [changelog] so our users know what has been changed between releas
|
||||
|
||||
The changelog is generated using a Python script.
|
||||
|
||||
To run the script, you will need a GitHub Personal Access Token (described in the prerequisites section) and the `PyGitHub` library. First install the `PyGitHub` dependency via `pip`:
|
||||
To run the script, you will need a GitHub Personal Access Token (described in the prerequisites section) and the `PyGitHub` library. First install the dev dependencies via `uv`:
|
||||
|
||||
```shell
|
||||
pip3 install PyGitHub
|
||||
uv sync
|
||||
```
|
||||
|
||||
To generate the changelog, set the `GITHUB_TOKEN` environment variable and then run `./dev/release/generate-changelog.py`
|
||||
@@ -199,7 +199,7 @@ to generate a change log of all changes between the `50.3.0` tag and `branch-51`
|
||||
|
||||
```shell
|
||||
export GITHUB_TOKEN=<your-token-here>
|
||||
./dev/release/generate-changelog.py 50.3.0 branch-51 51.0.0 > dev/changelog/51.0.0.md
|
||||
uv run ./dev/release/generate-changelog.py 50.3.0 branch-51 51.0.0 > dev/changelog/51.0.0.md
|
||||
```
|
||||
|
||||
This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
tomlkit
|
||||
PyGitHub
|
||||
@@ -19,7 +19,7 @@
|
||||
# Script that updates the arrow dependencies in datafusion locally
|
||||
#
|
||||
# installation:
|
||||
# pip install tomlkit requests
|
||||
# uv sync
|
||||
#
|
||||
# pin all arrow crates deps to a specific version:
|
||||
#
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
# Script that updates versions for datafusion crates, locally
|
||||
#
|
||||
# dependencies:
|
||||
# pip install tomlkit
|
||||
# uv sync
|
||||
|
||||
import re
|
||||
import argparse
|
||||
|
||||
+4
-11
@@ -25,19 +25,12 @@ https://datafusion.apache.org/ as part of the release process.
|
||||
|
||||
## Dependencies
|
||||
|
||||
It's recommended to install build dependencies and build the documentation
|
||||
inside a Python virtualenv.
|
||||
Install build dependencies and build the documentation using
|
||||
[uv](https://docs.astral.sh/uv/):
|
||||
|
||||
```sh
|
||||
python3 -m venv venv
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
If using [uv](https://docs.astral.sh/uv/) the script can be run like so without
|
||||
needing to create a virtual environment:
|
||||
|
||||
```sh
|
||||
uv run --with-requirements requirements.txt bash build.sh
|
||||
uv sync
|
||||
uv run bash build.sh
|
||||
```
|
||||
|
||||
The docs build regenerates the workspace dependency graph via
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
[project]
|
||||
name = "datafusion-docs"
|
||||
version = "0.1.0"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"sphinx>=9,<10",
|
||||
"sphinx-reredirects>=1.1,<2",
|
||||
"pydata-sphinx-theme>=0.16,<1",
|
||||
"myst-parser>=5,<6",
|
||||
"maturin>=1.11,<2",
|
||||
"jinja2>=3.1,<4",
|
||||
"setuptools>=82,<83",
|
||||
]
|
||||
@@ -1,24 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
sphinx==9.1.0
|
||||
sphinx-reredirects==1.1.0
|
||||
pydata-sphinx-theme==0.16.1
|
||||
myst-parser==5.0.0
|
||||
maturin==1.12.2
|
||||
jinja2==3.1.6
|
||||
setuptools==82.0.0
|
||||
@@ -26,7 +26,7 @@ For example:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
datafusion = "52.0.0"
|
||||
datafusion = "52.1.0"
|
||||
```
|
||||
|
||||
While DataFusion is distributed via [crates.io] as a convenience, the
|
||||
|
||||
@@ -156,7 +156,7 @@ By default, Datafusion returns errors as a plain text message. You can enable mo
|
||||
such as backtraces by enabling the `backtrace` feature to your `Cargo.toml` file like this:
|
||||
|
||||
```toml
|
||||
datafusion = { version = "52.0.0", features = ["backtrace"]}
|
||||
datafusion = { version = "52.1.0", features = ["backtrace"]}
|
||||
```
|
||||
|
||||
Set environment [variables](https://doc.rust-lang.org/std/backtrace/index.html#environment-variables)
|
||||
|
||||
@@ -29,7 +29,7 @@ Find latest available Datafusion version on [DataFusion's
|
||||
crates.io] page. Add the dependency to your `Cargo.toml` file:
|
||||
|
||||
```toml
|
||||
datafusion = "52.0.0"
|
||||
datafusion = "52.1.0"
|
||||
tokio = { version = "1.0", features = ["rt-multi-thread"] }
|
||||
```
|
||||
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
[tool.uv.workspace]
|
||||
members = ["benchmarks", "dev", "docs"]
|
||||
Reference in New Issue
Block a user