From 670dbf481cff6bae9ae0892ac1ecc1b1ab90bea3 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 24 Feb 2026 15:02:59 +0000 Subject: [PATCH] fix: prevent duplicate alias collision with user-provided __datafusion_extracted names (#20432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Fixes a bug where the optimizer's `AliasGenerator` could produce alias names that collide with`__datafusion_extracted_N` aliases, causing a "Schema contains duplicate unqualified field name" error - I don't expect users themselves to create these aliases, but if you run the optimizers twice (with different `AliasGenerator` instances) you'll hit this. - Adds `AliasGenerator::update_min_id()` to advance the counter past existing aliases - Scans each plan node's expressions during `ExtractLeafExpressions` traversal to seed the generator before any extraction occurs - Switches to controlling the traversal which also means the config-based short circuit more clearly skips the entire rule. Closes https://github.com/apache/datafusion/issues/20430 ## Test plan - [x] Unit test: `test_user_provided_extracted_alias_no_collision` in `extract_leaf_expressions` - [x] SLT regression test in `projection_pushdown.slt` with explicit `__datafusion_extracted_2` alias 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 --- datafusion/common/src/alias.rs | 10 +++++ .../optimizer/src/extract_leaf_expressions.rs | 41 ++++++++++++++++--- .../test_files/projection_pushdown.slt | 41 +++++++++++++++++++ 3 files changed, 87 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/alias.rs b/datafusion/common/src/alias.rs index 2ee2cb4dc..99f6447a6 100644 --- a/datafusion/common/src/alias.rs +++ b/datafusion/common/src/alias.rs @@ -37,6 +37,16 @@ impl AliasGenerator { Self::default() } + /// Advance the counter to at least `min_id`, ensuring future aliases + /// won't collide with already-existing ones. + /// + /// For example, if the query already contains an alias `alias_42`, then calling + /// `update_min_id(42)` will ensure that future aliases generated by this + /// [`AliasGenerator`] will start from `alias_43`. + pub fn update_min_id(&self, min_id: usize) { + self.next_id.fetch_max(min_id + 1, Ordering::Relaxed); + } + /// Return a unique alias with the provided prefix pub fn next(&self, prefix: &str) -> String { let id = self.next_id.fetch_add(1, Ordering::Relaxed); diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index f5f4982e3..922ea7933 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -114,10 +114,6 @@ impl OptimizerRule for ExtractLeafExpressions { "extract_leaf_expressions" } - fn apply_order(&self) -> Option { - Some(ApplyOrder::TopDown) - } - fn rewrite( &self, plan: LogicalPlan, @@ -127,10 +123,45 @@ impl OptimizerRule for ExtractLeafExpressions { return Ok(Transformed::no(plan)); } let alias_generator = config.alias_generator(); - extract_from_plan(plan, alias_generator) + + // Advance the alias generator past any user-provided __datafusion_extracted_N + // aliases to prevent collisions when generating new extraction aliases. + advance_generator_past_existing(&plan, alias_generator)?; + + plan.transform_down_with_subqueries(|plan| { + extract_from_plan(plan, alias_generator) + }) } } +/// Scans the current plan node's expressions for pre-existing +/// `__datafusion_extracted_N` aliases and advances the generator +/// counter past them to avoid collisions with user-provided aliases. +fn advance_generator_past_existing( + plan: &LogicalPlan, + alias_generator: &AliasGenerator, +) -> Result<()> { + plan.apply(|plan| { + plan.expressions().iter().try_for_each(|expr| { + expr.apply(|e| { + if let Expr::Alias(alias) = e + && let Some(id) = alias + .name + .strip_prefix(EXTRACTED_EXPR_PREFIX) + .and_then(|s| s.strip_prefix('_')) + .and_then(|s| s.parse().ok()) + { + alias_generator.update_min_id(id); + } + Ok(TreeNodeRecursion::Continue) + })?; + Ok::<(), datafusion_common::error::DataFusionError>(()) + })?; + Ok(TreeNodeRecursion::Continue) + }) + .map(|_| ()) +} + /// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. /// /// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index c25b80a0d..dbb77b33c 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1949,3 +1949,44 @@ ORDER BY simple_struct.id; 3 3 4 4 5 5 + +# ========================================================================= +# Regression: user-provided __datafusion_extracted aliases must not +# collide with optimizer-generated ones +# (https://github.com/apache/datafusion/issues/20430) +# ========================================================================= + +statement ok +COPY ( select {f1: 1, f2: 2} as s +) TO 'test_files/scratch/projection_pushdown/test.parquet' +STORED AS PARQUET; + +statement ok +CREATE EXTERNAL TABLE t +STORED AS PARQUET +LOCATION 'test_files/scratch/projection_pushdown/test.parquet'; + +# Verify that the user-provided __datafusion_extracted_2 alias is preserved +# and the optimizer skips to _3 and _4 for its generated aliases. +query TT +EXPLAIN SELECT + get_field(s, 'f1') AS __datafusion_extracted_2 +FROM t +WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1; +---- +logical_plan +01)Projection: __datafusion_extracted_2 +02)--Filter: CASE WHEN __datafusion_extracted_3 IS NOT NULL THEN __datafusion_extracted_3 ELSE __datafusion_extracted_4 END = Int64(1) +03)----Projection: get_field(t.s, Utf8("f1")) AS __datafusion_extracted_3, get_field(t.s, Utf8("f2")) AS __datafusion_extracted_4, get_field(t.s, Utf8("f1")) AS __datafusion_extracted_2 +04)------TableScan: t projection=[s], partial_filters=[CASE WHEN get_field(t.s, Utf8("f1")) IS NOT NULL THEN get_field(t.s, Utf8("f1")) ELSE get_field(t.s, Utf8("f2")) END = Int64(1)] +physical_plan +01)FilterExec: CASE WHEN __datafusion_extracted_3@0 IS NOT NULL THEN __datafusion_extracted_3@0 ELSE __datafusion_extracted_4@1 END = 1, projection=[__datafusion_extracted_2@2] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/test.parquet]]}, projection=[get_field(s@0, f1) as __datafusion_extracted_3, get_field(s@0, f2) as __datafusion_extracted_4, get_field(s@0, f1) as __datafusion_extracted_2], file_type=parquet + +query I +SELECT + get_field(s, 'f1') AS __datafusion_extracted_2 +FROM t +WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1; +---- +1