mirror of
https://github.com/langchain-ai/datafusion.git
synced 2026-07-01 21:24:06 -04:00
fix: prevent duplicate alias collision with user-provided __datafusion_extracted names (#20432)
## Summary - Fixes a bug where the optimizer's `AliasGenerator` could produce alias names that collide with`__datafusion_extracted_N` aliases, causing a "Schema contains duplicate unqualified field name" error - I don't expect users themselves to create these aliases, but if you run the optimizers twice (with different `AliasGenerator` instances) you'll hit this. - Adds `AliasGenerator::update_min_id()` to advance the counter past existing aliases - Scans each plan node's expressions during `ExtractLeafExpressions` traversal to seed the generator before any extraction occurs - Switches to controlling the traversal which also means the config-based short circuit more clearly skips the entire rule. Closes https://github.com/apache/datafusion/issues/20430 ## Test plan - [x] Unit test: `test_user_provided_extracted_alias_no_collision` in `extract_leaf_expressions` - [x] SLT regression test in `projection_pushdown.slt` with explicit `__datafusion_extracted_2` alias 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
17d770d6e5
commit
670dbf481c
@@ -37,6 +37,16 @@ impl AliasGenerator {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Advance the counter to at least `min_id`, ensuring future aliases
|
||||
/// won't collide with already-existing ones.
|
||||
///
|
||||
/// For example, if the query already contains an alias `alias_42`, then calling
|
||||
/// `update_min_id(42)` will ensure that future aliases generated by this
|
||||
/// [`AliasGenerator`] will start from `alias_43`.
|
||||
pub fn update_min_id(&self, min_id: usize) {
|
||||
self.next_id.fetch_max(min_id + 1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Return a unique alias with the provided prefix
|
||||
pub fn next(&self, prefix: &str) -> String {
|
||||
let id = self.next_id.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
@@ -114,10 +114,6 @@ impl OptimizerRule for ExtractLeafExpressions {
|
||||
"extract_leaf_expressions"
|
||||
}
|
||||
|
||||
fn apply_order(&self) -> Option<ApplyOrder> {
|
||||
Some(ApplyOrder::TopDown)
|
||||
}
|
||||
|
||||
fn rewrite(
|
||||
&self,
|
||||
plan: LogicalPlan,
|
||||
@@ -127,10 +123,45 @@ impl OptimizerRule for ExtractLeafExpressions {
|
||||
return Ok(Transformed::no(plan));
|
||||
}
|
||||
let alias_generator = config.alias_generator();
|
||||
extract_from_plan(plan, alias_generator)
|
||||
|
||||
// Advance the alias generator past any user-provided __datafusion_extracted_N
|
||||
// aliases to prevent collisions when generating new extraction aliases.
|
||||
advance_generator_past_existing(&plan, alias_generator)?;
|
||||
|
||||
plan.transform_down_with_subqueries(|plan| {
|
||||
extract_from_plan(plan, alias_generator)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Scans the current plan node's expressions for pre-existing
|
||||
/// `__datafusion_extracted_N` aliases and advances the generator
|
||||
/// counter past them to avoid collisions with user-provided aliases.
|
||||
fn advance_generator_past_existing(
|
||||
plan: &LogicalPlan,
|
||||
alias_generator: &AliasGenerator,
|
||||
) -> Result<()> {
|
||||
plan.apply(|plan| {
|
||||
plan.expressions().iter().try_for_each(|expr| {
|
||||
expr.apply(|e| {
|
||||
if let Expr::Alias(alias) = e
|
||||
&& let Some(id) = alias
|
||||
.name
|
||||
.strip_prefix(EXTRACTED_EXPR_PREFIX)
|
||||
.and_then(|s| s.strip_prefix('_'))
|
||||
.and_then(|s| s.parse().ok())
|
||||
{
|
||||
alias_generator.update_min_id(id);
|
||||
}
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
})?;
|
||||
Ok::<(), datafusion_common::error::DataFusionError>(())
|
||||
})?;
|
||||
Ok(TreeNodeRecursion::Continue)
|
||||
})
|
||||
.map(|_| ())
|
||||
}
|
||||
|
||||
/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node.
|
||||
///
|
||||
/// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes
|
||||
|
||||
@@ -1949,3 +1949,44 @@ ORDER BY simple_struct.id;
|
||||
3 3
|
||||
4 4
|
||||
5 5
|
||||
|
||||
# =========================================================================
|
||||
# Regression: user-provided __datafusion_extracted aliases must not
|
||||
# collide with optimizer-generated ones
|
||||
# (https://github.com/apache/datafusion/issues/20430)
|
||||
# =========================================================================
|
||||
|
||||
statement ok
|
||||
COPY ( select {f1: 1, f2: 2} as s
|
||||
) TO 'test_files/scratch/projection_pushdown/test.parquet'
|
||||
STORED AS PARQUET;
|
||||
|
||||
statement ok
|
||||
CREATE EXTERNAL TABLE t
|
||||
STORED AS PARQUET
|
||||
LOCATION 'test_files/scratch/projection_pushdown/test.parquet';
|
||||
|
||||
# Verify that the user-provided __datafusion_extracted_2 alias is preserved
|
||||
# and the optimizer skips to _3 and _4 for its generated aliases.
|
||||
query TT
|
||||
EXPLAIN SELECT
|
||||
get_field(s, 'f1') AS __datafusion_extracted_2
|
||||
FROM t
|
||||
WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
|
||||
----
|
||||
logical_plan
|
||||
01)Projection: __datafusion_extracted_2
|
||||
02)--Filter: CASE WHEN __datafusion_extracted_3 IS NOT NULL THEN __datafusion_extracted_3 ELSE __datafusion_extracted_4 END = Int64(1)
|
||||
03)----Projection: get_field(t.s, Utf8("f1")) AS __datafusion_extracted_3, get_field(t.s, Utf8("f2")) AS __datafusion_extracted_4, get_field(t.s, Utf8("f1")) AS __datafusion_extracted_2
|
||||
04)------TableScan: t projection=[s], partial_filters=[CASE WHEN get_field(t.s, Utf8("f1")) IS NOT NULL THEN get_field(t.s, Utf8("f1")) ELSE get_field(t.s, Utf8("f2")) END = Int64(1)]
|
||||
physical_plan
|
||||
01)FilterExec: CASE WHEN __datafusion_extracted_3@0 IS NOT NULL THEN __datafusion_extracted_3@0 ELSE __datafusion_extracted_4@1 END = 1, projection=[__datafusion_extracted_2@2]
|
||||
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/test.parquet]]}, projection=[get_field(s@0, f1) as __datafusion_extracted_3, get_field(s@0, f2) as __datafusion_extracted_4, get_field(s@0, f1) as __datafusion_extracted_2], file_type=parquet
|
||||
|
||||
query I
|
||||
SELECT
|
||||
get_field(s, 'f1') AS __datafusion_extracted_2
|
||||
FROM t
|
||||
WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
|
||||
----
|
||||
1
|
||||
|
||||
Reference in New Issue
Block a user