fix: handle empty delimiter in split_part (closes #20503) (#20542)

## Which issue does this PR close?

- Closes #20503 

## Rationale for this change

`split_part` did not handle empty delimiters in a PostgreSQL-compatible
way: Rust's `str::split("")` splits between every character and adds
leading/trailing empty fields (e.g. `"ab"` -> `["", "a", "b", ""]`).
As a result, positions like `1` / `-1` and out-of-range values could
return unexpected results.
This PR aligns behavior with Postgres semantics for empty delimiters.

## What changes are included in this PR?

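When the delimiter is empty, `split_part` now treats the whole input as a
single field: positions `1` and `-1` return the input string unchanged, and
any other non-zero position returns an empty string. Behavior for non-empty
delimiters is unchanged.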

## Are these changes tested?

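Yes. New unit tests and sqllogictest cases cover empty delimiters with
positive and negative positions (`1`, `2`, `-1`, `-2`).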

## Are there any user-facing changes?

Yes, behavior is now more consistent with PostgreSQL for
`split_part(str, '', n)`.
No API changes.
This commit is contained in:
Gabriel Ferraté
2026-02-26 22:03:34 +01:00
committed by GitHub
parent e76f0eebe3
commit 3ab1301c53
2 changed files with 148 additions and 2 deletions
+128 -2
View File
@@ -231,7 +231,15 @@ where
"split_part index {n} exceeds maximum supported value"
)
})?;
string.split(delimiter).nth(idx)
if delimiter.is_empty() {
// Match PostgreSQL split_part behavior for empty delimiter:
// treat the input as a single field ("ab" -> ["ab"]),
// rather than Rust's split("") result (["", "a", "b", ""]).
(n == 1).then_some(string)
} else {
string.split(delimiter).nth(idx)
}
}
std::cmp::Ordering::Less => {
// Negative index: use rsplit().nth() to efficiently get from the end
@@ -241,7 +249,14 @@ where
"split_part index {n} exceeds minimum supported value"
)
})?;
string.rsplit(delimiter).nth(idx)
if delimiter.is_empty() {
// Match PostgreSQL split_part behavior for empty delimiter:
// treat the input as a single field ("ab" -> ["ab"]),
// rather than Rust's split("") result (["", "a", "b", ""]).
(n == -1).then_some(string)
} else {
string.rsplit(delimiter).nth(idx)
}
}
std::cmp::Ordering::Equal => {
return exec_err!("field position must not be zero");
@@ -341,6 +356,117 @@ mod tests {
Utf8,
StringArray
);
// Edge cases with delimiters
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
],
Ok(Some("a")),
&str,
Utf8,
StringArray
);
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
],
Ok(Some("")),
&str,
Utf8,
StringArray
);
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
],
Ok(Some("a,b")),
&str,
Utf8,
StringArray
);
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
],
Ok(Some("")),
&str,
Utf8,
StringArray
);
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
],
Ok(Some("a,b")),
&str,
Utf8,
StringArray
);
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
],
Ok(Some("")),
&str,
Utf8,
StringArray
);
// Edge cases with delimiters with negative n
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
],
Ok(Some("a,b")),
&str,
Utf8,
StringArray
);
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
],
Ok(Some("a,b")),
&str,
Utf8,
StringArray
);
test_function!(
SplitPartFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
ColumnarValue::Scalar(ScalarValue::Int64(Some(-2))),
],
Ok(Some("")),
&str,
Utf8,
StringArray
);
Ok(())
}
@@ -701,6 +701,26 @@ SELECT split_part('abc~@~def~@~ghi', '~@~', -100)
----
(empty)
query T
SELECT split_part('a,b', '', 1)
----
a,b
query T
SELECT split_part('a,b', '', -1)
----
a,b
query T
SELECT split_part('a,b', '', 2)
----
(empty)
query T
SELECT split_part('a,b', '', -2)
----
(empty)
statement error DataFusion error: Execution error: field position must not be zero
SELECT split_part('abc~@~def~@~ghi', '~@~', 0)