mirror of
https://github.com/langchain-ai/datafusion.git
synced 2026-07-01 21:24:06 -04:00
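## Which issue does this PR close? - Closes #20503 ## Rationale for this change `split_part` did not handle empty delimiters in a PostgreSQL-compatible way: Rust's `str::split("")` yields leading and trailing empty fields (`"ab"` -> `["", "a", "b", ""]`). This could return unexpected results for positions like `1` / `-1` and for out-of-range values. This PR aligns the behavior with PostgreSQL semantics for empty delimiters. ## What changes are included in this PR? A small change to how positions `1` and `-1` are handled when the delimiter is empty: the input is treated as a single field, so `1` and `-1` return the whole string and any other non-zero position returns an empty string. ## Are these changes tested? Yes. Unit tests and SQL query tests cover positions `1`, `2`, `-1`, and `-2` with an empty delimiter. ## Are there any user-facing changes? Yes, behavior is now more consistent with PostgreSQL for `split_part(str, '', n)`. No API changes.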
This commit is contained in:
@@ -231,7 +231,15 @@ where
|
||||
"split_part index {n} exceeds maximum supported value"
|
||||
)
|
||||
})?;
|
||||
string.split(delimiter).nth(idx)
|
||||
|
||||
if delimiter.is_empty() {
|
||||
// Match PostgreSQL split_part behavior for empty delimiter:
|
||||
// treat the input as a single field ("ab" -> ["ab"]),
|
||||
// rather than Rust's split("") result (["", "a", "b", ""]).
|
||||
(n == 1).then_some(string)
|
||||
} else {
|
||||
string.split(delimiter).nth(idx)
|
||||
}
|
||||
}
|
||||
std::cmp::Ordering::Less => {
|
||||
// Negative index: use rsplit().nth() to efficiently get from the end
|
||||
@@ -241,7 +249,14 @@ where
|
||||
"split_part index {n} exceeds minimum supported value"
|
||||
)
|
||||
})?;
|
||||
string.rsplit(delimiter).nth(idx)
|
||||
if delimiter.is_empty() {
|
||||
// Match PostgreSQL split_part behavior for empty delimiter:
|
||||
// treat the input as a single field ("ab" -> ["ab"]),
|
||||
// rather than Rust's split("") result (["", "a", "b", ""]).
|
||||
(n == -1).then_some(string)
|
||||
} else {
|
||||
string.rsplit(delimiter).nth(idx)
|
||||
}
|
||||
}
|
||||
std::cmp::Ordering::Equal => {
|
||||
return exec_err!("field position must not be zero");
|
||||
@@ -341,6 +356,117 @@ mod tests {
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
// Edge cases with delimiters
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
|
||||
],
|
||||
Ok(Some("a")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(",")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
|
||||
],
|
||||
Ok(Some("")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
|
||||
],
|
||||
Ok(Some("a,b")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
|
||||
],
|
||||
Ok(Some("")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
|
||||
],
|
||||
Ok(Some("a,b")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(2))),
|
||||
],
|
||||
Ok(Some("")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
|
||||
// Edge cases with delimiters with negative n
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
|
||||
],
|
||||
Ok(Some("a,b")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(" ")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(-1))),
|
||||
],
|
||||
Ok(Some("a,b")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
test_function!(
|
||||
SplitPartFunc::new(),
|
||||
vec![
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("a,b")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("")))),
|
||||
ColumnarValue::Scalar(ScalarValue::Int64(Some(-2))),
|
||||
],
|
||||
Ok(Some("")),
|
||||
&str,
|
||||
Utf8,
|
||||
StringArray
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -701,6 +701,26 @@ SELECT split_part('abc~@~def~@~ghi', '~@~', -100)
|
||||
----
|
||||
(empty)
|
||||
|
||||
query T
|
||||
SELECT split_part('a,b', '', 1)
|
||||
----
|
||||
a,b
|
||||
|
||||
query T
|
||||
SELECT split_part('a,b', '', -1)
|
||||
----
|
||||
a,b
|
||||
|
||||
query T
|
||||
SELECT split_part('a,b', '', 2)
|
||||
----
|
||||
(empty)
|
||||
|
||||
query T
|
||||
SELECT split_part('a,b', '', -2)
|
||||
----
|
||||
(empty)
|
||||
|
||||
statement error DataFusion error: Execution error: field position must not be zero
|
||||
SELECT split_part('abc~@~def~@~ghi', '~@~', 0)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user