mirror of
https://github.com/langchain-ai/datafusion.git
synced 2026-07-01 21:24:06 -04:00
Fix incorrect regex pattern in regex_replace_posix_groups (#19827)
The `regex_replace_posix_groups` method was using the pattern `(\d*)` to
match
POSIX capture group references like `\1`. However, `*` matches zero or
more
digits, which caused a lone backslash `\` to incorrectly become `${}`.
Changed to `(\d+)` which requires at least one digit, fixing the issue.
Added unit tests to validate correct behavior.
- Fixes #19766
---------
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
This commit is contained in:
@@ -189,13 +189,19 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
|
||||
}
|
||||
}
|
||||
|
||||
/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
|
||||
/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like ${1})
|
||||
/// used by regexp_replace
|
||||
/// Handles both single backslash (\1) and double backslash (\\1) which can occur
|
||||
/// when SQL strings with escaped backslashes are passed through
|
||||
///
|
||||
/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax
|
||||
/// substitutes the entire match. This is consistent with POSIX behavior where
|
||||
/// \0 (or &) refers to the entire matched string.
|
||||
fn regex_replace_posix_groups(replacement: &str) -> String {
|
||||
static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
|
||||
LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap());
|
||||
LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap());
|
||||
CAPTURE_GROUPS_RE_LOCK
|
||||
.replace_all(replacement, "$${$2}")
|
||||
.replace_all(replacement, "$${$1}")
|
||||
.into_owned()
|
||||
}
|
||||
|
||||
@@ -659,6 +665,42 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_regex_replace_posix_groups() {
|
||||
// Test that \1, \2, etc. are replaced with ${1}, ${2}, etc.
|
||||
assert_eq!(regex_replace_posix_groups(r"\1"), "${1}");
|
||||
assert_eq!(regex_replace_posix_groups(r"\12"), "${12}");
|
||||
assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y");
|
||||
assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}");
|
||||
|
||||
// Test double backslash (from SQL escaped strings like '\\1')
|
||||
assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}");
|
||||
assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y");
|
||||
assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}");
|
||||
|
||||
// Test 3 or 4 backslashes before digits to document expected behavior
|
||||
assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}");
|
||||
assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}");
|
||||
assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}");
|
||||
|
||||
// Test that a lone backslash is NOT replaced (requires at least one digit)
|
||||
assert_eq!(regex_replace_posix_groups(r"\"), r"\");
|
||||
assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar");
|
||||
|
||||
// Test that backslash followed by non-digit is preserved
|
||||
assert_eq!(regex_replace_posix_groups(r"\n"), r"\n");
|
||||
assert_eq!(regex_replace_posix_groups(r"\t"), r"\t");
|
||||
|
||||
// Test \0 behavior: \0 is converted to ${0}, which in Rust's regex
|
||||
// replacement syntax substitutes the entire match. This is consistent
|
||||
// with POSIX behavior where \0 (or &) refers to the entire matched string.
|
||||
assert_eq!(regex_replace_posix_groups(r"\0"), "${0}");
|
||||
assert_eq!(
|
||||
regex_replace_posix_groups(r"prefix\0suffix"),
|
||||
"prefix${0}suffix"
|
||||
);
|
||||
}
|
||||
|
||||
macro_rules! static_pattern_regexp_replace {
|
||||
($name:ident, $T:ty, $O:ty) => {
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user