Fix incorrect regex pattern in regex_replace_posix_groups (#19827)

The `regex_replace_posix_groups` method was using the pattern `(\d*)` to
match
POSIX capture group references like `\1`. However, `*` matches zero or
more
digits, which caused a lone backslash `\` to incorrectly become `${}`.

Changed to `(\d+)` which requires at least one digit, fixing the issue.

Added unit tests to validate correct behavior.

- Fixes #19766

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
This commit is contained in:
Ganesh Patil
2026-02-25 01:56:06 +05:30
committed by GitHub
parent e80694e369
commit b8cebdde2a
@@ -189,13 +189,19 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
}
}
/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like ${1})
/// used by regexp_replace
/// Handles both single backslash (\1) and double backslash (\\1) which can occur
/// when SQL strings with escaped backslashes are passed through
///
/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax
/// substitutes the entire match. This is consistent with POSIX behavior where
/// \0 (or &) refers to the entire matched string.
fn regex_replace_posix_groups(replacement: &str) -> String {
static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap());
LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap());
CAPTURE_GROUPS_RE_LOCK
.replace_all(replacement, "$${$2}")
.replace_all(replacement, "$${$1}")
.into_owned()
}
@@ -659,6 +665,42 @@ mod tests {
use super::*;
#[test]
fn test_regex_replace_posix_groups() {
// Test that \1, \2, etc. are replaced with ${1}, ${2}, etc.
assert_eq!(regex_replace_posix_groups(r"\1"), "${1}");
assert_eq!(regex_replace_posix_groups(r"\12"), "${12}");
assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y");
assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}");
// Test double backslash (from SQL escaped strings like '\\1')
assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}");
assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y");
assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}");
// Test 3 or 4 backslashes before digits to document expected behavior
assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}");
assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}");
assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}");
// Test that a lone backslash is NOT replaced (requires at least one digit)
assert_eq!(regex_replace_posix_groups(r"\"), r"\");
assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar");
// Test that backslash followed by non-digit is preserved
assert_eq!(regex_replace_posix_groups(r"\n"), r"\n");
assert_eq!(regex_replace_posix_groups(r"\t"), r"\t");
// Test \0 behavior: \0 is converted to ${0}, which in Rust's regex
// replacement syntax substitutes the entire match. This is consistent
// with POSIX behavior where \0 (or &) refers to the entire matched string.
assert_eq!(regex_replace_posix_groups(r"\0"), "${0}");
assert_eq!(
regex_replace_posix_groups(r"prefix\0suffix"),
"prefix${0}suffix"
);
}
macro_rules! static_pattern_regexp_replace {
($name:ident, $T:ty, $O:ty) => {
#[test]