fix JSON decoder error checking for UTF16 / surrogate parsing panic (#7721)

# Which issue does this PR close?

- Closes #7712.

# Rationale for this change

The decoder shouldn't panic on malformed input, especially in a fallible function that already returns a `Result`.

# What changes are included in this PR?

Validate that the high and low surrogates are in the expected range,
which guarantees that the subtractions won't overflow.

# Are there any user-facing changes?

No (well, things that used to panic now won't, but I don't think that
counts)
This commit is contained in:
Nick Lanham
2025-06-22 05:34:28 -07:00
committed by GitHub
parent e54b72bc4d
commit 2788762c63
+21 -3
View File
@@ -705,9 +705,16 @@ fn err(b: u8, ctx: &str) -> ArrowError {
/// Creates a character from a UTF-16 surrogate pair
///
/// `low` is the trailing code unit and must lie in `0xDC00..=0xDFFF`; `high` is the
/// leading code unit and must lie in `0xD800..=0xDBFF`.
///
/// # Errors
///
/// Returns [`ArrowError::JsonError`] if either code unit is outside its expected
/// range, or if the combined value is not a valid Unicode scalar value.
fn char_from_surrogate_pair(low: u16, high: u16) -> Result<char, ArrowError> {
    match (low, high) {
        (0xDC00..=0xDFFF, 0xD800..=0xDBFF) => {
            // The range check above guarantees neither subtraction can underflow.
            let high_bits = u32::from(high - 0xD800) << 10;
            let low_bits = u32::from(low - 0xDC00);
            // The 0x1_0000 offset must be added to the combined 20-bit value. Adding it
            // to the low half before OR-ing loses it whenever bit 16 of `high_bits` is
            // already set, which mis-decodes every code point at or above U+20000.
            let n = (high_bits | low_bits) + 0x1_0000;
            char::from_u32(n)
                .ok_or_else(|| ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}")))
        }
        _ => Err(ArrowError::JsonError(format!(
            "Invalid UTF-16 surrogate pair. High: {high:#02X}, Low: {low:#02X}"
        ))),
    }
}
/// Writes `c` as UTF-8 to `out`
@@ -951,4 +958,15 @@ mod tests {
let err = decoder.finish().unwrap_err().to_string();
assert_eq!(err, "Json error: Encountered truncated UTF-8 sequence");
}
#[test]
fn test_invalid_surrogates() {
    // Escaped surrogate pairs that are malformed: the first pairs a high surrogate
    // with another high surrogate, the second pairs a low surrogate with another low
    // surrogate. Decoding either must report an error.
    let malformed: [&[u8]; 2] = [
        b"{\"test\": \"\\ud800\\ud801\"}",
        b"{\"test\": \"\\udc00\\udc01\"}",
    ];

    for input in malformed {
        // A fresh decoder per input so state from one case cannot affect the other.
        let mut decoder = TapeDecoder::new(16, 2);
        let outcome = decoder.decode(input);
        assert!(outcome.is_err());
    }
}
}