mirror of
https://github.com/langchain-ai/arrow-rs.git
synced 2026-07-01 21:34:01 -04:00
fix JSON decoder error checking for UTF16 / surrogate parsing panic (#7721)
# Which issue does this PR close? - Closes #7712. # Rationale for this change The decoder shouldn't panic, especially in a fallible function. # What changes are included in this PR? Validate that the high and low surrogates are in the expected ranges, which guarantees that the subtractions won't overflow. # Are there any user-facing changes? No (well, inputs that used to panic now return an error instead, but I don't think that counts).
This commit is contained in:
@@ -705,9 +705,16 @@ fn err(b: u8, ctx: &str) -> ArrowError {
|
||||
|
||||
/// Creates a character from an UTF-16 surrogate pair
|
||||
fn char_from_surrogate_pair(low: u16, high: u16) -> Result<char, ArrowError> {
|
||||
let n = (((high - 0xD800) as u32) << 10) | ((low - 0xDC00) as u32 + 0x1_0000);
|
||||
char::from_u32(n)
|
||||
.ok_or_else(|| ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}")))
|
||||
match (low, high) {
|
||||
(0xDC00..=0xDFFF, 0xD800..=0xDBFF) => {
|
||||
let n = (((high - 0xD800) as u32) << 10) | ((low - 0xDC00) as u32 + 0x1_0000);
|
||||
char::from_u32(n)
|
||||
.ok_or_else(|| ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}")))
|
||||
}
|
||||
_ => Err(ArrowError::JsonError(format!(
|
||||
"Invalid UTF-16 surrogate pair. High: {high:#02X}, Low: {low:#02X}"
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes `c` as UTF-8 to `out`
|
||||
@@ -951,4 +958,15 @@ mod tests {
|
||||
let err = decoder.finish().unwrap_err().to_string();
|
||||
assert_eq!(err, "Json error: Encountered truncated UTF-8 sequence");
|
||||
}
|
||||
|
||||
#[test]
fn test_invalid_surrogates() {
    // Each document contains an escaped UTF-16 pair that is not a valid
    // high/low combination: first two high surrogates, then two low surrogates.
    let inputs: [&[u8]; 2] = [
        b"{\"test\": \"\\ud800\\ud801\"}",
        b"{\"test\": \"\\udc00\\udc01\"}",
    ];

    for input in inputs {
        // Use a fresh decoder per input so the cases cannot influence each other.
        let mut decoder = TapeDecoder::new(16, 2);
        assert!(decoder.decode(input).is_err());
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user