Fix incorrect UTF-8 detection

This commit is contained in:
dylni
2022-11-20 21:14:10 -05:00
parent 4e25e30302
commit 86c56812b6
2 changed files with 21 additions and 39 deletions
+1 -1
View File
@@ -105,9 +105,9 @@ where
// This condition is optimized to detect surrogate code points.
} else if code_point & 0xFE0 == 0x360 {
self.still_utf8 = false;
if code_point & 0x10 == 0 {
self.surrogate = true;
self.still_utf8 = false;
} else if prev_surrogate {
// Decoding a broken surrogate pair would be lossy.
invalid = true;
+20 -38
View File
@@ -6,15 +6,11 @@ mod common;
use common::Result;
use common::WTF8_STRING;
const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
const UTF8_STRING: &str = "string";
fn test_string_is_invalid_utf8(string: &[u8]) {
fn assert_string_is_invalid_utf8(string: &[u8]) {
assert!(str::from_utf8(string).is_err());
}
fn test_invalid_result(result: &Result<()>) {
fn assert_invalid_result(result: &Result<()>) {
if cfg!(windows) {
assert!(result.is_err());
} else {
@@ -23,51 +19,37 @@ fn test_invalid_result(result: &Result<()>) {
}
#[test]
fn test_empty_bytes() {
fn test_empty() {
common::test_utf8_bytes("");
}
#[test]
fn test_empty_vec() {
common::test_utf8_vec("");
}
#[test]
fn test_nonempty_utf8_bytes() {
common::test_utf8_bytes(UTF8_STRING);
}
fn test_nonempty_utf8() {
const UTF8_STRING: &str = "string";
#[test]
fn test_nonempty_utf8_vec() {
common::test_utf8_bytes(UTF8_STRING);
common::test_utf8_vec(UTF8_STRING);
}
#[test]
fn test_invalid_string_is_invalid_utf8() {
test_string_is_invalid_utf8(INVALID_STRING);
fn test_invalid() {
const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
assert_string_is_invalid_utf8(INVALID_STRING);
assert_invalid_result(&common::test_bytes(INVALID_STRING));
assert_invalid_result(&common::test_vec(INVALID_STRING));
}
#[test]
fn test_invalid_bytes() {
test_invalid_result(&common::test_bytes(INVALID_STRING));
}
fn test_wtf8() {
const HIGH_SURROGATE: &[u8] = b"\xED\xA0\x80";
const LOW_SURROGATE: &[u8] = b"\xED\xB0\x80";
#[test]
fn test_invalid_vec() {
test_invalid_result(&common::test_vec(INVALID_STRING));
}
for string in [WTF8_STRING, HIGH_SURROGATE, LOW_SURROGATE] {
assert_string_is_invalid_utf8(string);
#[test]
fn test_wtf8_string_is_invalid_utf8() {
test_string_is_invalid_utf8(WTF8_STRING);
}
#[test]
fn test_wtf8_bytes() {
assert_eq!(Ok(()), common::test_bytes(WTF8_STRING));
}
#[test]
fn test_wtf8_vec() {
assert_eq!(Ok(()), common::test_vec(WTF8_STRING));
assert_eq!(Ok(()), common::test_bytes(string));
assert_eq!(Ok(()), common::test_vec(string));
}
}