mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-09 03:15:11 +00:00
Bug 1405615 - encoding_rs 0.7.1: Correctly encode U+DC00 followed by another low surrogate from UTF-16. r=emk.
The `wrapping_sub()`-based high surrogate check was off by one due to an error in copy and paste when defining the constant to compare against. That is, the subtraction that defines the constant was completely wrong but the result of the subtraction was only off by one, which is why the bug wasn't discovered immediately. This led to the first low surrogate (U+DC00), and only the first low surrogate, getting accepted as a high surrogate. Discovered using cargo-fuzz. MozReview-Commit-ID: K3Ptws31WuV --HG-- extra : rebase_source : ef4c38214bba3bf72133d890fc1cce847024c81a
This commit is contained in:
parent
a928cba1ca
commit
2092982126
File diff suppressed because one or more lines are too long
20
third_party/rust/encoding_rs/Cargo.toml
vendored
20
third_party/rust/encoding_rs/Cargo.toml
vendored
@ -12,7 +12,7 @@
|
||||
|
||||
[package]
|
||||
name = "encoding_rs"
|
||||
version = "0.7.0"
|
||||
version = "0.7.1"
|
||||
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
|
||||
description = "A Gecko-oriented implementation of the Encoding Standard"
|
||||
homepage = "https://docs.rs/encoding_rs/"
|
||||
@ -24,27 +24,27 @@ license = "MIT/Apache-2.0"
|
||||
repository = "https://github.com/hsivonen/encoding_rs"
|
||||
[profile.release]
|
||||
lto = true
|
||||
[dependencies.cfg-if]
|
||||
version = "0.1.0"
|
||||
[dependencies.serde]
|
||||
version = "1.0"
|
||||
optional = true
|
||||
|
||||
[dependencies.simd]
|
||||
version = "0.2.0"
|
||||
optional = true
|
||||
|
||||
[dependencies.serde]
|
||||
version = "1.0"
|
||||
optional = true
|
||||
[dependencies.cfg-if]
|
||||
version = "0.1.0"
|
||||
[dev-dependencies.serde_derive]
|
||||
version = "1.0"
|
||||
|
||||
[dev-dependencies.serde_json]
|
||||
version = "1.0"
|
||||
|
||||
[dev-dependencies.bincode]
|
||||
version = "0.8"
|
||||
|
||||
[dev-dependencies.serde_json]
|
||||
version = "1.0"
|
||||
|
||||
[features]
|
||||
no-static-ideograph-encoder-tables = []
|
||||
simd-accel = ["simd"]
|
||||
no-static-ideograph-encoder-tables = []
|
||||
[badges.travis-ci]
|
||||
repository = "hsivonen/encoding_rs"
|
||||
|
17
third_party/rust/encoding_rs/Ideas.md
vendored
17
third_party/rust/encoding_rs/Ideas.md
vendored
@ -7,14 +7,6 @@ The current plan for a SIMD-accelerated inner loop for handling ASCII bytes
|
||||
makes no use of the bit of information that if the buffers didn't end but the
|
||||
ASCII loop exited, the next byte will not be an ASCII byte.
|
||||
|
||||
## The structure of handles.rs and bound checks
|
||||
|
||||
handles.rs is designed to make it possible to avoid bound checks when writing
|
||||
to the slices. While it would be possible to omit the bound checks manually,
|
||||
it probably makes more sense to carry out an investigation to make sure that
|
||||
the compiler performs the omission. If not, it makes more sense to file a bug
|
||||
on the compiler than to omit the checks manually.
|
||||
|
||||
## Handling ASCII with table lookups when decoding single-byte to UTF-16
|
||||
|
||||
Both uconv and ICU outperform encoding_rs when decoding single-byte to UTF-16.
|
||||
@ -75,3 +67,12 @@ fully Unicode-ordered. Is "mostly" good enough for encode acceleration?
|
||||
Experiment with a function that computes `(i / 94, i % 94)` more efficiently
|
||||
than generic code.
|
||||
|
||||
## Align writes on Aarch64
|
||||
|
||||
On [Cortex-A57](https://stackoverflow.com/questions/45714535/performance-of-unaligned-simd-load-store-on-aarch64/45938112#45938112
|
||||
), it might be a good idea to move the destination into 16-byte alignment.
|
||||
|
||||
## Unalign UTF-8 validation on Aarch64
|
||||
|
||||
Currently, Aarch64 runs the generic ALU UTF-8 validation code that aligns
|
||||
reads. That's probably unnecessary on Aarch64. (SIMD was slower than ALU!)
|
||||
|
23
third_party/rust/encoding_rs/README.md
vendored
23
third_party/rust/encoding_rs/README.md
vendored
@ -63,17 +63,23 @@ using the C++ standard library and [GSL](https://github.com/Microsoft/GSL/) type
|
||||
For the Gecko context, there's a
|
||||
[C++ wrapper using the MFBT/XPCOM types](https://searchfox.org/mozilla-central/source/intl/Encoding.h#100).
|
||||
|
||||
## Sample programs
|
||||
|
||||
* [Rust](https://github.com/hsivonen/recode_rs)
|
||||
* [C](https://github.com/hsivonen/recode_c)
|
||||
* [C++](https://github.com/hsivonen/recode_cpp)
|
||||
|
||||
## Optional features
|
||||
|
||||
There are currently three optional cargo features:
|
||||
|
||||
### `simd-accel`
|
||||
|
||||
Enables SSE2 acceleration on x86, x86_64 and Aarch64. Requires nightly Rust.
|
||||
_Enabling this cargo feature is recommended when building for x86, x86_64 or
|
||||
Aarch64 on nightly Rust._ The intention is for the functionality enabled by
|
||||
this feature to become the normal on-by-default behavior once explicit SIMD
|
||||
becomes available on all Rust release channels.
|
||||
Enables SSE2 acceleration on x86 and x86_64 and NEON acceleration on Aarch64.
|
||||
Requires nightly Rust. _Enabling this cargo feature is recommended when
|
||||
building for x86, x86_64 or Aarch64 on nightly Rust._ The intention is for the
|
||||
functionality enabled by this feature to become the normal on-by-default
|
||||
behavior once explicit SIMD becomes available on all Rust release channels.
|
||||
|
||||
Enabling this feature breaks the build unless the target is x86 with SSE2
|
||||
(Rust's default 32-bit x86 target, `i686`, has SSE2, but Linux distros may
|
||||
@ -180,13 +186,18 @@ used in Firefox.
|
||||
range per encoding.
|
||||
- [x] Replace uconv with encoding_rs in Gecko.
|
||||
- [x] Implement the rust-encoding API in terms of encoding_rs.
|
||||
- [ ] Add SIMD acceleration for Aarch64.
|
||||
- [x] Add SIMD acceleration for Aarch64.
|
||||
- [ ] Investigate the use of NEON on 32-bit ARM.
|
||||
- [ ] Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
|
||||
adapted to Rust in rust-encoding.
|
||||
|
||||
## Release Notes
|
||||
|
||||
### 0.7.1
|
||||
|
||||
* When encoding from invalid UTF-16, correctly handle U+DC00 followed by
|
||||
another low surrogate.
|
||||
|
||||
### 0.7.0
|
||||
|
||||
* [Make `replacement` a label of the replacement
|
||||
|
17
third_party/rust/encoding_rs/src/ascii.rs
vendored
17
third_party/rust/encoding_rs/src/ascii.rs
vendored
@ -82,8 +82,8 @@ macro_rules! ascii_alu {
|
||||
// }
|
||||
// dst_until_alignment
|
||||
// }
|
||||
};
|
||||
if until_alignment + STRIDE_SIZE <= len {
|
||||
};
|
||||
if until_alignment + STRIDE_SIZE <= len {
|
||||
// Moving pointers to alignment seems to be a pessimization on
|
||||
// x86_64 for operations that have UTF-16 as the internal
|
||||
// Unicode representation. However, since it seems to be a win
|
||||
@ -355,7 +355,7 @@ macro_rules! ascii_to_basic_latin_simd_stride {
|
||||
if !is_ascii(simd) {
|
||||
return false;
|
||||
}
|
||||
let (first, second) = unpack(simd);
|
||||
let (first, second) = simd_unpack(simd);
|
||||
$store(dst, first);
|
||||
$store(dst.offset(8), second);
|
||||
true
|
||||
@ -371,12 +371,11 @@ macro_rules! basic_latin_to_ascii_simd_stride {
|
||||
pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
|
||||
let first = $load(src);
|
||||
let second = $load(src.offset(8));
|
||||
match pack_basic_latin(first, second) {
|
||||
Some(packed) => {
|
||||
$store(dst, packed);
|
||||
true
|
||||
},
|
||||
None => false,
|
||||
if is_basic_latin(first | second) {
|
||||
$store(dst, simd_pack(first, second));
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
});
|
||||
}
|
||||
|
14
third_party/rust/encoding_rs/src/big5.rs
vendored
14
third_party/rust/encoding_rs/src/big5.rs
vendored
@ -390,4 +390,18 @@ mod tests {
|
||||
assert_eq!(encoding, BIG5);
|
||||
assert_eq!(&cow[..], &expectation[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_big5_encode_from_two_low_surrogates() {
|
||||
let expectation = b"&#65533;&#65533;";
|
||||
let mut output = [0u8; 40];
|
||||
let mut encoder = BIG5.new_encoder();
|
||||
let (result, read, written, had_errors) =
|
||||
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
|
||||
assert_eq!(result, CoderResult::InputEmpty);
|
||||
assert_eq!(read, 2);
|
||||
assert_eq!(written, expectation.len());
|
||||
assert!(had_errors);
|
||||
assert_eq!(&output[..written], expectation);
|
||||
}
|
||||
}
|
||||
|
14
third_party/rust/encoding_rs/src/euc_kr.rs
vendored
14
third_party/rust/encoding_rs/src/euc_kr.rs
vendored
@ -379,4 +379,18 @@ mod tests {
|
||||
assert_eq!(encoding, EUC_KR);
|
||||
assert_eq!(&cow[..], &expectation[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_euc_kr_encode_from_two_low_surrogates() {
|
||||
let expectation = b"&#65533;&#65533;";
|
||||
let mut output = [0u8; 40];
|
||||
let mut encoder = EUC_KR.new_encoder();
|
||||
let (result, read, written, had_errors) =
|
||||
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
|
||||
assert_eq!(result, CoderResult::InputEmpty);
|
||||
assert_eq!(read, 2);
|
||||
assert_eq!(written, expectation.len());
|
||||
assert!(had_errors);
|
||||
assert_eq!(&output[..written], expectation);
|
||||
}
|
||||
}
|
||||
|
30
third_party/rust/encoding_rs/src/handles.rs
vendored
30
third_party/rust/encoding_rs/src/handles.rs
vendored
@ -328,7 +328,7 @@ impl<'a> Utf16Destination<'a> {
|
||||
-> CopyAsciiResult<(DecoderResult, usize, usize), (u8, Utf16BmpHandle<'b, 'a>)> {
|
||||
let non_ascii_ret = {
|
||||
let src_remaining = &source.slice[source.pos..];
|
||||
let mut dst_remaining = &mut self.slice[self.pos..];
|
||||
let dst_remaining = &mut self.slice[self.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(DecoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -364,7 +364,7 @@ impl<'a> Utf16Destination<'a> {
|
||||
let non_ascii_ret = {
|
||||
let dst_len = self.slice.len();
|
||||
let src_remaining = &source.slice[source.pos..];
|
||||
let mut dst_remaining = &mut self.slice[self.pos..];
|
||||
let dst_remaining = &mut self.slice[self.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(DecoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -401,7 +401,7 @@ impl<'a> Utf16Destination<'a> {
|
||||
#[inline(always)]
|
||||
pub fn copy_utf8_up_to_invalid_from(&mut self, source: &mut ByteSource) {
|
||||
let src_remaining = &source.slice[source.pos..];
|
||||
let mut dst_remaining = &mut self.slice[self.pos..];
|
||||
let dst_remaining = &mut self.slice[self.pos..];
|
||||
let (read, written) = convert_utf8_to_utf16_up_to_invalid(src_remaining, dst_remaining);
|
||||
source.pos += read;
|
||||
self.pos += written;
|
||||
@ -623,7 +623,7 @@ impl<'a> Utf8Destination<'a> {
|
||||
let non_ascii_ret = {
|
||||
let dst_len = self.slice.len();
|
||||
let src_remaining = &source.slice[source.pos..];
|
||||
let mut dst_remaining = &mut self.slice[self.pos..];
|
||||
let dst_remaining = &mut self.slice[self.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(DecoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -661,7 +661,7 @@ impl<'a> Utf8Destination<'a> {
|
||||
let non_ascii_ret = {
|
||||
let dst_len = self.slice.len();
|
||||
let src_remaining = &source.slice[source.pos..];
|
||||
let mut dst_remaining = &mut self.slice[self.pos..];
|
||||
let dst_remaining = &mut self.slice[self.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(DecoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -694,7 +694,7 @@ impl<'a> Utf8Destination<'a> {
|
||||
#[inline(always)]
|
||||
pub fn copy_utf8_up_to_invalid_from(&mut self, source: &mut ByteSource) {
|
||||
let src_remaining = &source.slice[source.pos..];
|
||||
let mut dst_remaining = &mut self.slice[self.pos..];
|
||||
let dst_remaining = &mut self.slice[self.pos..];
|
||||
let min_len = ::std::cmp::min(src_remaining.len(), dst_remaining.len());
|
||||
// Validate first, then memcpy to let memcpy do its thing even for
|
||||
// non-ASCII. (And potentially do something better than SSE2 for ASCII.)
|
||||
@ -746,7 +746,7 @@ impl<'a> Utf16Source<'a> {
|
||||
if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
|
||||
return unsafe { ::std::mem::transmute(unit) };
|
||||
}
|
||||
if unit_minus_surrogate_start <= (0xDFFF - 0xDBFF) {
|
||||
if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if self.pos < self.slice.len() {
|
||||
let second = self.slice[self.pos] as u32;
|
||||
@ -783,7 +783,7 @@ impl<'a> Utf16Source<'a> {
|
||||
if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
|
||||
return Unicode::NonAscii(NonAscii::BmpExclAscii(unit));
|
||||
}
|
||||
if unit_minus_surrogate_start <= (0xDFFF - 0xDBFF) {
|
||||
if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if self.pos < self.slice.len() {
|
||||
let second = self.slice[self.pos] as u32;
|
||||
@ -828,7 +828,7 @@ impl<'a> Utf16Source<'a> {
|
||||
let non_ascii_ret = {
|
||||
let dst_len = dest.slice.len();
|
||||
let src_remaining = &self.slice[self.pos..];
|
||||
let mut dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(EncoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -855,7 +855,7 @@ impl<'a> Utf16Source<'a> {
|
||||
let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
|
||||
if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
|
||||
NonAscii::BmpExclAscii(unit)
|
||||
} else if unit_minus_surrogate_start <= (0xDFFF - 0xDBFF) {
|
||||
} else if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if self.pos < self.slice.len() {
|
||||
let second = self.slice[self.pos] as u32;
|
||||
@ -902,7 +902,7 @@ impl<'a> Utf16Source<'a> {
|
||||
let non_ascii_ret = {
|
||||
let dst_len = dest.slice.len();
|
||||
let src_remaining = &self.slice[self.pos..];
|
||||
let mut dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(EncoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -929,7 +929,7 @@ impl<'a> Utf16Source<'a> {
|
||||
let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
|
||||
if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
|
||||
NonAscii::BmpExclAscii(unit)
|
||||
} else if unit_minus_surrogate_start <= (0xDFFF - 0xDBFF) {
|
||||
} else if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if self.pos == self.slice.len() {
|
||||
// Unpaired surrogate at the end of the buffer.
|
||||
@ -1123,7 +1123,7 @@ impl<'a> Utf8Source<'a> {
|
||||
-> CopyAsciiResult<(EncoderResult, usize, usize), (NonAscii, ByteOneHandle<'b, 'a>)> {
|
||||
let non_ascii_ret = {
|
||||
let src_remaining = &self.slice[self.pos..];
|
||||
let mut dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(EncoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -1175,7 +1175,7 @@ impl<'a> Utf8Source<'a> {
|
||||
let non_ascii_ret = {
|
||||
let dst_len = dest.slice.len();
|
||||
let src_remaining = &self.slice[self.pos..];
|
||||
let mut dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(EncoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
@ -1231,7 +1231,7 @@ impl<'a> Utf8Source<'a> {
|
||||
let non_ascii_ret = {
|
||||
let dst_len = dest.slice.len();
|
||||
let src_remaining = &self.slice[self.pos..];
|
||||
let mut dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let dst_remaining = &mut dest.slice[dest.pos..];
|
||||
let (pending, length) = if dst_remaining.len() < src_remaining.len() {
|
||||
(EncoderResult::OutputFull, dst_remaining.len())
|
||||
} else {
|
||||
|
14
third_party/rust/encoding_rs/src/iso_2022_jp.rs
vendored
14
third_party/rust/encoding_rs/src/iso_2022_jp.rs
vendored
@ -963,4 +963,18 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_iso_2022_jp_encode_from_two_low_surrogates() {
|
||||
let expectation = b"&#65533;&#65533;";
|
||||
let mut output = [0u8; 40];
|
||||
let mut encoder = ISO_2022_JP.new_encoder();
|
||||
let (result, read, written, had_errors) =
|
||||
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
|
||||
assert_eq!(result, CoderResult::InputEmpty);
|
||||
assert_eq!(read, 2);
|
||||
assert_eq!(written, expectation.len());
|
||||
assert!(had_errors);
|
||||
assert_eq!(&output[..written], expectation);
|
||||
}
|
||||
|
||||
}
|
||||
|
6
third_party/rust/encoding_rs/src/lib.rs
vendored
6
third_party/rust/encoding_rs/src/lib.rs
vendored
@ -8,7 +8,7 @@
|
||||
// except according to those terms.
|
||||
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self))]
|
||||
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.7.0")]
|
||||
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.7.1")]
|
||||
|
||||
//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
|
||||
//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
|
||||
@ -2455,7 +2455,7 @@ impl Encoding {
|
||||
.unwrap()
|
||||
);
|
||||
unsafe {
|
||||
let mut vec = string.as_mut_vec();
|
||||
let vec = string.as_mut_vec();
|
||||
vec.set_len(valid_up_to);
|
||||
std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
|
||||
}
|
||||
@ -2556,7 +2556,7 @@ impl Encoding {
|
||||
.unwrap()
|
||||
);
|
||||
unsafe {
|
||||
let mut vec = string.as_mut_vec();
|
||||
let vec = string.as_mut_vec();
|
||||
vec.set_len(valid_up_to);
|
||||
std::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
|
||||
}
|
||||
|
147
third_party/rust/encoding_rs/src/simd_funcs.rs
vendored
147
third_party/rust/encoding_rs/src/simd_funcs.rs
vendored
@ -68,18 +68,9 @@ cfg_if! {
|
||||
use simd::i16x8;
|
||||
use simd::i8x16;
|
||||
extern "platform-intrinsic" {
|
||||
fn x86_mm_packus_epi16(x: i16x8, y: i16x8) -> u8x16;
|
||||
fn x86_mm_movemask_epi8(x: i8x16) -> i32;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_ascii(s: u8x16) -> bool {
|
||||
unsafe {
|
||||
let signed: i8x16 = ::std::mem::transmute_copy(&s);
|
||||
x86_mm_movemask_epi8(signed) == 0
|
||||
}
|
||||
}
|
||||
|
||||
// Expose low-level mask instead of higher-level conclusion,
|
||||
// because the non-ASCII case would perform less well otherwise.
|
||||
#[inline(always)]
|
||||
@ -90,27 +81,23 @@ cfg_if! {
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(target_feature = "sse2")] {
|
||||
#[inline(always)]
|
||||
pub unsafe fn pack_basic_latin(a: u16x8, b: u16x8) -> Option<u8x16> {
|
||||
// If the 16-bit lane is out of range positive, the 8-bit lane becomes 0xFF
|
||||
// when packing, which would allow us to pack first and then check for
|
||||
// ASCII, but if the 16-bit lane is negative, the 8-bit lane becomes 0x00.
|
||||
// Sigh. Hence, check first.
|
||||
let highest_ascii = u16x8::splat(0x7F);
|
||||
let combined = a | b;
|
||||
if combined.gt(highest_ascii).any() {
|
||||
None
|
||||
} else {
|
||||
let first: i16x8 = ::std::mem::transmute_copy(&a);
|
||||
let second: i16x8 = ::std::mem::transmute_copy(&b);
|
||||
Some(x86_mm_packus_epi16(first, second))
|
||||
pub fn is_ascii(s: u8x16) -> bool {
|
||||
unsafe {
|
||||
let signed: i8x16 = ::std::mem::transmute_copy(&s);
|
||||
x86_mm_movemask_epi8(signed) == 0
|
||||
}
|
||||
}
|
||||
} else if #[cfg(target_arch = "aarch64")]{
|
||||
|
||||
extern "platform-intrinsic" {
|
||||
fn aarch64_vmaxvq_u8(x: u8x16) -> u8;
|
||||
fn aarch64_vmaxvq_u16(x: u16x8) -> u16;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@ -119,56 +106,38 @@ cfg_if! {
|
||||
aarch64_vmaxvq_u8(s) < 0x80
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn pack_basic_latin(a: u16x8, b: u16x8) -> Option<u8x16> {
|
||||
let combined = a | b;
|
||||
if aarch64_vmaxvq_u16(combined) < 0x80 {
|
||||
let first: u8x16 = ::std::mem::transmute_copy(&a);
|
||||
let second: u8x16 = ::std::mem::transmute_copy(&b);
|
||||
let lower: u8x16 = simd_shuffle16(
|
||||
first,
|
||||
second,
|
||||
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30],
|
||||
);
|
||||
Some(lower)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_ascii(s: u8x16) -> bool {
|
||||
let highest_ascii = u8x16::splat(0x7F);
|
||||
!s.gt(highest_ascii).any()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn pack_basic_latin(a: u16x8, b: u16x8) -> Option<u8x16> {
|
||||
let highest_ascii = u16x8::splat(0x7F);
|
||||
let combined = a | b;
|
||||
if combined.gt(highest_ascii).any() {
|
||||
None
|
||||
} else {
|
||||
let first: u8x16 = ::std::mem::transmute_copy(&a);
|
||||
let second: u8x16 = ::std::mem::transmute_copy(&b);
|
||||
let lower: u8x16 = simd_shuffle16(
|
||||
first,
|
||||
second,
|
||||
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30],
|
||||
);
|
||||
Some(lower)
|
||||
}
|
||||
cfg_if! {
|
||||
if #[cfg(target_arch = "aarch64")]{
|
||||
extern "platform-intrinsic" {
|
||||
fn aarch64_vmaxvq_u16(x: u16x8) -> u16;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_basic_latin(s: u16x8) -> bool {
|
||||
unsafe {
|
||||
aarch64_vmaxvq_u16(s) < 0x80
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#[inline(always)]
|
||||
pub fn is_basic_latin(s: u16x8) -> bool {
|
||||
let highest_ascii = u16x8::splat(0x7F);
|
||||
!s.gt(highest_ascii).any()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn unpack(s: u8x16) -> (u16x8, u16x8) {
|
||||
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
|
||||
unsafe {
|
||||
let first: u8x16 = simd_shuffle16(
|
||||
s,
|
||||
@ -184,6 +153,36 @@ pub fn unpack(s: u8x16) -> (u16x8, u16x8) {
|
||||
}
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(target_feature = "sse2")] {
|
||||
extern "platform-intrinsic" {
|
||||
fn x86_mm_packus_epi16(x: i16x8, y: i16x8) -> u8x16;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
|
||||
unsafe {
|
||||
let first: i16x8 = ::std::mem::transmute_copy(&a);
|
||||
let second: i16x8 = ::std::mem::transmute_copy(&b);
|
||||
x86_mm_packus_epi16(first, second)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#[inline(always)]
|
||||
pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
|
||||
unsafe {
|
||||
let first: u8x16 = ::std::mem::transmute_copy(&a);
|
||||
let second: u8x16 = ::std::mem::transmute_copy(&b);
|
||||
simd_shuffle16(
|
||||
first,
|
||||
second,
|
||||
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30],
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@ -197,7 +196,7 @@ mod tests {
|
||||
let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
|
||||
let mut vec = Vec::with_capacity(16);
|
||||
vec.resize(16, 0u16);
|
||||
let (first, second) = unpack(simd);
|
||||
let (first, second) = simd_unpack(simd);
|
||||
let ptr = vec.as_mut_ptr();
|
||||
unsafe {
|
||||
store8_unaligned(ptr, first);
|
||||
@ -207,7 +206,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_success() {
|
||||
fn test_is_basic_latin_success() {
|
||||
let ascii: [u8; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71,
|
||||
0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let basic_latin: [u16; 16] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70,
|
||||
@ -217,44 +216,38 @@ mod tests {
|
||||
let mut vec = Vec::with_capacity(16);
|
||||
vec.resize(16, 0u8);
|
||||
let ptr = vec.as_mut_ptr();
|
||||
assert!(is_basic_latin(first | second));
|
||||
unsafe {
|
||||
let packed = pack_basic_latin(first, second).unwrap();
|
||||
store16_unaligned(ptr, packed);
|
||||
store16_unaligned(ptr, simd_pack(first, second));
|
||||
}
|
||||
assert_eq!(&vec[..], &ascii[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_c0() {
|
||||
fn test_is_basic_latin_c0() {
|
||||
let input: [u16; 16] = [0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71,
|
||||
0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
unsafe {
|
||||
assert!(pack_basic_latin(first, second).is_none());
|
||||
}
|
||||
assert!(!is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_0fff() {
|
||||
fn test_is_basic_latin_0fff() {
|
||||
let input: [u16; 16] = [0x61, 0x62, 0x63, 0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70,
|
||||
0x71, 0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
unsafe {
|
||||
assert!(pack_basic_latin(first, second).is_none());
|
||||
}
|
||||
assert!(!is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pack_basic_latin_ffff() {
|
||||
fn test_is_basic_latin_ffff() {
|
||||
let input: [u16; 16] = [0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70,
|
||||
0x71, 0x72, 0x73, 0x74, 0x75, 0x76];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
unsafe {
|
||||
assert!(pack_basic_latin(first, second).is_none());
|
||||
}
|
||||
assert!(!is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
14
third_party/rust/encoding_rs/src/single_byte.rs
vendored
14
third_party/rust/encoding_rs/src/single_byte.rs
vendored
@ -556,6 +556,20 @@ mod tests {
|
||||
encode_from_utf16(encoding, data, &with_zeros[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_byte_from_two_low_surrogates() {
|
||||
let expectation = b"&#65533;&#65533;";
|
||||
let mut output = [0u8; 40];
|
||||
let mut encoder = WINDOWS_1253.new_encoder();
|
||||
let (result, read, written, had_errors) =
|
||||
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
|
||||
assert_eq!(result, CoderResult::InputEmpty);
|
||||
assert_eq!(read, 2);
|
||||
assert_eq!(written, expectation.len());
|
||||
assert!(had_errors);
|
||||
assert_eq!(&output[..written], expectation);
|
||||
}
|
||||
|
||||
// These tests are so self-referential that they are pretty useless.
|
||||
|
||||
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
|
||||
|
3
third_party/rust/encoding_rs/src/utf_8.rs
vendored
3
third_party/rust/encoding_rs/src/utf_8.rs
vendored
@ -645,7 +645,7 @@ impl Utf8Encoder {
|
||||
written += 1;
|
||||
break;
|
||||
}
|
||||
if unit_minus_surrogate_start <= (0xDFFF - 0xDBFF) {
|
||||
if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if read == src.len() {
|
||||
// Unpaired surrogate at the end of the buffer.
|
||||
@ -943,6 +943,7 @@ mod tests {
|
||||
encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes());
|
||||
encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes());
|
||||
encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes());
|
||||
encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -139,4 +139,17 @@ mod tests {
|
||||
encode_x_user_defined("\u{F77F}\u{F800}", b"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_x_user_defined_from_two_low_surrogates() {
|
||||
let expectation = b"&#65533;&#65533;";
|
||||
let mut output = [0u8; 40];
|
||||
let mut encoder = X_USER_DEFINED.new_encoder();
|
||||
let (result, read, written, had_errors) =
|
||||
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
|
||||
assert_eq!(result, CoderResult::InputEmpty);
|
||||
assert_eq!(read, 2);
|
||||
assert_eq!(written, expectation.len());
|
||||
assert!(had_errors);
|
||||
assert_eq!(&output[..written], expectation);
|
||||
}
|
||||
}
|
||||
|
8
toolkit/library/gtest/rust/Cargo.lock
generated
8
toolkit/library/gtest/rust/Cargo.lock
generated
@ -439,21 +439,21 @@ name = "encoding_c"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_glue"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"encoding_rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"nserror 0.1.0",
|
||||
"nsstring 0.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.7.0"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -1708,7 +1708,7 @@ dependencies = [
|
||||
"checksum dwrote 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "36e3b27cd0b8a68e00f07e8d8e1e4f4d8a6b8b873290a734f63bd56d792d23e1"
|
||||
"checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a"
|
||||
"checksum encoding_c 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "93ec52324ca72f423237a413ca0e1c60654c8b3d0934fcd5fd888508dfcc4ba7"
|
||||
"checksum encoding_rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6f0a39f0e2f497d3c2e6a5529a0ec4fc640084fa401493c640421673471f8b72"
|
||||
"checksum encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "f5215aabf22b83153be3ee44dfe3f940214541b2ce13d419c55e7a115c8c51a9"
|
||||
"checksum env_logger 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3ddf21e73e016298f5cb37d6ef8e8da8e39f91f9ec8b0df44b7deb16a9f8cd5b"
|
||||
"checksum error-chain 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d9435d864e017c3c6afeac1654189b06cdb491cf2ff73dbf0d73b0f292f42ff8"
|
||||
"checksum euclid 0.15.2 (registry+https://github.com/rust-lang/crates.io-index)" = "50c9e4c3b53de731815135191f0b77969bea953211b8bbd3cc3083a7b10e190e"
|
||||
|
8
toolkit/library/rust/Cargo.lock
generated
8
toolkit/library/rust/Cargo.lock
generated
@ -438,21 +438,21 @@ name = "encoding_c"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_glue"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"encoding_rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"nserror 0.1.0",
|
||||
"nsstring 0.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.7.0"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -1720,7 +1720,7 @@ dependencies = [
|
||||
"checksum dwrote 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "36e3b27cd0b8a68e00f07e8d8e1e4f4d8a6b8b873290a734f63bd56d792d23e1"
|
||||
"checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a"
|
||||
"checksum encoding_c 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "93ec52324ca72f423237a413ca0e1c60654c8b3d0934fcd5fd888508dfcc4ba7"
|
||||
"checksum encoding_rs 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6f0a39f0e2f497d3c2e6a5529a0ec4fc640084fa401493c640421673471f8b72"
|
||||
"checksum encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "f5215aabf22b83153be3ee44dfe3f940214541b2ce13d419c55e7a115c8c51a9"
|
||||
"checksum env_logger 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3ddf21e73e016298f5cb37d6ef8e8da8e39f91f9ec8b0df44b7deb16a9f8cd5b"
|
||||
"checksum error-chain 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d9435d864e017c3c6afeac1654189b06cdb491cf2ff73dbf0d73b0f292f42ff8"
|
||||
"checksum euclid 0.15.2 (registry+https://github.com/rust-lang/crates.io-index)" = "50c9e4c3b53de731815135191f0b77969bea953211b8bbd3cc3083a7b10e190e"
|
||||
|
Loading…
Reference in New Issue
Block a user