Bug 1507726 - Update encoding_rs to 0.8.12. r=m_kato
* Improves UTF-8 validation performance.
* Improves UTF-8 to UTF-16 decode performance.
* Improves non-Latin and Latin1-ish Latin single-byte encode performance.
* Improves code quality by addressing some clippy lints.

The optional legacy CJK encoder changes are not used by Firefox.

Differential Revision: https://phabricator.services.mozilla.com/D12514

--HG--
extra : moz-landing-system : lando
This commit is contained in:
parent 2e676cc49d
commit 60fabe50a9
10  Cargo.lock (generated)
@ -799,21 +799,21 @@ name = "encoding_c"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_glue"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"nserror 0.1.0",
|
||||
"nsstring 0.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.9"
|
||||
version = "0.8.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -1704,7 +1704,7 @@ name = "nsstring"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3201,7 +3201,7 @@ dependencies = [
|
||||
"checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a"
|
||||
"checksum ena 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "88dc8393b3c7352f94092497f6b52019643e493b6b890eb417cdb7c46117e621"
|
||||
"checksum encoding_c 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "769ecb8b33323998e482b218c0d13cd64c267609023b4b7ec3ee740714c318ee"
|
||||
"checksum encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f1a79fa56c329a5b087de13748054fb3b974c4a672c12c71f0b66e35c5addec5"
|
||||
"checksum encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "ca20350a7cb5aab5b9034731123d6d412caf3e92d4985e739e411ba0955fd0eb"
|
||||
"checksum env_logger 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0561146661ae44c579e993456bc76d11ce1e0c7d745e57b2fa7146b6e49fa2ad"
|
||||
"checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3"
|
||||
"checksum euclid 0.19.3 (registry+https://github.com/rust-lang/crates.io-index)" = "600657e7e5c03bfbccdc68721bc3b5abcb761553973387124eae9c9e4f02c210"
|
||||
|
File diff suppressed because one or more lines are too long
4  third_party/rust/encoding_rs/CONTRIBUTING.md (vendored)

@@ -28,7 +28,9 @@ taken as a waiver of copyright notice.
Please do not contribute implementations of encodings that are not specified
in the [Encoding Standard](https://encoding.spec.whatwg.org/).

For example, an implementation of UTF-7 would be explicitly not welcome.
For example, an implementation of UTF-7 is explicitly out of scope for this
crate and is, therefore, provided by the [charset](https://crates.io/crates/charset)
crate instead.

## Compatibility with Stable Rust

14  third_party/rust/encoding_rs/COPYRIGHT (vendored)

@@ -10,17 +10,3 @@ according to those terms.

Test code within encoding_rs is dedicated to the Public Domain when so
designated (see the individual files for PD/CC0-dedicated sections).

The file utf_8_core.rs was extracted from the Rust project at revision
7ad7232422f7e5bbfa0e52dabe36c12677df19e2, whose COPYRIGHT file said (in part):

The Rust Project is copyright 2010, The Rust Project
Developers.

Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.

10  third_party/rust/encoding_rs/Cargo.toml (vendored)

@@ -12,14 +12,14 @@

[package]
name = "encoding_rs"
version = "0.8.9"
version = "0.8.12"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "A Gecko-oriented implementation of the Encoding Standard"
homepage = "https://docs.rs/encoding_rs/"
documentation = "https://docs.rs/encoding_rs/"
readme = "README.md"
keywords = ["encoding", "web", "unicode", "charset"]
categories = ["text-processing", "encoding", "web-programming", "email"]
categories = ["text-processing", "encoding", "web-programming", "internationalization"]
license = "MIT/Apache-2.0"
repository = "https://github.com/hsivonen/encoding_rs"
[profile.release]
@@ -44,6 +44,12 @@ version = "1.0"
version = "1.0"

[features]
fast-big5-hanzi-encode = []
fast-gb-hanzi-encode = []
fast-hangul-encode = []
fast-hanja-encode = []
fast-kanji-encode = []
fast-legacy-encode = ["fast-hangul-encode", "fast-hanja-encode", "fast-kanji-encode", "fast-gb-hanzi-encode", "fast-big5-hanzi-encode"]
less-slow-big5-hanzi-encode = []
less-slow-gb-hanzi-encode = []
less-slow-kanji-encode = []

28  third_party/rust/encoding_rs/Ideas.md (vendored)

@@ -76,3 +76,31 @@ On [Cortex-A57](https://stackoverflow.com/questions/45714535/performance-of-unal

Currently, Aarch64 runs the generic ALU UTF-8 validation code that aligns
reads. That's probably unnecessary on Aarch64. (SIMD was slower than ALU!)

## Table-driven UTF-8 validation

When there are at least four bytes left, read all four. With each byte,
index into tables corresponding to magic values indexable by byte in
each position.

In the value read from the table indexed by the lead byte, encode the
following in 16 bits: the advance in 2 bits (2, 3 or 4 bytes), 9 positional
bits one of which is set to indicate the type of lead byte (8 valid
types in the 8 lowest bits, plus invalid; ASCII would be a tenth type),
and the mask for extracting the payload bits from the lead byte
(for conversion to UTF-16 or UTF-32).

In the tables indexable by the trail bytes, in each position
corresponding to a lead byte type, store 1 if the trail is
invalid given the lead and 0 if valid given the lead.

Use the low 8 bits of the 16 bits read from the first
table to mask (bitwise AND) one positional bit from each of the
three other values. Bitwise OR the results together with the
bit that is 1 if the lead is invalid. If the result is zero,
the sequence is valid. Otherwise it's invalid.

Use the advance to advance. In the conversion to UTF-16 or
UTF-32 case, use the mask for extracting the meaningful
bits from the lead byte to mask them from the lead. Shift
left by 6 as many times as the advance indicates, etc.
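The scheme sketched above is easiest to see with the lead-byte classes written out. The following is a purely illustrative scalar sketch added for this write-up (not code from the crate): it spells out the same lead-byte classes and second-byte bounds the note refers to, without the packed 16-bit magic values or the four parallel table reads.

```rust
/// For a lead byte: (sequence length, valid inclusive range for the second byte).
/// Length 0 marks an invalid lead (0xC0, 0xC1, 0xF5..0xFF) or a lone trail byte.
fn lead_info(b: u8) -> (usize, u8, u8) {
    match b {
        0x00..=0x7F => (1, 0, 0),
        0xC2..=0xDF => (2, 0x80, 0xBF),
        0xE0 => (3, 0xA0, 0xBF), // three-byte special lower bound
        0xE1..=0xEC | 0xEE..=0xEF => (3, 0x80, 0xBF),
        0xED => (3, 0x80, 0x9F), // three-byte special upper bound (excludes surrogates)
        0xF0 => (4, 0x90, 0xBF), // four-byte special lower bound
        0xF1..=0xF3 => (4, 0x80, 0xBF),
        0xF4 => (4, 0x80, 0x8F), // four-byte special upper bound (excludes > U+10FFFF)
        _ => (0, 0, 0),
    }
}

fn validate_utf8(bytes: &[u8]) -> bool {
    let mut i = 0;
    while i < bytes.len() {
        let (len, lo, hi) = lead_info(bytes[i]);
        if len == 0 || i + len > bytes.len() {
            return false;
        }
        // The second byte gets the lead-specific bounds.
        if len >= 2 && !(lo..=hi).contains(&bytes[i + 1]) {
            return false;
        }
        // Third and fourth bytes, when present, are ordinary trails.
        for j in 2..len {
            if !(0x80..=0xBF).contains(&bytes[i + j]) {
                return false;
            }
        }
        i += len;
    }
    true
}
```

The table-driven variant described in the note replaces the `match` with indexed loads and the per-byte branches with bit tests, which is what lets all four lookups happen before any branching.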
29  third_party/rust/encoding_rs/LICENSE-MIT (vendored)
@ -23,32 +23,3 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
The file utf_8_core.rs was extracted from the Rust project at revision
|
||||
7ad7232422f7e5bbfa0e52dabe36c12677df19e2, whose LICENSE-MIT file said:
|
||||
|
||||
Copyright (c) 2010 The Rust Project Developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
222  third_party/rust/encoding_rs/README.md (vendored)
@ -75,6 +75,12 @@ a `std::io::Read`, decode it into UTF-8 and presenting the result via
|
||||
`std::io::Read`. The [`encoding_rs_io`](https://crates.io/crates/encoding_rs_io)
|
||||
crate provides that capability.
|
||||
|
||||
## Decoding Email
|
||||
|
||||
For decoding character encodings that occur in email, use the
|
||||
[`charset`](https://crates.io/crates/charset) crate instead of using this
|
||||
one directly. (It wraps this crate and adds UTF-7 decoding.)
|
||||
|
||||
## Licensing
|
||||
|
||||
Please see the file named
|
||||
@ -105,7 +111,7 @@ These bindings do not cover the `mem` module.
|
||||
|
||||
## Optional features
|
||||
|
||||
There are currently three optional cargo features:
|
||||
There are currently these optional cargo features:
|
||||
|
||||
### `simd-accel`
|
||||
|
||||
@ -121,6 +127,8 @@ Enabling this feature breaks the build unless the target is x86 with SSE2
|
||||
use an x86 target without SSE2, i.e. `i586` in `rustup` terms), ARMv7 or
|
||||
thumbv7 with NEON (`-C target_feature=+neon`), x86_64 or Aarch64.
|
||||
|
||||
Used by Firefox.
|
||||
|
||||
### `serde`
|
||||
|
||||
Enables support for serializing and deserializing `&'static Encoding`-typed
|
||||
@ -128,27 +136,134 @@ struct fields using [Serde][1].
|
||||
|
||||
[1]: https://serde.rs/
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-legacy-encode`
|
||||
|
||||
A catch-all option for enabling the fastest legacy encode options. _Does not
|
||||
affect decode speed or UTF-8 encode speed._
|
||||
|
||||
At present, this option is equivalent to enabling the following options:
|
||||
* `fast-hangul-encode`
|
||||
* `fast-hanja-encode`
|
||||
* `fast-kanji-encode`
|
||||
* `fast-gb-hanzi-encode`
|
||||
* `fast-big5-hanzi-encode`
|
||||
|
||||
Adds 176 KB to the binary size.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-hangul-encode`

Changes encoding of precomposed Hangul syllables into EUC-KR from binary
search over the decode-optimized tables to lookup by index, making Korean
plain-text encode about 4 times as fast as without this option.

Adds 20 KB to the binary size.

Does _not_ affect decode speed.

Not used by Firefox.

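To make the "lookup by index" concrete, here is a minimal sketch in Rust; the table name and code-point bounds follow the generated `CP949_HANGUL_BYTES` table referenced later in this commit, but the zero-filled contents below are a placeholder for illustration, not the real generated data.

```rust
// Placeholder table: one EUC-KR byte pair per precomposed Hangul syllable,
// indexed by code point minus U+AC00. The real table is generated by
// generate-encoding-data.py behind the `fast-hangul-encode` feature.
static CP949_HANGUL_BYTES: [[u8; 2]; 0xD7A4 - 0xAC00] = [[0; 2]; 0xD7A4 - 0xAC00];

/// O(1) lookup per syllable instead of a binary search over the
/// decode-optimized index.
fn encode_hangul_syllable(c: char) -> Option<[u8; 2]> {
    let cp = c as usize;
    if cp >= 0xAC00 && cp < 0xD7A4 {
        Some(CP949_HANGUL_BYTES[cp - 0xAC00])
    } else {
        None
    }
}
```

The `less-slow-*` variants described below keep a sorted code-point table and binary-search it instead, trading some of this speed for a smaller binary.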
### `fast-hanja-encode`

Changes encoding of Hanja into EUC-KR from linear search over the
decode-optimized table to lookup by index. Since Hanja is practically absent
in modern Korean text, this option doesn't affect performance in the common
case and mainly makes sense if you want to make your application resilient
against denial of service by someone intentionally feeding it a lot of Hanja
to encode into EUC-KR.

Adds 40 KB to the binary size.

Does _not_ affect decode speed.

Not used by Firefox.

### `fast-kanji-encode`
|
||||
|
||||
Changes encoding of Kanji into Shift_JIS, EUC-JP and ISO-2022-JP from linear
|
||||
search over the decode-optimized tables to lookup by index making Japanese
|
||||
plain-text encode to legacy encodings 30 to 50 times as fast as without this
|
||||
option (about 2 times as fast as with `less-slow-kanji-encode`).
|
||||
|
||||
Takes precedence over `less-slow-kanji-encode`.
|
||||
|
||||
Adds 36 KB to the binary size (24 KB compared to `less-slow-kanji-encode`).
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `less-slow-kanji-encode`
|
||||
|
||||
Makes JIS X 0208 Level 1 Kanji (the most common Kanji in Shift_JIS, EUC-JP and
|
||||
ISO-2022-JP) encode less slow (binary search instead of linear search) at the
|
||||
expense of binary size. (Does _not_ affect decode speed.)
|
||||
ISO-2022-JP) encode less slow (binary search instead of linear search) making
|
||||
Japanese plain-text encode to legacy encodings 14 to 23 times as fast as
|
||||
without this option.
|
||||
|
||||
Adds 12 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-gb-hanzi-encode`
|
||||
|
||||
Changes encoding of Hanzi in the CJK Unified Ideographs block into GBK and
|
||||
gb18030 from linear search over a part the decode-optimized tables followed
|
||||
by a binary search over another part of the decode-optimized tables to lookup
|
||||
by index making Simplified Chinese plain-text encode to the legacy encodings
|
||||
100 to 110 times as fast as without this option (about 2.5 times as fast as
|
||||
with `less-slow-gb-hanzi-encode`).
|
||||
|
||||
Takes precedence over `less-slow-gb-hanzi-encode`.
|
||||
|
||||
Adds 36 KB to the binary size (24 KB compared to `less-slow-gb-hanzi-encode`).
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `less-slow-gb-hanzi-encode`
|
||||
|
||||
Makes GB2312 Level 1 Hanzi (the most common Hanzi in gb18030 and GBK) encode
|
||||
less slow (binary search instead of linear search) at the expense of binary
|
||||
size. (Does _not_ affect decode speed.)
|
||||
less slow (binary search instead of linear search) making Simplified Chinese
|
||||
plain-text encode to the legacy encodings about 40 times as fast as without
|
||||
this option.
|
||||
|
||||
Adds 12 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-big5-hanzi-encode`
|
||||
|
||||
Changes encoding of Hanzi in the CJK Unified Ideographs block into Big5 from
|
||||
linear search over a part of the decode-optimized tables to lookup by index
|
||||
making Traditional Chinese plain-text encode to Big5 105 to 125 times as fast
|
||||
as without this option (about 3 times as fast as with
|
||||
`less-slow-big5-hanzi-encode`).
|
||||
|
||||
Takes precedence over `less-slow-big5-hanzi-encode`.
|
||||
|
||||
Adds 40 KB to the binary size (20 KB compared to `less-slow-big5-hanzi-encode`).
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `less-slow-big5-hanzi-encode`
|
||||
|
||||
Makes Big5 Level 1 Hanzi (the most common Hanzi in Big5) encode less slow
|
||||
(binary search instead of linear search) at the expense of binary size. (Does
|
||||
_not_ affect decode speed.)
|
||||
(binary search instead of linear search) making Traditional Chinese
|
||||
plain-text encode to Big5 about 36 times as fast as without this option.
|
||||
|
||||
Adds 20 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
@ -156,29 +271,26 @@ Not used by Firefox.
|
||||
|
||||
For decoding to UTF-16, the goal is to perform at least as well as Gecko's old
|
||||
uconv. For decoding to UTF-8, the goal is to perform at least as well as
|
||||
rust-encoding.
|
||||
rust-encoding. These goals have been achieved.
|
||||
|
||||
Encoding to UTF-8 should be fast. (UTF-8 to UTF-8 encode should be equivalent
|
||||
to `memcpy` and UTF-16 to UTF-8 should be fast.)
|
||||
|
||||
Speed is a non-goal when encoding to legacy encodings. Encoding to legacy
|
||||
encodings should not be optimized for speed at the expense of code size as long
|
||||
as form submission and URL parsing in Gecko don't become noticeably too slow
|
||||
in real-world use.
|
||||
Speed is a non-goal when encoding to legacy encodings. By default, encoding to
|
||||
legacy encodings should not be optimized for speed at the expense of code size
|
||||
as long as form submission and URL parsing in Gecko don't become noticeably
|
||||
too slow in real-world use.
|
||||
|
||||
In the interest of binary size, by default, encoding_rs does not have any
|
||||
encode-specific data tables. Therefore, encoders search the decode-optimized
|
||||
data tables. This is a linear search in most cases. As a result, encode to
|
||||
legacy encodings varies from slow to extremely slow relative to other
|
||||
libraries. Still, with realistic work loads, this seemed fast enough
|
||||
not to be user-visibly slow on Raspberry Pi 3 (which stood in for a phone
|
||||
for testing) in the Web-exposed encoder use cases.
|
||||
In the interest of binary size, by default, encoding_rs does not have
|
||||
encode-specific data tables beyond 32 bits of encode-specific data for each
|
||||
single-byte encoding. Therefore, encoders search the decode-optimized data
|
||||
tables. This is a linear search in most cases. As a result, by default, encode
|
||||
to legacy encodings varies from slow to extremely slow relative to other
|
||||
libraries. Still, with realistic work loads, this seemed fast enough not to be
|
||||
user-visibly slow on Raspberry Pi 3 (which stood in for a phone for testing)
|
||||
in the Web-exposed encoder use cases.
|
||||
|
||||
See the cargo features above for optionally making Kanji and Hanzi legacy
|
||||
encode a bit less slow.
|
||||
|
||||
Actually fast options for legacy encode may be added in the future, but there
|
||||
do not appear to be pressing use cases.
|
||||
See the cargo features above for optionally making CJK legacy encode fast.
|
||||
|
||||
A framework for measuring performance is [available separately][2].
|
||||
|
||||
@ -187,15 +299,15 @@ A framework for measuring performance is [available separately][2].
|
||||
## Rust Version Compatibility
|
||||
|
||||
It is a goal to support the latest stable Rust, the latest nightly Rust and
|
||||
the version of Rust that's used for Firefox Nightly (currently 1.25.0).
|
||||
the version of Rust that's used for Firefox Nightly (currently 1.29.0).
|
||||
These are tested on Travis.
|
||||
|
||||
Additionally, beta and the oldest known to work Rust version (currently
|
||||
1.21.0) are tested on Travis. The oldest Rust known to work is tested as
|
||||
1.29.0) are tested on Travis. The oldest Rust known to work is tested as
|
||||
a canary so that when the oldest known to work no longer works, the change
|
||||
can be documented here. At this time, there is no firm commitment to support
|
||||
a version older than what's required by Firefox. The oldest supported Rust
|
||||
is expected to move forward rapidly when `stdsimd` can replace the `simd`
|
||||
is expected to move forward rapidly when `packed_simd` can replace the `simd`
|
||||
crate without performance regression.
|
||||
|
||||
## Compatibility with rust-encoding
|
||||
@ -207,6 +319,19 @@ encoding_rs is
|
||||
written with the assumption that Firefox would need it, but it is not currently
|
||||
used in Firefox.
|
||||
|
||||
## Regenerating Generated Code
|
||||
|
||||
To regenerate the generated code:
|
||||
|
||||
* Have Python 2 installed.
|
||||
* Clone [`https://github.com/hsivonen/encoding_c`](https://github.com/hsivonen/encoding_c)
|
||||
next to the `encoding_rs` directory.
|
||||
* Clone [`https://github.com/whatwg/encoding`](https://github.com/whatwg/encoding)
|
||||
next to the `encoding_rs` directory.
|
||||
* Checkout revision `f381389` of the `encoding` repo.
|
||||
* With the `encoding_rs` directory as the working directory, run
|
||||
`python generate-encoding-data.py`.
|
||||
|
||||
## Roadmap
|
||||
|
||||
- [x] Design the low-level API.
|
||||
@ -231,18 +356,53 @@ used in Firefox.
|
||||
- [ ] ~Parallelize UTF-8 validation using [Rayon](https://github.com/nikomatsakis/rayon).~
|
||||
(This turned out to be a pessimization in the ASCII case due to memory bandwidth reasons.)
|
||||
- [x] Provide an XPCOM/MFBT-flavored C++ API.
|
||||
- [ ] Investigate accelerating single-byte encode with a single fast-tracked
|
||||
- [x] Investigate accelerating single-byte encode with a single fast-tracked
|
||||
range per encoding.
|
||||
- [x] Replace uconv with encoding_rs in Gecko.
|
||||
- [x] Implement the rust-encoding API in terms of encoding_rs.
|
||||
- [x] Add SIMD acceleration for Aarch64.
|
||||
- [x] Investigate the use of NEON on 32-bit ARM.
|
||||
- [ ] Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
|
||||
adapted to Rust in rust-encoding.
|
||||
- [ ] Add actually fast CJK encode options.
|
||||
- [ ] ~Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
|
||||
adapted to Rust in rust-encoding.~
|
||||
- [x] Add actually fast CJK encode options.
|
||||
- [ ] Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).
|
||||
|
||||
## Release Notes
|
||||
|
||||
### 0.8.12
|
||||
|
||||
* Removed the `clippy::` prefix from clippy lint names.
|
||||
|
||||
### 0.8.11
|
||||
|
||||
* Changed minimum Rust requirement to 1.29.0 (for the ability to refer
|
||||
to the interior of a `static` when defining another `static`).
|
||||
* Explicitly aligned the lookup tables for single-byte encodings and
|
||||
UTF-8 to cache lines in the hope of freeing up one cache line for
|
||||
other data. (Perhaps the tables were already aligned and this is
|
||||
placebo.)
|
||||
* Added 32 bits of encode-oriented data for each single-byte encoding.
|
||||
The change was performance-neutral for non-Latin1-ish Latin legacy
|
||||
encodings, improved Latin1-ish and Arabic legacy encode speed
|
||||
somewhat (new speed is 2.4x the old speed for German, 2.3x for
|
||||
Arabic, 1.7x for Portuguese and 1.4x for French) and improved
|
||||
non-Latin1, non-Arabic legacy single-byte encode a lot (7.2x for
|
||||
Thai, 6x for Greek, 5x for Russian, 4x for Hebrew).
|
||||
* Added compile-time options for fast CJK legacy encode options (at
|
||||
the cost of binary size (up to 176 KB) and run-time memory usage).
|
||||
These options still retain the overall code structure instead of
|
||||
rewriting the CJK encoders totally, so the speed isn't as good as
|
||||
what could be achieved by using even more memory / making the
|
||||
binary even larger.
|
||||
* Made UTF-8 decode and validation faster.
|
||||
* Added method `is_single_byte()` on `Encoding`.
|
||||
* Added `mem::decode_latin1()` and `mem::encode_latin1_lossy()`.
|
||||
|
||||
### 0.8.10
|
||||
|
||||
* Disabled a unit test that tests a panic condition when the assertion
|
||||
being tested is disabled.
|
||||
|
||||
### 0.8.9
|
||||
|
||||
* Made `--features simd-accel` work with stable-channel compiler to
|
||||
|
@ -12,6 +12,15 @@
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import os.path
|
||||
|
||||
if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
|
||||
sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n");
|
||||
sys.exit(-1)
|
||||
|
||||
if not os.path.isfile("../encoding_c/src/lib.rs"):
|
||||
sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n");
|
||||
sys.exit(-1)
|
||||
|
||||
def cmp_from_end(one, other):
|
||||
c = cmp(len(one), len(other))
|
||||
@ -52,9 +61,12 @@ def static_u16_table(name, data):
|
||||
''')
|
||||
|
||||
def static_u16_table_from_indexable(name, data, item, feature):
|
||||
data_file.write('''#[cfg(feature = "%s")]
|
||||
data_file.write('''#[cfg(all(
|
||||
feature = "less-slow-%s",
|
||||
not(feature = "fast-%s")
|
||||
))]
|
||||
static %s: [u16; %d] = [
|
||||
''' % (feature, name, len(data)))
|
||||
''' % (feature, feature, name, len(data)))
|
||||
|
||||
for i in xrange(len(data)):
|
||||
data_file.write('0x%04X,\n' % data[i][item])
|
||||
@ -64,12 +76,30 @@ static %s: [u16; %d] = [
|
||||
''')
|
||||
|
||||
def static_u8_pair_table_from_indexable(name, data, item, feature):
|
||||
data_file.write('''#[cfg(all(
|
||||
feature = "less-slow-%s",
|
||||
not(feature = "fast-%s")
|
||||
))]
|
||||
static %s: [[u8; 2]; %d] = [
|
||||
''' % (feature, feature, name, len(data)))
|
||||
|
||||
for i in xrange(len(data)):
|
||||
data_file.write('[0x%02X, 0x%02X],\n' % data[i][item])
|
||||
|
||||
data_file.write('''];
|
||||
|
||||
''')
|
||||
|
||||
def static_u8_pair_table(name, data, feature):
|
||||
data_file.write('''#[cfg(feature = "%s")]
|
||||
static %s: [[u8; 2]; %d] = [
|
||||
''' % (feature, name, len(data)))
|
||||
|
||||
for i in xrange(len(data)):
|
||||
data_file.write('[0x%02X, 0x%02X],\n' % data[i][item])
|
||||
pair = data[i]
|
||||
if not pair:
|
||||
pair = (0, 0)
|
||||
data_file.write('[0x%02X, 0x%02X],\n' % pair)
|
||||
|
||||
data_file.write('''];
|
||||
|
||||
@ -167,6 +197,46 @@ encoding_by_alias_code_page = {
|
||||
51949: "EUC-KR",
|
||||
}
|
||||
|
||||
# The position in the index (0 is the first index entry,
|
||||
# i.e. byte value 0x80) that starts the longest run of
|
||||
# consecutive code points. Must not be in the first
|
||||
# quadrant. If the character to be encoded is not in this
|
||||
# run, the part of the index after the run is searched
|
||||
# forward. Then the part of the index from 32 to the start
|
||||
# of the run. The first quadrant is searched last.
|
||||
#
|
||||
# If there is no obviously most useful longest run,
|
||||
# the index here is just used to affect the search order.
|
||||
start_of_longest_run_in_single_byte = {
|
||||
"IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant
|
||||
"windows-874": 33,
|
||||
"windows-1250": 92,
|
||||
"windows-1251": 64,
|
||||
"windows-1252": 32,
|
||||
"windows-1253": 83,
|
||||
"windows-1254": 95,
|
||||
"windows-1255": 96,
|
||||
"windows-1256": 65,
|
||||
"windows-1257": 95, # not actually longest
|
||||
"windows-1258": 95, # not actually longest
|
||||
"macintosh": 106, # useless
|
||||
"x-mac-cyrillic": 96,
|
||||
"KOI8-R": 64, # not actually longest
|
||||
"KOI8-U": 64, # not actually longest
|
||||
"ISO-8859-2": 95, # not actually longest
|
||||
"ISO-8859-3": 95, # not actually longest
|
||||
"ISO-8859-4": 95, # not actually longest
|
||||
"ISO-8859-5": 46,
|
||||
"ISO-8859-6": 65,
|
||||
"ISO-8859-7": 83,
|
||||
"ISO-8859-8": 96,
|
||||
"ISO-8859-10": 90, # not actually longest
|
||||
"ISO-8859-13": 95, # not actually longest
|
||||
"ISO-8859-14": 95,
|
||||
"ISO-8859-15": 63,
|
||||
"ISO-8859-16": 95, # not actually longest
|
||||
}
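As an aside for readers of this table: the run information generated here feeds the single-byte encoder's fast path (see the `SingleByte(&data::SINGLE_BYTE_DATA..., run_bmp_offset, run_byte_offset, run_length)` variant later in this diff). Below is a simplified, illustrative Rust sketch of how such a run can be consumed on encode; the function name and the fallback search order are simplifications, not the crate's actual encoder.

```rust
/// Illustrative only: encode one BMP code point with a single-byte encoding
/// whose upper half is described by `table` (index position -> code point).
/// `run_bmp_offset` is the first code point of the longest consecutive run,
/// `run_byte_offset` its position in the index, and `run_length` its length.
fn encode_bmp(
    table: &[u16; 128],
    run_bmp_offset: u16,
    run_byte_offset: u8,
    run_length: u8,
    bmp: u16,
) -> Option<u8> {
    if bmp < 0x80 {
        return Some(bmp as u8); // ASCII encodes as itself
    }
    let offset = bmp.wrapping_sub(run_bmp_offset);
    if offset < u16::from(run_length) {
        // Fast path: the code point falls inside the consecutive run, so the
        // byte can be computed without searching.
        return Some((0x80 + usize::from(run_byte_offset) + usize::from(offset)) as u8);
    }
    // Slow path: search the index (the real encoder searches after the run
    // first, then from position 32 to the run, and the first quadrant last).
    table
        .iter()
        .position(|&code_point| code_point == bmp)
        .map(|i| (0x80 + i) as u8)
}
```

For windows-1251, for example, the run starting at position 64 covers the contiguous А–я range, so typical Russian text encodes almost entirely through the fast path.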
|
||||
|
||||
#
|
||||
|
||||
for group in data:
|
||||
@ -201,6 +271,25 @@ for label in labels:
|
||||
longest_label_length = len(label.label)
|
||||
longest_label = label.label
|
||||
|
||||
def longest_run_for_single_byte(name):
|
||||
if name == u"ISO-8859-8-I":
|
||||
name = u"ISO-8859-8"
|
||||
index = indexes[name.lower()]
|
||||
run_byte_offset = start_of_longest_run_in_single_byte[name]
|
||||
run_bmp_offset = index[run_byte_offset]
|
||||
previous_code_point = run_bmp_offset
|
||||
run_length = 1
|
||||
while True:
|
||||
i = run_byte_offset + run_length
|
||||
if i == len(index):
|
||||
break
|
||||
code_point = index[i]
|
||||
if previous_code_point + 1 != code_point:
|
||||
break
|
||||
previous_code_point = code_point
|
||||
run_length += 1
|
||||
return (run_bmp_offset, run_byte_offset, run_length)
|
||||
|
||||
def is_single_byte(name):
|
||||
for encoding in single_byte:
|
||||
if name == encoding["name"]:
|
||||
@ -217,11 +306,11 @@ def read_non_generated(path):
|
||||
|
||||
generated_begin_index = full.find(generated_begin)
|
||||
if generated_begin_index < 0:
|
||||
print "Can't find generated code start marker in %s. Exiting." % path
|
||||
sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path)
|
||||
sys.exit(-1)
|
||||
generated_end_index = full.find(generated_end)
|
||||
if generated_end_index < 0:
|
||||
print "Can't find generated code end marker in %s. Exiting." % path
|
||||
sys.stderr.write("Can't find generated code end marker in %s. Exiting.\n" % path)
|
||||
sys.exit(-1)
|
||||
|
||||
return (full[0:generated_begin_index + len(generated_begin)],
|
||||
@ -242,7 +331,8 @@ const LONGEST_LABEL_LENGTH: usize = %d; // %s
|
||||
for name in preferred:
|
||||
variant = None
|
||||
if is_single_byte(name):
|
||||
variant = "SingleByte(data::%s_DATA)" % to_constant_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name)
|
||||
(run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name)
|
||||
variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length)
|
||||
else:
|
||||
variant = to_camel_name(name)
|
||||
|
||||
@ -323,19 +413,15 @@ def null_to_zero(code_point):
|
||||
code_point = 0
|
||||
return code_point
|
||||
|
||||
data_file = open("src/data.rs", "w")
|
||||
data_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs")
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
data_file = open("src/data.rs", "w")
|
||||
data_file.write(data_rs_begin)
|
||||
data_file.write('''
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
#[repr(align(64))] // Align to cache lines
|
||||
pub struct SingleByteData {
|
||||
''')
|
||||
|
||||
# Single-byte
|
||||
@ -345,13 +431,29 @@ for encoding in single_byte:
|
||||
if name == u"ISO-8859-8-I":
|
||||
continue
|
||||
|
||||
data_file.write('''pub const %s_DATA: &'static [u16; 128] = &[
|
||||
''' % to_constant_name(name))
|
||||
data_file.write(''' pub %s: [u16; 128],
|
||||
''' % to_snake_name(name))
|
||||
|
||||
data_file.write('''}
|
||||
|
||||
pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData {
|
||||
''')
|
||||
|
||||
for encoding in single_byte:
|
||||
name = encoding["name"]
|
||||
if name == u"ISO-8859-8-I":
|
||||
continue
|
||||
|
||||
data_file.write(''' %s: [
|
||||
''' % to_snake_name(name))
|
||||
|
||||
for code_point in indexes[name.lower()]:
|
||||
data_file.write('0x%04X,\n' % null_to_zero(code_point))
|
||||
|
||||
data_file.write('''];
|
||||
data_file.write('''],
|
||||
''')
|
||||
|
||||
data_file.write('''};
|
||||
|
||||
''')
|
||||
|
||||
@ -374,7 +476,8 @@ for code_point in index[942:19782]:
|
||||
for j in xrange(32 - (len(astralness) % 32)):
|
||||
astralness.append(0)
|
||||
|
||||
data_file.write('''static BIG5_ASTRALNESS: [u32; %d] = [
|
||||
data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))]
|
||||
static BIG5_ASTRALNESS: [u32; %d] = [
|
||||
''' % (len(astralness) / 32))
|
||||
|
||||
i = 0
|
||||
@ -408,8 +511,23 @@ level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
|
||||
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
|
||||
level1_hanzi_pairs.sort(key=lambda x: x[0])
|
||||
|
||||
static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "less-slow-big5-hanzi-encode")
|
||||
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "less-slow-big5-hanzi-encode")
|
||||
static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode")
|
||||
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode")
|
||||
|
||||
# Fast Unified Ideograph encode
|
||||
big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00)
|
||||
for row in xrange(0x7E - 0x20):
|
||||
for column in xrange(157):
|
||||
pointer = 5024 + column + (row * 157)
|
||||
code_point = index[pointer]
|
||||
if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB:
|
||||
unified_offset = code_point - 0x4E00
|
||||
unified_lead = 0xA1 + row
|
||||
unified_trail = (0x40 if column < 0x3F else 0x62) + column
|
||||
if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]:
|
||||
big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail)
|
||||
|
||||
static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")
|
||||
|
||||
# JIS0208
|
||||
|
||||
@ -550,8 +668,23 @@ for i in xrange(len(level1_kanji_index)):
|
||||
level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
|
||||
level1_kanji_pairs.sort(key=lambda x: x[0])
|
||||
|
||||
static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "less-slow-kanji-encode")
|
||||
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "less-slow-kanji-encode")
|
||||
static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode")
|
||||
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode")
|
||||
|
||||
# Fast encoder table for Kanji
|
||||
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
|
||||
for pointer in xrange(len(index)):
|
||||
code_point = index[pointer]
|
||||
if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
|
||||
(lead, trail) = divmod(pointer, 188)
|
||||
lead += 0x81 if lead < 0x1F else 0xC1
|
||||
trail += 0x40 if trail < 0x3F else 0x41
|
||||
# unset the high bit of lead if IBM Kanji
|
||||
if pointer >= 8272:
|
||||
lead = lead & 0x7F
|
||||
kanji_bytes[code_point - 0x4E00] = (lead, trail)
|
||||
|
||||
static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode")
|
||||
|
||||
# ISO-2022-JP half-width katakana
|
||||
|
||||
@ -728,6 +861,28 @@ static_u16_table("KSX1001_OTHER_POINTERS", pointers)
|
||||
# is unmapped, so we don't want to look at it.
|
||||
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])
|
||||
|
||||
# Fast Hangul and Hanja encode
|
||||
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
|
||||
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
|
||||
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
|
||||
for row in xrange(0x7D):
|
||||
for column in xrange(190):
|
||||
pointer = column + (row * 190)
|
||||
code_point = index[pointer]
|
||||
if code_point:
|
||||
lead = 0x81 + row
|
||||
trail = 0x41 + column
|
||||
if code_point >= 0xAC00 and code_point < 0xD7A4:
|
||||
hangul_bytes[code_point - 0xAC00] = (lead, trail)
|
||||
elif code_point >= 0x4E00 and code_point < 0x9F9D:
|
||||
hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
|
||||
elif code_point >= 0xF900 and code_point < 0xFA0C:
|
||||
hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)
|
||||
|
||||
static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode")
|
||||
static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode")
|
||||
static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode")
|
||||
|
||||
# JIS 0212
|
||||
|
||||
index = indexes["jis0212"]
|
||||
@ -927,502 +1082,23 @@ for i in xrange(len(level1_hanzi_index)):
|
||||
level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
|
||||
level1_hanzi_pairs.sort(key=lambda x: x[0])
|
||||
|
||||
static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "less-slow-gb-hanzi-encode")
|
||||
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "less-slow-gb-hanzi-encode")
|
||||
static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
|
||||
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")
|
||||
|
||||
data_file.write('''#[inline(always)]
|
||||
fn map_with_ranges(haystack: &[u16], other: &[u16], needle: u16) -> u16 {
|
||||
debug_assert_eq!(haystack.len(), other.len());
|
||||
match haystack.binary_search(&needle) {
|
||||
Ok(i) => other[i],
|
||||
Err(i) => other[i - 1] + (needle - haystack[i - 1]),
|
||||
}
|
||||
}
|
||||
# Fast Hanzi encoder table
|
||||
hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
|
||||
for row in xrange(126):
|
||||
for column in xrange(190):
|
||||
pointer = column + (row * 190)
|
||||
code_point = index[pointer]
|
||||
if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6:
|
||||
hanzi_lead = 0x81 + row
|
||||
hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
|
||||
hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)
|
||||
|
||||
#[inline(always)]
|
||||
fn map_with_unsorted_ranges(haystack: &[u16], other: &[u16], needle: u16) -> Option<u16> {
|
||||
debug_assert_eq!(haystack.len() + 1, other.len());
|
||||
for i in 0..haystack.len() {
|
||||
let start = other[i];
|
||||
let end = other[i + 1];
|
||||
let length = end - start;
|
||||
let offset = needle.wrapping_sub(haystack[i]);
|
||||
if offset < length {
|
||||
return Some(start + offset);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")
|
||||
|
||||
#[inline(always)]
|
||||
pub fn position(haystack: &[u16], needle: u16) -> Option<usize> {
|
||||
haystack.iter().position(|&x| x == needle)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gb18030_range_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&GB18030_RANGE_POINTERS[..],
|
||||
&GB18030_RANGE_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gb18030_range_encode(bmp: u16) -> usize {
|
||||
if bmp == 0xE7C7 {
|
||||
return 7457;
|
||||
}
|
||||
map_with_ranges(&GB18030_RANGE_OFFSETS[..], &GB18030_RANGE_POINTERS[..], bmp) as usize
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gbk_top_ideograph_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&GBK_TOP_IDEOGRAPH_POINTERS[..],
|
||||
&GBK_TOP_IDEOGRAPH_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gbk_top_ideograph_encode(bmp: u16) -> u16 {
|
||||
map_with_ranges(&GBK_TOP_IDEOGRAPH_OFFSETS[..],
|
||||
&GBK_TOP_IDEOGRAPH_POINTERS[..],
|
||||
bmp)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gbk_left_ideograph_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&GBK_LEFT_IDEOGRAPH_POINTERS[..],
|
||||
&GBK_LEFT_IDEOGRAPH_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gbk_left_ideograph_encode(bmp: u16) -> u16 {
|
||||
map_with_ranges(&GBK_LEFT_IDEOGRAPH_OFFSETS[..],
|
||||
&GBK_LEFT_IDEOGRAPH_POINTERS[..],
|
||||
bmp)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn cp949_top_hangul_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&CP949_TOP_HANGUL_POINTERS[..],
|
||||
&CP949_TOP_HANGUL_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn cp949_top_hangul_encode(bmp: u16) -> u16 {
|
||||
map_with_ranges(&CP949_TOP_HANGUL_OFFSETS[..],
|
||||
&CP949_TOP_HANGUL_POINTERS[..],
|
||||
bmp)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn cp949_left_hangul_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&CP949_LEFT_HANGUL_POINTERS[..],
|
||||
&CP949_LEFT_HANGUL_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn cp949_left_hangul_encode(bmp: u16) -> u16 {
|
||||
map_with_ranges(&CP949_LEFT_HANGUL_OFFSETS[..],
|
||||
&CP949_LEFT_HANGUL_POINTERS[..],
|
||||
bmp)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gbk_other_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&GBK_OTHER_POINTERS[..GBK_OTHER_POINTERS.len() - 1],
|
||||
&GBK_OTHER_UNSORTED_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gbk_other_encode(bmp: u16) -> Option<u16> {
|
||||
map_with_unsorted_ranges(&GBK_OTHER_UNSORTED_OFFSETS[..],
|
||||
&GBK_OTHER_POINTERS[..],
|
||||
bmp)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gb2312_other_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&GB2312_OTHER_POINTERS[..GB2312_OTHER_POINTERS.len() - 1],
|
||||
&GB2312_OTHER_UNSORTED_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gb2312_other_encode(bmp: u16) -> Option<u16> {
|
||||
map_with_unsorted_ranges(&GB2312_OTHER_UNSORTED_OFFSETS[..],
|
||||
&GB2312_OTHER_POINTERS[..],
|
||||
bmp)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "less-slow-gb-hanzi-encode"))]
|
||||
#[inline(always)]
|
||||
pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
position(&GB2312_HANZI[..(94 * (0xD8 - 0xB0) - 5)], bmp).map(|hanzi_pointer| {
|
||||
let hanzi_lead = (hanzi_pointer / 94) + 0xB0;
|
||||
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
|
||||
(hanzi_lead as u8, hanzi_trail as u8)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "less-slow-gb-hanzi-encode")]
|
||||
#[inline(always)]
|
||||
pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
match GB2312_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) {
|
||||
Ok(i) => {
|
||||
let pair = &GB2312_LEVEL1_HANZI_BYTES[i];
|
||||
Some((pair[0], pair[1]))
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn gb2312_level2_hanzi_encode(bmp: u16) -> Option<usize> {
|
||||
// TODO: optimize
|
||||
position(&GB2312_HANZI[(94 * (0xD8 - 0xB0))..], bmp)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn ksx1001_other_decode(pointer: u16) -> u16 {
|
||||
map_with_ranges(&KSX1001_OTHER_POINTERS[..KSX1001_OTHER_POINTERS.len() - 1],
|
||||
&KSX1001_OTHER_UNSORTED_OFFSETS[..],
|
||||
pointer)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn ksx1001_other_encode(bmp: u16) -> Option<u16> {
|
||||
map_with_unsorted_ranges(&KSX1001_OTHER_UNSORTED_OFFSETS[..],
|
||||
&KSX1001_OTHER_POINTERS[..],
|
||||
bmp)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "less-slow-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
|
||||
let pointer = 1410 + kanji_pointer;
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F {
|
||||
0x81
|
||||
} else {
|
||||
0xC1
|
||||
};
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F {
|
||||
0x40
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
((lead + lead_offset) as u8, (trail + trail_offset) as u8)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "less-slow-kanji-encode")]
|
||||
#[inline(always)]
|
||||
pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
match JIS0208_LEVEL1_KANJI_CODE_POINTS.binary_search(&bmp) {
|
||||
Ok(i) => {
|
||||
let pair = &JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES[i];
|
||||
Some((pair[0], pair[1]))
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "less-slow-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
|
||||
let lead = (kanji_pointer / 94) + 0xB0;
|
||||
let trail = (kanji_pointer % 94) + 0xA1;
|
||||
(lead as u8, trail as u8)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "less-slow-kanji-encode")]
|
||||
#[inline(always)]
|
||||
pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| {
|
||||
let mut lead = shift_jis_lead as usize;
|
||||
if shift_jis_lead >= 0xA0 {
|
||||
lead -= 0xC1 - 0x81;
|
||||
}
|
||||
// The next line would overflow u8. Letting it go over allows us to
|
||||
// subtract fewer times.
|
||||
lead <<= 1;
|
||||
// Bring it back to u8 range
|
||||
lead -= 0x61;
|
||||
let trail = if shift_jis_trail >= 0x9F {
|
||||
lead += 1;
|
||||
shift_jis_trail + (0xA1 - 0x9F)
|
||||
} else if shift_jis_trail < 0x7F {
|
||||
shift_jis_trail + (0xA1 - 0x40)
|
||||
} else {
|
||||
shift_jis_trail + (0xA1 - 0x41)
|
||||
};
|
||||
(lead as u8, trail)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "less-slow-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
|
||||
let lead = (kanji_pointer / 94) + (0xB0 - 0x80);
|
||||
let trail = (kanji_pointer % 94) + 0x21;
|
||||
(lead as u8, trail as u8)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(feature = "less-slow-kanji-encode")]
|
||||
#[inline(always)]
|
||||
pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| {
|
||||
let mut lead = shift_jis_lead as usize;
|
||||
if shift_jis_lead >= 0xA0 {
|
||||
lead -= 0xC1 - 0x81;
|
||||
}
|
||||
// The next line would overflow u8. Letting it go over allows us to
|
||||
// subtract fewer times.
|
||||
lead <<= 1;
|
||||
// Bring it back to u8 range
|
||||
lead -= 0xE1;
|
||||
let trail = if shift_jis_trail >= 0x9F {
|
||||
lead += 1;
|
||||
shift_jis_trail - (0x9F - 0x21)
|
||||
} else if shift_jis_trail < 0x7F {
|
||||
shift_jis_trail - (0x40 - 0x21)
|
||||
} else {
|
||||
shift_jis_trail - (0x41 - 0x21)
|
||||
};
|
||||
(lead as u8, trail)
|
||||
})
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn jis0208_level2_and_additional_kanji_encode(bmp: u16) -> Option<usize> {
|
||||
// TODO: optimize
|
||||
position(&JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[..], bmp)
|
||||
}
|
||||
|
||||
pub fn jis0208_symbol_decode(pointer: usize) -> Option<u16> {
|
||||
let mut i = 0;
|
||||
while i < JIS0208_SYMBOL_TRIPLES.len() {
|
||||
let start = JIS0208_SYMBOL_TRIPLES[i] as usize;
|
||||
let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize;
|
||||
let pointer_minus_start = pointer.wrapping_sub(start);
|
||||
if pointer_minus_start < length {
|
||||
let offset = JIS0208_SYMBOL_TRIPLES[i + 2] as usize;
|
||||
return Some(JIS0208_SYMBOLS[pointer_minus_start + offset]);
|
||||
}
|
||||
i += 3;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Prefers Shift_JIS pointers for the three symbols that are in both ranges.
|
||||
#[inline(always)]
|
||||
pub fn jis0208_symbol_encode(bmp: u16) -> Option<usize> {
|
||||
let mut i = 0;
|
||||
while i < JIS0208_SYMBOL_TRIPLES.len() {
|
||||
let pointer_start = JIS0208_SYMBOL_TRIPLES[i] as usize;
|
||||
let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize;
|
||||
let symbol_start = JIS0208_SYMBOL_TRIPLES[i + 2] as usize;
|
||||
let symbol_end = symbol_start + length;
|
||||
let mut symbol_pos = symbol_start;
|
||||
while symbol_pos < symbol_end {
|
||||
if JIS0208_SYMBOLS[symbol_pos] == bmp {
|
||||
return Some(symbol_pos - symbol_start + pointer_start);
|
||||
}
|
||||
symbol_pos += 1;
|
||||
}
|
||||
i += 3;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn ibm_symbol_encode(bmp: u16) -> Option<usize> {
|
||||
position(&JIS0208_SYMBOLS[IBM_SYMBOL_START..IBM_SYMBOL_END], bmp)
|
||||
.map(|x| x + IBM_SYMBOL_POINTER_START)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn jis0208_range_decode(pointer: usize) -> Option<u16> {
|
||||
let mut i = 0;
|
||||
while i < JIS0208_RANGE_TRIPLES.len() {
|
||||
let start = JIS0208_RANGE_TRIPLES[i] as usize;
|
||||
let length = JIS0208_RANGE_TRIPLES[i + 1] as usize;
|
||||
let pointer_minus_start = pointer.wrapping_sub(start);
|
||||
if pointer_minus_start < length {
|
||||
let offset = JIS0208_RANGE_TRIPLES[i + 2] as usize;
|
||||
return Some((pointer_minus_start + offset) as u16);
|
||||
}
|
||||
i += 3;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn jis0208_range_encode(bmp: u16) -> Option<usize> {
|
||||
let mut i = 0;
|
||||
while i < JIS0208_RANGE_TRIPLES.len() {
|
||||
let start = JIS0208_RANGE_TRIPLES[i + 2] as usize;
|
||||
let length = JIS0208_RANGE_TRIPLES[i + 1] as usize;
|
||||
let bmp_minus_start = (bmp as usize).wrapping_sub(start);
|
||||
if bmp_minus_start < length {
|
||||
let offset = JIS0208_RANGE_TRIPLES[i] as usize;
|
||||
return Some(bmp_minus_start + offset);
|
||||
}
|
||||
i += 3;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn jis0212_accented_decode(pointer: usize) -> Option<u16> {
|
||||
let mut i = 0;
|
||||
while i < JIS0212_ACCENTED_TRIPLES.len() {
|
||||
let start = JIS0212_ACCENTED_TRIPLES[i] as usize;
|
||||
let length = JIS0212_ACCENTED_TRIPLES[i + 1] as usize;
|
||||
let pointer_minus_start = pointer.wrapping_sub(start);
|
||||
if pointer_minus_start < length {
|
||||
let offset = JIS0212_ACCENTED_TRIPLES[i + 2] as usize;
|
||||
let candidate = JIS0212_ACCENTED[pointer_minus_start + offset];
|
||||
if candidate == 0 {
|
||||
return None;
|
||||
}
|
||||
return Some(candidate);
|
||||
}
|
||||
i += 3;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn big5_is_astral(rebased_pointer: usize) -> bool {
|
||||
(BIG5_ASTRALNESS[rebased_pointer >> 5] & (1 << (rebased_pointer & 0x1F))) != 0
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn big5_low_bits(rebased_pointer: usize) -> u16 {
|
||||
if rebased_pointer < BIG5_LOW_BITS.len() {
|
||||
BIG5_LOW_BITS[rebased_pointer]
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn big5_astral_encode(low_bits: u16) -> Option<usize> {
|
||||
match low_bits {
|
||||
0x00CC => Some(11205 - 942),
|
||||
0x008A => Some(11207 - 942),
|
||||
0x7607 => Some(11213 - 942),
|
||||
_ => {
|
||||
let mut i = 18997 - 942;
|
||||
while i < BIG5_LOW_BITS.len() - 1 {
|
||||
if BIG5_LOW_BITS[i] == low_bits && big5_is_astral(i) {
|
||||
return Some(i);
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "less-slow-big5-hanzi-encode"))]
|
||||
#[inline(always)]
|
||||
pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) {
|
||||
if let Some(hanzi_pointer) = position(&BIG5_LOW_BITS[(5495 - 942)..(10951 - 942)], bmp) {
|
||||
let lead = hanzi_pointer / 157 + 0xA4;
|
||||
let remainder = hanzi_pointer % 157;
|
||||
let trail = if remainder < 0x3F {
|
||||
remainder + 0x40
|
||||
} else {
|
||||
remainder + 0x62
|
||||
};
|
||||
return Some((lead as u8, trail as u8));
|
||||
}
|
||||
match bmp {
|
||||
0x4E5A => {
|
||||
return Some((0xC8, 0x7B));
|
||||
}
|
||||
0x5202 => {
|
||||
return Some((0xC8, 0x7D));
|
||||
}
|
||||
0x9FB0 => {
|
||||
return Some((0xC8, 0xA1));
|
||||
}
|
||||
0x5188 => {
|
||||
return Some((0xC8, 0xA2));
|
||||
}
|
||||
0x9FB1 => {
|
||||
return Some((0xC8, 0xA3));
|
||||
}
|
||||
_ => {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(feature = "less-slow-big5-hanzi-encode")]
|
||||
#[inline(always)]
|
||||
pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
|
||||
if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) {
|
||||
match BIG5_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) {
|
||||
Ok(i) => {
|
||||
let pair = &BIG5_LEVEL1_HANZI_BYTES[i];
|
||||
Some((pair[0], pair[1]))
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn big5_box_encode(bmp: u16) -> Option<usize> {
|
||||
position(&BIG5_LOW_BITS[(18963 - 942)..(18992 - 942)], bmp).map(|x| x + 18963)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn big5_other_encode(bmp: u16) -> Option<usize> {
|
||||
if 0x4491 == bmp {
|
||||
return Some(11209);
|
||||
}
|
||||
if let Some(pos) = position(&BIG5_LOW_BITS[(5024 - 942)..(5466 - 942)], bmp) {
|
||||
return Some(pos + 5024);
|
||||
}
|
||||
if let Some(pos) = position(&BIG5_LOW_BITS[(10896 - 942)..(11205 - 942)], bmp) {
|
||||
return Some(pos + 10896);
|
||||
}
|
||||
if let Some(pos) = position(&BIG5_LOW_BITS[(11254 - 942)..(18963 - 942)], bmp) {
|
||||
return Some(pos + 11254);
|
||||
}
|
||||
let mut i = 18996 - 942;
|
||||
while i < BIG5_LOW_BITS.len() {
|
||||
if BIG5_LOW_BITS[i] == bmp && !big5_is_astral(i) {
|
||||
return Some(i + 942);
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn mul_94(lead: u8) -> usize {
|
||||
lead as usize * 94
|
||||
}
|
||||
''')
|
||||
data_file.write(data_rs_end)
|
||||
|
||||
data_file.close()
|
||||
|
||||
@ -1568,7 +1244,7 @@ write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
|
||||
variant_file.write('''}
|
||||
|
||||
pub enum VariantEncoding {
|
||||
SingleByte(&'static [u16; 128]),''')
|
||||
SingleByte(&'static [u16; 128], u16, u8, u8),''')
|
||||
|
||||
for encoding in multi_byte:
|
||||
variant_file.write("%s,\n" % to_camel_name(encoding["name"]))
|
||||
@ -1578,7 +1254,7 @@ variant_file.write('''}
|
||||
impl VariantEncoding {
|
||||
pub fn new_variant_decoder(&self) -> VariantDecoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table) => SingleByteDecoder::new(table),
|
||||
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
|
||||
VariantEncoding::Utf8 => Utf8Decoder::new(),
|
||||
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
|
||||
VariantEncoding::Big5 => Big5Decoder::new(),
|
||||
@ -1595,7 +1271,7 @@ impl VariantEncoding {
|
||||
|
||||
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table) => SingleByteEncoder::new(encoding, table),
|
||||
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
|
||||
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
|
||||
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
|
||||
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
|
||||
@ -1609,6 +1285,13 @@ impl VariantEncoding {
|
||||
VariantEncoding::Utf16Le => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_single_byte(&self) -> bool {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
''')
|
||||
|
||||
@ -1653,7 +1336,7 @@ for name in preferred:
        continue;
    if is_single_byte(name):
        single_byte_file.write("""
    decode_single_byte(%s, %s_DATA);""" % (to_constant_name(name), to_constant_name(name)))
    decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))

single_byte_file.write("""
}
@ -1666,7 +1349,7 @@ for name in preferred:
        continue;
    if is_single_byte(name):
        single_byte_file.write("""
    encode_single_byte(%s, %s_DATA);""" % (to_constant_name(name), to_constant_name(name)))
    encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))


single_byte_file.write("""
@ -1748,25 +1431,48 @@ utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// Bit is 1 if the trail is invalid.
pub static UTF8_TRAIL_INVALID: [u8; 256] = [""")
pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
""")

for i in range(256):
    combined = 0
    combined = (1 << 2) # invalid lead
    if i < 0x80 or i > 0xBF:
        combined |= (1 << 3)
        combined |= (1 << 3) # normal trail
    if i < 0xA0 or i > 0xBF:
        combined |= (1 << 4)
        combined |= (1 << 4) # three-byte special lower bound
    if i < 0x80 or i > 0x9F:
        combined |= (1 << 5)
        combined |= (1 << 5) # three-byte special upper bound
    if i < 0x90 or i > 0xBF:
        combined |= (1 << 6)
        combined |= (1 << 6) # four-byte special lower bound
    if i < 0x80 or i > 0x8F:
        combined |= (1 << 7)
        combined |= (1 << 7) # four-byte special upper bound
    utf_8_file.write("%d," % combined)

for i in range(128, 256):
    lane = (1 << 2) # invalid lead
    if i >= 0xC2 and i <= 0xDF:
        lane = (1 << 3) # normal trail
    elif i == 0xE0:
        lane = (1 << 4) # three-byte special lower bound
    elif i >= 0xE1 and i <= 0xEC:
        lane = (1 << 3) # normal trail
    elif i == 0xED:
        lane = (1 << 5) # three-byte special upper bound
    elif i >= 0xEE and i <= 0xEF:
        lane = (1 << 3) # normal trail
    elif i == 0xF0:
        lane = (1 << 6) # four-byte special lower bound
    elif i >= 0xF1 and i <= 0xF3:
        lane = (1 << 3) # normal trail
    elif i == 0xF4:
        lane = (1 << 7) # four-byte special upper bound
    utf_8_file.write("%d," % lane)

utf_8_file.write("""
];
    ],
};

""")

utf_8_file.write(utf_8_rs_end)
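Note: the regenerated table gives every byte 0x80-0xFF a one-hot lane (invalid lead, normal trail, or one of the E0/ED/F0/F4 special bounds), while the per-byte masks built in the first loop record which trail classes a byte fails. Validating the first trail byte after a lead then reduces to one AND. A hedged sketch of that check, assuming the bit assignments in the generator above (the helper and array names are illustrative):

    // Illustrative: lead/trail check against the generated bit layout.
    // trail_invalid[b] has a bit set for every trail class byte b fails;
    // lead_lane[b - 0x80] holds the single class bit the lead requires.
    fn first_trail_ok(trail_invalid: &[u8; 256], lead_lane: &[u8; 128], lead: u8, trail: u8) -> bool {
        debug_assert!(lead >= 0x80);
        // Invalid leads carry bit 2, which every trail_invalid entry also
        // has set, so they reject any trail byte.
        (trail_invalid[usize::from(trail)] & lead_lane[usize::from(lead) - 0x80]) == 0
    }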
1009  third_party/rust/encoding_rs/src/ascii.rs  vendored
File diff suppressed because it is too large
2  third_party/rust/encoding_rs/src/big5.rs  vendored
@ -141,7 +141,7 @@ impl Big5Decoder {
            }
        }
    } else if big5_is_astral(rebased_pointer) {
        handle.write_astral(low_bits as u32 |
        handle.write_astral(u32::from(low_bits) |
                            0x20000u32)
    } else {
        handle.write_bmp_excl_ascii(low_bits)
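Note: OR-ing in 0x20000 works because every astral mapping in the Big5 index lands in Plane 2, so the data table only stores the low 16 bits plus the big5_is_astral flag. A tiny illustration of the bit math (the sample value is arbitrary, not a claim about a particular table entry):

    // Illustrative only: rebuild a Plane 2 scalar from the stored low bits.
    fn big5_astral_code_point(low_bits: u16) -> u32 {
        u32::from(low_bits) | 0x2_0000
    }
    // big5_astral_code_point(0x00A7) == 0x200A7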
96218  third_party/rust/encoding_rs/src/data.rs  vendored
File diff suppressed because it is too large
50  third_party/rust/encoding_rs/src/euc_jp.rs  vendored
@ -77,10 +77,10 @@ impl EucJpDecoder {
|
||||
// and Katakana (10% acconding to Lunde).
|
||||
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + trail_minus_offset as u16)
|
||||
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
|
||||
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16)
|
||||
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
|
||||
} else if trail_minus_offset > (0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (
|
||||
@ -95,7 +95,7 @@ impl EucJpDecoder {
|
||||
handle.written(),
|
||||
);
|
||||
} else {
|
||||
let pointer = mul_94(jis0208_lead_minus_offset) + trail_minus_offset as usize;
|
||||
let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
|
||||
@ -160,7 +160,7 @@ impl EucJpDecoder {
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let pointer = mul_94(jis0212_lead_minus_offset) + trail_minus_offset as usize;
|
||||
let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
|
||||
let pointer_minus_kanji = pointer.wrapping_sub(1410);
|
||||
if pointer_minus_kanji < JIS0212_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
|
||||
@ -202,7 +202,7 @@ impl EucJpDecoder {
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + trail_minus_offset as u16)
|
||||
handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
@ -217,6 +217,33 @@ impl EucJpDecoder {
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "fast-kanji-encode")]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
jis0208_kanji_euc_jp_encode(bmp)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
Some((0xA1, 0xB8))
|
||||
} else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
|
||||
Some((lead, trail))
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + 0xD0;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EucJpEncoder;
|
||||
|
||||
impl EucJpEncoder {
|
||||
@ -245,19 +272,8 @@ impl EucJpEncoder {
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
handle.write_two(0xA1, 0xB8)
|
||||
} else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
|
||||
if let Some((lead, trail)) = encode_kanji(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + 0xD0;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
|
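Note: the optional fast-kanji-encode path above only changes how the (lead, trail) pair is looked up; the bytes produced are identical either way. A small usage sketch against the public API (the expected EUC-JP bytes for U+6F22 are shown for illustration):

    // Illustrative use of the high-level API; output does not depend on
    // whether the optional fast-kanji-encode feature is enabled.
    use encoding_rs::EUC_JP;

    fn main() {
        let (bytes, _encoding_used, had_unmappable) = EUC_JP.encode("漢");
        assert!(!had_unmappable);
        // U+6F22 is JIS X 0208 row 20, cell 33, i.e. EUC-JP 0xB4 0xC1.
        println!("{:X?}", &bytes[..]);
    }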
102  third_party/rust/encoding_rs/src/euc_kr.rs  vendored
@ -221,6 +221,69 @@ fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-hangul-encode"))]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
|
||||
match KSX1001_HANGUL.binary_search(&bmp) {
|
||||
Ok(ksx_hangul_pointer) => {
|
||||
let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
|
||||
let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
|
||||
(ksx_hangul_lead as u8, ksx_hangul_trail as u8)
|
||||
}
|
||||
Err(_) => {
|
||||
let (lead, cp949_trail) = if bmp < 0xC8A5 {
|
||||
// Above KS X 1001
|
||||
let top_pointer = cp949_top_hangul_encode(bmp) as usize;
|
||||
let top_lead = (top_pointer / (190 - 12)) + 0x81;
|
||||
let top_trail = top_pointer % (190 - 12);
|
||||
(top_lead as u8, top_trail as u8)
|
||||
} else {
|
||||
// To the left of KS X 1001
|
||||
let left_pointer = cp949_left_hangul_encode(bmp) as usize;
|
||||
let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
|
||||
let left_trail = left_pointer % (190 - 94 - 12);
|
||||
(left_lead as u8, left_trail as u8)
|
||||
};
|
||||
let offset = if cp949_trail >= (0x40 - 12) {
|
||||
0x41 + 12
|
||||
} else if cp949_trail >= (0x20 - 6) {
|
||||
0x41 + 6
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
(lead as u8, (cp949_trail + offset) as u8)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "fast-hangul-encode")]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hangul(_: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
|
||||
cp949_hangul_encode(bmp_minus_hangul_start)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-hanja-encode"))]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
|
||||
if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
|
||||
let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
|
||||
let hanja_trail = (hanja_pointer % 94) + 0xA1;
|
||||
Some((hanja_lead as u8, hanja_trail as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "fast-hanja-encode")]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
|
||||
if bmp < 0xF900 {
|
||||
ksx1001_unified_hangul_encode(bmp)
|
||||
} else {
|
||||
Some(ksx1001_compatibility_hangul_encode(bmp))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EucKrEncoder;
|
||||
|
||||
impl EucKrEncoder {
|
||||
@ -247,36 +310,7 @@ impl EucKrEncoder {
|
||||
let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
|
||||
let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
|
||||
// Hangul
|
||||
match KSX1001_HANGUL.binary_search(&bmp) {
|
||||
Ok(ksx_hangul_pointer) => {
|
||||
let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
|
||||
let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
|
||||
(ksx_hangul_lead, ksx_hangul_trail)
|
||||
}
|
||||
Err(_) => {
|
||||
let (lead, cp949_trail) = if bmp < 0xC8A5 {
|
||||
// Above KS X 1001
|
||||
let top_pointer = cp949_top_hangul_encode(bmp) as usize;
|
||||
let top_lead = (top_pointer / (190 - 12)) + 0x81;
|
||||
let top_trail = top_pointer % (190 - 12);
|
||||
(top_lead, top_trail)
|
||||
} else {
|
||||
// To the left of KS X 1001
|
||||
let left_pointer = cp949_left_hangul_encode(bmp) as usize;
|
||||
let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
|
||||
let left_trail = left_pointer % (190 - 94 - 12);
|
||||
(left_lead, left_trail)
|
||||
};
|
||||
let offset = if cp949_trail >= (0x40 - 12) {
|
||||
0x41 + 12
|
||||
} else if cp949_trail >= (0x20 - 6) {
|
||||
0x41 + 6
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
(lead, cp949_trail + offset)
|
||||
}
|
||||
}
|
||||
ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
|
||||
} else if in_range16(bmp, 0x33DE, 0xFF01) {
|
||||
// Vast range that includes no other
|
||||
// mappables except Hangul (already
|
||||
@ -284,9 +318,7 @@ impl EucKrEncoder {
|
||||
// Narrow the range further to Unified and
|
||||
// Compatibility ranges of Hanja.
|
||||
if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
|
||||
if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
|
||||
let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
|
||||
let hanja_trail = (hanja_pointer % 94) + 0xA1;
|
||||
if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
|
||||
(hanja_lead, hanja_trail)
|
||||
} else {
|
||||
return (
|
||||
@ -303,7 +335,7 @@ impl EucKrEncoder {
|
||||
);
|
||||
}
|
||||
} else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
|
||||
(lead, trail)
|
||||
(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
@ -311,7 +343,7 @@ impl EucKrEncoder {
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
handle.write_two(lead, trail)
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
|
99  third_party/rust/encoding_rs/src/gb18030.rs  vendored
@ -61,19 +61,23 @@ impl Gb18030Decoder {
|
||||
|
||||
fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
self.pending.count() + match self.first {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
} + match self.second {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
} + match self.third {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
} + match self.pending_ascii {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
},
|
||||
self.pending.count()
|
||||
+ match self.first {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.second {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.third {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.pending_ascii {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@ -257,9 +261,9 @@ impl Gb18030Decoder {
|
||||
} else {
|
||||
handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
|
||||
}
|
||||
} else if pointer >= 189000 && pointer <= 1237575 {
|
||||
} else if pointer >= 189_000 && pointer <= 1_237_575 {
|
||||
// Astral
|
||||
handle.write_astral((pointer - (189000usize - 0x10000usize)) as u32)
|
||||
handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(4, 0),
|
||||
unread_handle_fourth.consumed(),
|
||||
@ -391,6 +395,40 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
|
||||
if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
|
||||
(lead, trail)
|
||||
} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
|
||||
let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
|
||||
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
|
||||
(hanzi_lead as u8, hanzi_trail as u8)
|
||||
} else {
|
||||
let (lead, gbk_trail) = if bmp < 0x72DC {
|
||||
// Above GB2312
|
||||
let pointer = gbk_top_ideograph_encode(bmp) as usize;
|
||||
let lead = (pointer / 190) + 0x81;
|
||||
let gbk_trail = pointer % 190;
|
||||
(lead, gbk_trail)
|
||||
} else {
|
||||
// To the left of GB2312
|
||||
let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
|
||||
let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
|
||||
let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
|
||||
(lead, gbk_trail)
|
||||
};
|
||||
let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
(lead as u8, (gbk_trail + offset) as u8)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "fast-gb-hanzi-encode")]
|
||||
#[inline(always)]
|
||||
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
|
||||
gbk_hanzi_encode(bmp_minus_unified_start)
|
||||
}
|
||||
|
||||
pub struct Gb18030Encoder {
|
||||
extended: bool,
|
||||
}
|
||||
@ -447,33 +485,8 @@ impl Gb18030Encoder {
|
||||
// CJK Unified Ideographs
|
||||
// Can't fail now, since all are
|
||||
// mapped.
|
||||
// XXX Can we do something smarter
|
||||
// than linear search for GB2312
|
||||
// Level 2 Hanzi, which are almost
|
||||
// Unicode-ordered?
|
||||
if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
|
||||
let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
|
||||
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
|
||||
handle.write_two(hanzi_lead as u8, hanzi_trail as u8)
|
||||
} else {
|
||||
let (lead, gbk_trail) = if bmp < 0x72DC {
|
||||
// Above GB2312
|
||||
let pointer = gbk_top_ideograph_encode(bmp) as usize;
|
||||
let lead = (pointer / 190) + 0x81;
|
||||
let gbk_trail = pointer % 190;
|
||||
(lead, gbk_trail)
|
||||
} else {
|
||||
// To the left of GB2312
|
||||
let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
|
||||
let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
|
||||
let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
|
||||
(lead, gbk_trail)
|
||||
};
|
||||
let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
handle.write_two(lead as u8, (gbk_trail + offset) as u8)
|
||||
}
|
||||
let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
|
||||
handle.write_two(lead, trail)
|
||||
} else if bmp == 0xE5E5 {
|
||||
// It's not optimal to check for the unmappable
|
||||
// and for euro at this stage, but getting
|
||||
@ -522,7 +535,7 @@ impl Gb18030Encoder {
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let range_pointer = astral as usize + (189000usize - 0x10000usize);
|
||||
let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
|
||||
let first = range_pointer / (10 * 126 * 10);
|
||||
let rem_first = range_pointer % (10 * 126 * 10);
|
||||
let second = rem_first / (10 * 126);
|
||||
|
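Note: the astral branch above is exact arithmetic: range pointers 189_000..=1_237_575 cover U+10000..=U+10FFFF, and the encoder splits a range pointer into four digits with bases 10, 126 and 10. A worked sketch of the digit split (the 0x81/0x30/0x81/0x30 byte offsets follow the GB18030 four-byte layout; the helper itself is illustrative):

    // Illustrative: astral scalar -> GB18030 four-byte sequence using the
    // same range-pointer split as the encoder above.
    fn gb18030_astral_bytes(astral: u32) -> [u8; 4] {
        let range_pointer = astral as usize + (189_000 - 0x1_0000);
        let first = range_pointer / (10 * 126 * 10);
        let rem_first = range_pointer % (10 * 126 * 10);
        let second = rem_first / (10 * 126);
        let rem_second = rem_first % (10 * 126);
        let third = rem_second / 10;
        let fourth = rem_second % 10;
        [
            (first + 0x81) as u8,
            (second + 0x30) as u8,
            (third + 0x81) as u8,
            (fourth + 0x30) as u8,
        ]
    }

For U+10000 the range pointer is exactly 189_000, which splits to 0x90 0x30 0x81 0x30.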
279  third_party/rust/encoding_rs/src/handles.rs  vendored
@ -16,28 +16,24 @@
|
||||
//! the plan is to replace the internals with unsafe code that omits the
|
||||
//! bound check at the read/write time.
|
||||
|
||||
#[cfg(
|
||||
all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
#[cfg(all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
)]
|
||||
))]
|
||||
use simd_funcs::*;
|
||||
|
||||
#[cfg(
|
||||
all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
#[cfg(all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
)]
|
||||
))]
|
||||
use simd::u16x8;
|
||||
|
||||
use super::DecoderResult;
|
||||
@ -92,6 +88,7 @@ impl Endian for LittleEndian {
|
||||
const OPPOSITE_ENDIAN: bool = true;
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
struct UnalignedU16Slice {
|
||||
ptr: *const u8,
|
||||
len: usize,
|
||||
@ -114,11 +111,7 @@ impl UnalignedU16Slice {
|
||||
assert!(i < self.len);
|
||||
unsafe {
|
||||
let mut u: u16 = ::std::mem::uninitialized();
|
||||
::std::ptr::copy_nonoverlapping(
|
||||
self.ptr.offset((i * 2) as isize),
|
||||
&mut u as *mut u16 as *mut u8,
|
||||
2,
|
||||
);
|
||||
::std::ptr::copy_nonoverlapping(self.ptr.add(i * 2), &mut u as *mut u16 as *mut u8, 2);
|
||||
u
|
||||
}
|
||||
}
|
||||
@ -128,7 +121,7 @@ impl UnalignedU16Slice {
|
||||
pub fn simd_at(&self, i: usize) -> u16x8 {
|
||||
assert!(i + SIMD_STRIDE_SIZE / 2 <= self.len);
|
||||
let byte_index = i * 2;
|
||||
unsafe { to_u16_lanes(load16_unaligned(self.ptr.offset(byte_index as isize))) }
|
||||
unsafe { to_u16_lanes(load16_unaligned(self.ptr.add(byte_index))) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@ -141,7 +134,7 @@ impl UnalignedU16Slice {
|
||||
// XXX the return value should be restricted not to
|
||||
// outlive self.
|
||||
assert!(from <= self.len);
|
||||
unsafe { UnalignedU16Slice::new(self.ptr.offset((from * 2) as isize), self.len - from) }
|
||||
unsafe { UnalignedU16Slice::new(self.ptr.add(from * 2), self.len - from) }
|
||||
}
|
||||
|
||||
#[cfg(feature = "simd-accel")]
|
||||
@ -157,7 +150,7 @@ impl UnalignedU16Slice {
|
||||
simd = simd_byte_swap(simd);
|
||||
}
|
||||
unsafe {
|
||||
store8_unaligned(other.as_mut_ptr().offset(offset as isize), simd);
|
||||
store8_unaligned(other.as_mut_ptr().add(offset), simd);
|
||||
}
|
||||
if contains_surrogates(simd) {
|
||||
break;
|
||||
@ -183,9 +176,9 @@ impl UnalignedU16Slice {
|
||||
#[inline(always)]
|
||||
fn copy_bmp_to<E: Endian>(&self, other: &mut [u16]) -> Option<(u16, usize)> {
|
||||
assert!(self.len <= other.len());
|
||||
for i in 0..self.len {
|
||||
for (i, target) in other.iter_mut().enumerate().take(self.len) {
|
||||
let unit = swap_if_opposite_endian::<E>(self.at(i));
|
||||
other[i] = unit;
|
||||
*target = unit;
|
||||
if super::in_range16(unit, 0xD800, 0xE000) {
|
||||
return Some((unit, i));
|
||||
}
|
||||
@ -255,7 +248,7 @@ fn copy_unaligned_basic_latin_to_ascii<E: Endian>(
|
||||
}
|
||||
let packed = simd_pack(first, second);
|
||||
unsafe {
|
||||
store16_unaligned(dst.as_mut_ptr().offset(offset as isize), packed);
|
||||
store16_unaligned(dst.as_mut_ptr().add(offset), packed);
|
||||
}
|
||||
offset += SIMD_STRIDE_SIZE;
|
||||
if offset > len_minus_stride {
|
||||
@ -302,16 +295,16 @@ fn convert_unaligned_utf16_to_utf8<E: Endian>(
|
||||
let non_ascii_minus_surrogate_start = non_ascii.wrapping_sub(0xD800);
|
||||
if non_ascii_minus_surrogate_start > (0xDFFF - 0xD800) {
|
||||
if non_ascii < 0x800 {
|
||||
dst[dst_pos] = ((non_ascii as u32 >> 6) | 0xC0u32) as u8;
|
||||
dst[dst_pos] = ((non_ascii >> 6) | 0xC0) as u8;
|
||||
dst_pos += 1;
|
||||
dst[dst_pos] = ((non_ascii as u32 & 0x3Fu32) | 0x80u32) as u8;
|
||||
dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8;
|
||||
dst_pos += 1;
|
||||
} else {
|
||||
dst[dst_pos] = ((non_ascii as u32 >> 12) | 0xE0u32) as u8;
|
||||
dst[dst_pos] = ((non_ascii >> 12) | 0xE0) as u8;
|
||||
dst_pos += 1;
|
||||
dst[dst_pos] = (((non_ascii as u32 & 0xFC0u32) >> 6) | 0x80u32) as u8;
|
||||
dst[dst_pos] = (((non_ascii & 0xFC0) >> 6) | 0x80) as u8;
|
||||
dst_pos += 1;
|
||||
dst[dst_pos] = ((non_ascii as u32 & 0x3Fu32) | 0x80u32) as u8;
|
||||
dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8;
|
||||
dst_pos += 1;
|
||||
}
|
||||
} else if non_ascii_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
@ -322,7 +315,7 @@ fn convert_unaligned_utf16_to_utf8<E: Endian>(
|
||||
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
|
||||
// The next code unit is a low surrogate. Advance position.
|
||||
src_pos += 1;
|
||||
let point = ((non_ascii as u32) << 10) + (second as u32)
|
||||
let point = (u32::from(non_ascii) << 10) + u32::from(second)
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
|
||||
|
||||
dst[dst_pos] = ((point >> 18) | 0xF0u32) as u8;
|
||||
@ -613,7 +606,7 @@ impl<'a> Utf16Destination<'a> {
|
||||
#[inline(always)]
|
||||
fn write_ascii(&mut self, ascii: u8) {
|
||||
debug_assert!(ascii < 0x80);
|
||||
self.write_code_unit(ascii as u16);
|
||||
self.write_code_unit(u16::from(ascii));
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_bmp(&mut self, bmp: u16) {
|
||||
@ -637,7 +630,7 @@ impl<'a> Utf16Destination<'a> {
|
||||
#[inline(always)]
|
||||
fn write_astral(&mut self, astral: u32) {
|
||||
debug_assert!(astral > 0xFFFF);
|
||||
debug_assert!(astral <= 0x10FFFF);
|
||||
debug_assert!(astral <= 0x10_FFFF);
|
||||
self.write_code_unit((0xD7C0 + (astral >> 10)) as u16);
|
||||
self.write_code_unit((0xDC00 + (astral & 0x3FF)) as u16);
|
||||
}
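Note: 0xD7C0 is 0xD800 - (0x10000 >> 10), so adding it to astral >> 10 folds the usual "subtract 0x10000" step into the high-surrogate base. An illustrative check of the arithmetic used above:

    // Illustrative check of the surrogate math in write_astral.
    fn to_surrogates(astral: u32) -> (u16, u16) {
        debug_assert!((0x1_0000..=0x10_FFFF).contains(&astral));
        ((0xD7C0 + (astral >> 10)) as u16, (0xDC00 + (astral & 0x3FF)) as u16)
    }
    // to_surrogates(0x1_0000)  == (0xD800, 0xDC00)
    // to_surrogates(0x10_FFFF) == (0xDBFF, 0xDFFF)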
|
||||
@ -962,15 +955,15 @@ impl<'a> Utf8Destination<'a> {
|
||||
fn write_mid_bmp(&mut self, mid_bmp: u16) {
|
||||
debug_assert!(mid_bmp >= 0x80);
|
||||
debug_assert!(mid_bmp < 0x800);
|
||||
self.write_code_unit(((mid_bmp as u32 >> 6) | 0xC0u32) as u8);
|
||||
self.write_code_unit(((mid_bmp as u32 & 0x3Fu32) | 0x80u32) as u8);
|
||||
self.write_code_unit(((mid_bmp >> 6) | 0xC0) as u8);
|
||||
self.write_code_unit(((mid_bmp & 0x3F) | 0x80) as u8);
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_upper_bmp(&mut self, upper_bmp: u16) {
|
||||
debug_assert!(upper_bmp >= 0x800);
|
||||
self.write_code_unit(((upper_bmp as u32 >> 12) | 0xE0u32) as u8);
|
||||
self.write_code_unit((((upper_bmp as u32 & 0xFC0u32) >> 6) | 0x80u32) as u8);
|
||||
self.write_code_unit(((upper_bmp as u32 & 0x3Fu32) | 0x80u32) as u8);
|
||||
self.write_code_unit(((upper_bmp >> 12) | 0xE0) as u8);
|
||||
self.write_code_unit((((upper_bmp & 0xFC0) >> 6) | 0x80) as u8);
|
||||
self.write_code_unit(((upper_bmp & 0x3F) | 0x80) as u8);
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_bmp_excl_ascii(&mut self, bmp: u16) {
|
||||
@ -983,16 +976,17 @@ impl<'a> Utf8Destination<'a> {
|
||||
#[inline(always)]
|
||||
fn write_astral(&mut self, astral: u32) {
|
||||
debug_assert!(astral > 0xFFFF);
|
||||
debug_assert!(astral <= 0x10FFFF);
|
||||
self.write_code_unit(((astral >> 18) | 0xF0u32) as u8);
|
||||
self.write_code_unit((((astral & 0x3F000u32) >> 12) | 0x80u32) as u8);
|
||||
self.write_code_unit((((astral & 0xFC0u32) >> 6) | 0x80u32) as u8);
|
||||
self.write_code_unit(((astral & 0x3Fu32) | 0x80u32) as u8);
|
||||
debug_assert!(astral <= 0x10_FFFF);
|
||||
self.write_code_unit(((astral >> 18) | 0xF0) as u8);
|
||||
self.write_code_unit((((astral & 0x3F000) >> 12) | 0x80) as u8);
|
||||
self.write_code_unit((((astral & 0xFC0) >> 6) | 0x80) as u8);
|
||||
self.write_code_unit(((astral & 0x3F) | 0x80) as u8);
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn write_surrogate_pair(&mut self, high: u16, low: u16) {
|
||||
self.write_astral(
|
||||
((high as u32) << 10) + (low as u32) - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
(u32::from(high) << 10) + u32::from(low)
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
);
|
||||
}
|
||||
#[inline(always)]
|
||||
@ -1088,13 +1082,7 @@ impl<'a> Utf8Destination<'a> {
|
||||
// Validate first, then memcpy to let memcpy do its thing even for
|
||||
// non-ASCII. (And potentially do something better than SSE2 for ASCII.)
|
||||
let valid_len = utf8_valid_up_to(&src_remaining[..min_len]);
|
||||
unsafe {
|
||||
::std::ptr::copy_nonoverlapping(
|
||||
src_remaining.as_ptr(),
|
||||
dst_remaining.as_mut_ptr(),
|
||||
valid_len,
|
||||
);
|
||||
}
|
||||
(&mut dst_remaining[..valid_len]).copy_from_slice(&src_remaining[..valid_len]);
|
||||
source.pos += valid_len;
|
||||
self.pos += valid_len;
|
||||
}
|
||||
@ -1162,23 +1150,24 @@ impl<'a> Utf16Source<'a> {
|
||||
#[inline(always)]
|
||||
fn read(&mut self) -> char {
|
||||
self.old_pos = self.pos;
|
||||
let unit = self.slice[self.pos] as u32;
|
||||
let unit = self.slice[self.pos];
|
||||
self.pos += 1;
|
||||
let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
|
||||
if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
|
||||
return unsafe { ::std::mem::transmute(unit) };
|
||||
return unsafe { ::std::char::from_u32_unchecked(u32::from(unit)) };
|
||||
}
|
||||
if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if self.pos < self.slice.len() {
|
||||
let second = self.slice[self.pos] as u32;
|
||||
let second = self.slice[self.pos];
|
||||
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
|
||||
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
|
||||
// The next code unit is a low surrogate. Advance position.
|
||||
self.pos += 1;
|
||||
return unsafe {
|
||||
::std::mem::transmute(
|
||||
(unit << 10) + second - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
::std::char::from_u32_unchecked(
|
||||
(u32::from(unit) << 10) + u32::from(second)
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
)
|
||||
};
|
||||
}
|
||||
@ -1207,14 +1196,14 @@ impl<'a> Utf16Source<'a> {
|
||||
if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if self.pos < self.slice.len() {
|
||||
let second = self.slice[self.pos] as u32;
|
||||
let second = self.slice[self.pos];
|
||||
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
|
||||
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
|
||||
// The next code unit is a low surrogate. Advance position.
|
||||
self.pos += 1;
|
||||
return Unicode::NonAscii(NonAscii::Astral(unsafe {
|
||||
::std::mem::transmute(
|
||||
((unit as u32) << 10) + (second as u32)
|
||||
::std::char::from_u32_unchecked(
|
||||
(u32::from(unit) << 10) + u32::from(second)
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
)
|
||||
}));
|
||||
@ -1271,14 +1260,14 @@ impl<'a> Utf16Source<'a> {
|
||||
} else if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
|
||||
// high surrogate
|
||||
if self.pos < self.slice.len() {
|
||||
let second = self.slice[self.pos] as u32;
|
||||
let second = self.slice[self.pos];
|
||||
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
|
||||
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
|
||||
// The next code unit is a low surrogate. Advance position.
|
||||
self.pos += 1;
|
||||
NonAscii::Astral(unsafe {
|
||||
::std::mem::transmute(
|
||||
((unit as u32) << 10) + (second as u32)
|
||||
::std::char::from_u32_unchecked(
|
||||
(u32::from(unit) << 10) + u32::from(second)
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
)
|
||||
})
|
||||
@ -1344,15 +1333,15 @@ impl<'a> Utf16Source<'a> {
|
||||
// Unpaired surrogate at the end of the buffer.
|
||||
NonAscii::BmpExclAscii(0xFFFDu16)
|
||||
} else {
|
||||
let second = self.slice[self.pos] as u32;
|
||||
let second = self.slice[self.pos];
|
||||
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
|
||||
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
|
||||
// The next code unit is a low surrogate. Advance position.
|
||||
self.pos += 1;
|
||||
NonAscii::Astral(unsafe {
|
||||
::std::mem::transmute(
|
||||
((unit as u32) << 10) + (second as u32)
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
::std::char::from_u32_unchecked(
|
||||
(u32::from(unit) << 10) + u32::from(second)
|
||||
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
|
||||
)
|
||||
})
|
||||
} else {
|
||||
@ -1469,57 +1458,60 @@ impl<'a> Utf8Source<'a> {
|
||||
#[inline(always)]
|
||||
fn read(&mut self) -> char {
|
||||
self.old_pos = self.pos;
|
||||
let unit = self.slice[self.pos] as u32;
|
||||
if unit < 0x80u32 {
|
||||
let unit = self.slice[self.pos];
|
||||
if unit < 0x80 {
|
||||
self.pos += 1;
|
||||
return unsafe { ::std::mem::transmute(unit) };
|
||||
return char::from(unit);
|
||||
}
|
||||
if unit < 0xE0u32 {
|
||||
let point = ((unit & 0x1Fu32) << 6) | (self.slice[self.pos + 1] as u32 & 0x3Fu32);
|
||||
if unit < 0xE0 {
|
||||
let point =
|
||||
((u32::from(unit) & 0x1F) << 6) | (u32::from(self.slice[self.pos + 1]) & 0x3F);
|
||||
self.pos += 2;
|
||||
return unsafe { ::std::mem::transmute(point) };
|
||||
return unsafe { ::std::char::from_u32_unchecked(point) };
|
||||
}
|
||||
if unit < 0xF0u32 {
|
||||
let point = ((unit & 0xFu32) << 12)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
|
||||
if unit < 0xF0 {
|
||||
let point = ((u32::from(unit) & 0xF) << 12)
|
||||
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 6)
|
||||
| (u32::from(self.slice[self.pos + 2]) & 0x3F);
|
||||
self.pos += 3;
|
||||
return unsafe { ::std::mem::transmute(point) };
|
||||
return unsafe { ::std::char::from_u32_unchecked(point) };
|
||||
}
|
||||
let point = ((unit & 0x7u32) << 18)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
|
||||
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
|
||||
let point = ((u32::from(unit) & 0x7) << 18)
|
||||
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
|
||||
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
|
||||
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
|
||||
self.pos += 4;
|
||||
unsafe { ::std::mem::transmute(point) }
|
||||
unsafe { ::std::char::from_u32_unchecked(point) }
|
||||
}
|
||||
#[inline(always)]
|
||||
fn read_enum(&mut self) -> Unicode {
|
||||
self.old_pos = self.pos;
|
||||
let unit = self.slice[self.pos];
|
||||
if unit < 0x80u8 {
|
||||
if unit < 0x80 {
|
||||
self.pos += 1;
|
||||
return Unicode::Ascii(unit);
|
||||
}
|
||||
if unit < 0xE0u8 {
|
||||
if unit < 0xE0 {
|
||||
let point =
|
||||
(((unit as u32) & 0x1Fu32) << 6) | (self.slice[self.pos + 1] as u32 & 0x3Fu32);
|
||||
((u16::from(unit) & 0x1F) << 6) | (u16::from(self.slice[self.pos + 1]) & 0x3F);
|
||||
self.pos += 2;
|
||||
return Unicode::NonAscii(NonAscii::BmpExclAscii(point as u16));
|
||||
return Unicode::NonAscii(NonAscii::BmpExclAscii(point));
|
||||
}
|
||||
if unit < 0xF0u8 {
|
||||
let point = (((unit as u32) & 0xFu32) << 12)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
|
||||
if unit < 0xF0 {
|
||||
let point = ((u16::from(unit) & 0xF) << 12)
|
||||
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
|
||||
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
|
||||
self.pos += 3;
|
||||
return Unicode::NonAscii(NonAscii::BmpExclAscii(point as u16));
|
||||
return Unicode::NonAscii(NonAscii::BmpExclAscii(point));
|
||||
}
|
||||
let point = (((unit as u32) & 0x7u32) << 18)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
|
||||
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
|
||||
let point = ((u32::from(unit) & 0x7) << 18)
|
||||
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
|
||||
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
|
||||
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
|
||||
self.pos += 4;
|
||||
Unicode::NonAscii(NonAscii::Astral(unsafe { ::std::mem::transmute(point) }))
|
||||
Unicode::NonAscii(NonAscii::Astral(unsafe {
|
||||
::std::char::from_u32_unchecked(point)
|
||||
}))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn unread(&mut self) -> usize {
|
||||
@ -1556,25 +1548,24 @@ impl<'a> Utf8Source<'a> {
|
||||
dest.pos += consumed;
|
||||
// We don't need to check space in destination, because
|
||||
// `ascii_to_ascii()` already did.
|
||||
let non_ascii32 = non_ascii as u32;
|
||||
if non_ascii32 < 0xE0u32 {
|
||||
let point = ((non_ascii32 & 0x1Fu32) << 6)
|
||||
| (self.slice[self.pos + 1] as u32 & 0x3Fu32);
|
||||
if non_ascii < 0xE0 {
|
||||
let point = ((u16::from(non_ascii) & 0x1F) << 6)
|
||||
| (u16::from(self.slice[self.pos + 1]) & 0x3F);
|
||||
self.pos += 2;
|
||||
NonAscii::BmpExclAscii(point as u16)
|
||||
} else if non_ascii32 < 0xF0u32 {
|
||||
let point = ((non_ascii32 & 0xFu32) << 12)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
|
||||
NonAscii::BmpExclAscii(point)
|
||||
} else if non_ascii < 0xF0 {
|
||||
let point = ((u16::from(non_ascii) & 0xF) << 12)
|
||||
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
|
||||
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
|
||||
self.pos += 3;
|
||||
NonAscii::BmpExclAscii(point as u16)
|
||||
NonAscii::BmpExclAscii(point)
|
||||
} else {
|
||||
let point = ((non_ascii32 & 0x7u32) << 18)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
|
||||
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
|
||||
let point = ((u32::from(non_ascii) & 0x7) << 18)
|
||||
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
|
||||
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
|
||||
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
|
||||
self.pos += 4;
|
||||
NonAscii::Astral(unsafe { ::std::mem::transmute(point) })
|
||||
NonAscii::Astral(unsafe { ::std::char::from_u32_unchecked(point) })
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1607,25 +1598,24 @@ impl<'a> Utf8Source<'a> {
|
||||
self.pos += consumed;
|
||||
dest.pos += consumed;
|
||||
if dest.pos + 1 < dst_len {
|
||||
let non_ascii32 = non_ascii as u32;
|
||||
if non_ascii32 < 0xE0u32 {
|
||||
let point = ((non_ascii32 & 0x1Fu32) << 6)
|
||||
| (self.slice[self.pos + 1] as u32 & 0x3Fu32);
|
||||
if non_ascii < 0xE0 {
|
||||
let point = ((u16::from(non_ascii) & 0x1F) << 6)
|
||||
| (u16::from(self.slice[self.pos + 1]) & 0x3F);
|
||||
self.pos += 2;
|
||||
NonAscii::BmpExclAscii(point as u16)
|
||||
} else if non_ascii32 < 0xF0u32 {
|
||||
let point = ((non_ascii32 & 0xFu32) << 12)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
|
||||
NonAscii::BmpExclAscii(point)
|
||||
} else if non_ascii < 0xF0 {
|
||||
let point = ((u16::from(non_ascii) & 0xF) << 12)
|
||||
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
|
||||
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
|
||||
self.pos += 3;
|
||||
NonAscii::BmpExclAscii(point as u16)
|
||||
NonAscii::BmpExclAscii(point)
|
||||
} else {
|
||||
let point = ((non_ascii32 & 0x7u32) << 18)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
|
||||
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
|
||||
let point = ((u32::from(non_ascii) & 0x7) << 18)
|
||||
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
|
||||
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
|
||||
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
|
||||
self.pos += 4;
|
||||
NonAscii::Astral(unsafe { ::std::mem::transmute(point) })
|
||||
NonAscii::Astral(unsafe { ::std::char::from_u32_unchecked(point) })
|
||||
}
|
||||
} else {
|
||||
return CopyAsciiResult::Stop((
|
||||
@ -1665,25 +1655,24 @@ impl<'a> Utf8Source<'a> {
|
||||
self.pos += consumed;
|
||||
dest.pos += consumed;
|
||||
if dest.pos + 3 < dst_len {
|
||||
let non_ascii32 = non_ascii as u32;
|
||||
if non_ascii32 < 0xE0u32 {
|
||||
let point = ((non_ascii32 & 0x1Fu32) << 6)
|
||||
| (self.slice[self.pos + 1] as u32 & 0x3Fu32);
|
||||
if non_ascii < 0xE0 {
|
||||
let point = ((u16::from(non_ascii) & 0x1F) << 6)
|
||||
| (u16::from(self.slice[self.pos + 1]) & 0x3F);
|
||||
self.pos += 2;
|
||||
NonAscii::BmpExclAscii(point as u16)
|
||||
} else if non_ascii32 < 0xF0u32 {
|
||||
let point = ((non_ascii32 & 0xFu32) << 12)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
|
||||
NonAscii::BmpExclAscii(point)
|
||||
} else if non_ascii < 0xF0 {
|
||||
let point = ((u16::from(non_ascii) & 0xF) << 12)
|
||||
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
|
||||
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
|
||||
self.pos += 3;
|
||||
NonAscii::BmpExclAscii(point as u16)
|
||||
NonAscii::BmpExclAscii(point)
|
||||
} else {
|
||||
let point = ((non_ascii32 & 0x7u32) << 18)
|
||||
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
|
||||
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
|
||||
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
|
||||
let point = ((u32::from(non_ascii) & 0x7) << 18)
|
||||
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
|
||||
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
|
||||
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
|
||||
self.pos += 4;
|
||||
NonAscii::Astral(unsafe { ::std::mem::transmute(point) })
|
||||
NonAscii::Astral(unsafe { ::std::char::from_u32_unchecked(point) })
|
||||
}
|
||||
} else {
|
||||
return CopyAsciiResult::Stop((
|
||||
|
108  third_party/rust/encoding_rs/src/iso_2022_jp.rs  vendored
@ -107,7 +107,7 @@ impl Iso2022JpDecoder {
|
||||
}
|
||||
Iso2022JpDecoderState::Katakana => {
|
||||
destination_handle
|
||||
.write_upper_bmp(self.lead as u16 - 0x21u16 + 0xFF61u16);
|
||||
.write_upper_bmp(u16::from(self.lead) - 0x21u16 + 0xFF61u16);
|
||||
self.lead = 0x0u8;
|
||||
}
|
||||
Iso2022JpDecoderState::LeadByte => {
|
||||
@ -183,7 +183,7 @@ impl Iso2022JpDecoder {
|
||||
}
|
||||
self.output_flag = false;
|
||||
if b >= 0x21u8 && b <= 0x5Fu8 {
|
||||
destination_handle.write_upper_bmp(b as u16 - 0x21u16 + 0xFF61u16);
|
||||
destination_handle.write_upper_bmp(u16::from(b) - 0x21u16 + 0xFF61u16);
|
||||
continue;
|
||||
}
|
||||
return (
|
||||
@ -231,11 +231,11 @@ impl Iso2022JpDecoder {
|
||||
// and Katakana (10% acconding to Lunde).
|
||||
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + trail_minus_offset as u16);
|
||||
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset));
|
||||
continue;
|
||||
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16);
|
||||
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset));
|
||||
continue;
|
||||
} else if trail_minus_offset > (0xFE - 0xA1) {
|
||||
return (
|
||||
@ -356,7 +356,46 @@ impl Iso2022JpDecoder {
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(if_let_redundant_pattern_matching, if_same_then_else))]
|
||||
#[cfg(feature = "fast-kanji-encode")]
|
||||
#[inline(always)]
|
||||
fn is_kanji_mapped(bmp: u16) -> bool {
|
||||
// Use the shift_jis variant, because we don't care about the
|
||||
// byte values here.
|
||||
jis0208_kanji_shift_jis_encode(bmp).is_some()
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-kanji-encode"))]
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(
|
||||
if_let_redundant_pattern_matching,
|
||||
if_same_then_else
|
||||
)
|
||||
)]
|
||||
#[inline(always)]
|
||||
fn is_kanji_mapped(bmp: u16) -> bool {
|
||||
if 0x4EDD == bmp {
|
||||
true
|
||||
} else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) {
|
||||
// Use the shift_jis variant, because we don't care about the
|
||||
// byte values here.
|
||||
true
|
||||
} else if let Some(_) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
true
|
||||
} else if let Some(_) = position(&IBM_KANJI[..], bmp) {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(
|
||||
if_let_redundant_pattern_matching,
|
||||
if_same_then_else
|
||||
)
|
||||
)]
|
||||
fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
|
||||
// The code below uses else after return to
|
||||
// keep the same structure as in EUC-JP.
|
||||
@ -365,19 +404,7 @@ fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
true
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if 0x4EDD == bmp {
|
||||
true
|
||||
} else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) {
|
||||
// Use the shift_jis variant, because we don't care about the
|
||||
// byte values here.
|
||||
true
|
||||
} else if let Some(_) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
true
|
||||
} else if let Some(_) = position(&IBM_KANJI[..], bmp) {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
is_kanji_mapped(bmp)
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
@ -406,6 +433,33 @@ fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "fast-kanji-encode")]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
jis0208_kanji_iso_2022_jp_encode(bmp)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
Some((0x21, 0xB8 - 0x80))
|
||||
} else if let Some((lead, trail)) = jis0208_level1_kanji_iso_2022_jp_encode(bmp) {
|
||||
Some((lead, trail))
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + (0xD0 - 0x80);
|
||||
let trail = (pos % 94) + 0x21;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + (0xF9 - 0x80);
|
||||
let trail = (pos % 94) + 0x21;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
enum Iso2022JpEncoderState {
|
||||
Ascii,
|
||||
Roman,
|
||||
@ -605,25 +659,9 @@ impl Iso2022JpEncoder {
|
||||
handle.write_two(0x24, 0x21 + bmp_minus_hiragana as u8);
|
||||
continue;
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
handle.write_two(0x21, 0xB8 - 0x80);
|
||||
continue;
|
||||
} else if let Some((lead, trail)) =
|
||||
jis0208_level1_kanji_iso_2022_jp_encode(bmp)
|
||||
{
|
||||
if let Some((lead, trail)) = encode_kanji(bmp) {
|
||||
handle.write_two(lead, trail);
|
||||
continue;
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + (0xD0 - 0x80);
|
||||
let trail = (pos % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + (0xF9 - 0x80);
|
||||
let trail = (pos % 94) + 0x21;
|
||||
handle.write_two(lead as u8, trail as u8);
|
||||
continue;
|
||||
} else {
|
||||
self.state = Iso2022JpEncoderState::Ascii;
|
||||
return (
|
||||
|
224  third_party/rust/encoding_rs/src/lib.rs  vendored
@ -7,8 +7,15 @@
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self))]
|
||||
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.9")]
|
||||
#![cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(
|
||||
doc_markdown,
|
||||
inline_always,
|
||||
new_ret_no_self
|
||||
)
|
||||
)]
|
||||
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.12")]
|
||||
|
||||
//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
|
||||
//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
|
||||
@ -82,10 +89,7 @@
|
||||
//! // Very short output buffer to demonstrate the output buffer getting full.
|
||||
//! // Normally, you'd use something like `[0u8; 2048]`.
|
||||
//! let mut buffer_bytes = [0u8; 8];
|
||||
//! // Rust doesn't allow us to stack-allocate a `mut str` without `unsafe`.
|
||||
//! let mut buffer: &mut str = unsafe {
|
||||
//! std::mem::transmute(&mut buffer_bytes[..])
|
||||
//! };
|
||||
//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
|
||||
//!
|
||||
//! // How many bytes in the buffer currently hold significant data.
|
||||
//! let mut bytes_in_buffer = 0usize;
|
||||
@ -231,16 +235,17 @@
|
||||
//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
|
||||
//! of implementation.
|
||||
//!
|
||||
//! Despite the focus on the Web, encoding_rs may well be useful for decoding
|
||||
//! email, although you'll need to implement UTF-7 decoding and label handling
|
||||
//! by other means. (Due to the Web focus, patches to add UTF-7 are unwelcome
|
||||
//! in encoding_rs itself.) Also, despite the browser focus, the hope is that
|
||||
//! non-browser applications that wish to consume Web content or submit Web
|
||||
//! forms in a Web-compatible way will find encoding_rs useful. While
|
||||
//! encoding_rs does not try to match Windows behavior, many of the encodings
|
||||
//! are close enough to legacy encodings implemented by Windows that
|
||||
//! applications that need to consume data in legacy Windows encodins may
|
||||
//! find encoding_rs useful.
|
||||
//! Despite the browser focus, the hope is that non-browser applications
|
||||
//! that wish to consume Web content or submit Web forms in a Web-compatible
|
||||
//! way will find encoding_rs useful. While encoding_rs does not try to match
|
||||
//! Windows behavior, many of the encodings are close enough to legacy
|
||||
//! encodings implemented by Windows that applications that need to consume
|
||||
//! data in legacy Windows encodins may find encoding_rs useful.
|
||||
//!
|
||||
//! For decoding email, UTF-7 support is needed (unfortunately) in additition
|
||||
//! to the encodings defined in the Encoding Standard. The
|
||||
//! [charset](https://crates.io/crates/charset) wraps encoding_rs and adds
|
||||
//! UTF-7 decoding for email purposes.
|
||||
//!
|
||||
//! # Streaming & Non-Streaming; Rust & C/C++
|
||||
//!
|
||||
@ -660,22 +665,21 @@
|
||||
//! for discussion about the UTF-16 family.
|
||||
|
||||
#![cfg_attr(
|
||||
feature = "simd-accel", feature(cfg_target_feature, platform_intrinsics, core_intrinsics)
|
||||
feature = "simd-accel",
|
||||
feature(platform_intrinsics, core_intrinsics)
|
||||
)]
|
||||
|
||||
#[macro_use]
|
||||
extern crate cfg_if;
|
||||
|
||||
#[cfg(
|
||||
all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
#[cfg(all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
)]
|
||||
))]
|
||||
extern crate simd;
|
||||
|
||||
#[cfg(feature = "serde")]
|
||||
@ -692,26 +696,15 @@ extern crate serde_json;
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
#[cfg(
|
||||
all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
)
|
||||
)]
|
||||
mod simd_funcs;
|
||||
|
||||
#[cfg(
|
||||
#[cfg(all(
|
||||
feature = "simd-accel",
|
||||
any(
|
||||
all(feature = "simd-accel", target_feature = "sse2"),
|
||||
target_feature = "sse2",
|
||||
all(target_endian = "little", target_arch = "aarch64"),
|
||||
all(target_endian = "little", target_arch = "arm")
|
||||
all(target_endian = "little", target_feature = "neon")
|
||||
)
|
||||
)]
|
||||
mod utf_8_core;
|
||||
))]
|
||||
mod simd_funcs;
|
||||
|
||||
#[cfg(test)]
|
||||
mod testing;
|
||||
@ -934,7 +927,7 @@ pub static GBK: &'static Encoding = &GBK_INIT;
|
||||
/// items.
|
||||
pub static IBM866_INIT: Encoding = Encoding {
|
||||
name: "IBM866",
|
||||
variant: VariantEncoding::SingleByte(data::IBM866_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
|
||||
};
|
||||
|
||||
/// The IBM866 encoding.
|
||||
@ -1004,7 +997,7 @@ pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_10_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-10",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_10_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
|
||||
};
|
||||
|
||||
/// The ISO-8859-10 encoding.
|
||||
@ -1038,7 +1031,7 @@ pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_13_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-13",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_13_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
|
||||
};
|
||||
|
||||
/// The ISO-8859-13 encoding.
|
||||
@ -1072,7 +1065,7 @@ pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_14_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-14",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_14_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
|
||||
};
|
||||
|
||||
/// The ISO-8859-14 encoding.
|
||||
@ -1106,7 +1099,7 @@ pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_15_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-15",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_15_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
|
||||
};
|
||||
|
||||
/// The ISO-8859-15 encoding.
|
||||
@ -1139,7 +1132,7 @@ pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_16_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-16",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_16_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
|
||||
};
|
||||
|
||||
/// The ISO-8859-16 encoding.
|
||||
@ -1173,7 +1166,7 @@ pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_2_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-2",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_2_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
|
||||
};
|
||||
|
||||
/// The ISO-8859-2 encoding.
|
||||
@ -1205,7 +1198,7 @@ pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_3_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-3",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_3_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
|
||||
};
|
||||
|
||||
/// The ISO-8859-3 encoding.
|
||||
@ -1237,7 +1230,7 @@ pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_4_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-4",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_4_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
|
||||
};
|
||||
|
||||
/// The ISO-8859-4 encoding.
|
||||
@ -1269,7 +1262,7 @@ pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_5_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-5",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_5_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
|
||||
};
|
||||
|
||||
/// The ISO-8859-5 encoding.
|
||||
@ -1301,7 +1294,7 @@ pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_6_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-6",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_6_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
|
||||
};
|
||||
|
||||
/// The ISO-8859-6 encoding.
|
||||
@ -1334,7 +1327,7 @@ pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_7_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-7",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_7_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
|
||||
};
|
||||
|
||||
/// The ISO-8859-7 encoding.
|
||||
@ -1371,7 +1364,7 @@ pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_8_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-8",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_8_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
|
||||
};
|
||||
|
||||
/// The ISO-8859-8 encoding.
|
||||
@ -1406,7 +1399,7 @@ pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
|
||||
/// items.
|
||||
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
|
||||
name: "ISO-8859-8-I",
|
||||
variant: VariantEncoding::SingleByte(data::ISO_8859_8_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
|
||||
};
|
||||
|
||||
/// The ISO-8859-8-I encoding.
|
||||
@ -1441,7 +1434,7 @@ pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
|
||||
/// items.
|
||||
pub static KOI8_R_INIT: Encoding = Encoding {
|
||||
name: "KOI8-R",
|
||||
variant: VariantEncoding::SingleByte(data::KOI8_R_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
|
||||
};
|
||||
|
||||
/// The KOI8-R encoding.
|
||||
@ -1473,7 +1466,7 @@ pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
|
||||
/// items.
|
||||
pub static KOI8_U_INIT: Encoding = Encoding {
|
||||
name: "KOI8-U",
|
||||
variant: VariantEncoding::SingleByte(data::KOI8_U_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
|
||||
};
|
||||
|
||||
/// The KOI8-U encoding.
|
||||
@ -1673,7 +1666,7 @@ pub static GB18030: &'static Encoding = &GB18030_INIT;
|
||||
/// items.
|
||||
pub static MACINTOSH_INIT: Encoding = Encoding {
|
||||
name: "macintosh",
|
||||
variant: VariantEncoding::SingleByte(data::MACINTOSH_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
|
||||
};
|
||||
|
||||
/// The macintosh encoding.
|
||||
@ -1742,7 +1735,7 @@ pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1250_INIT: Encoding = Encoding {
|
||||
name: "windows-1250",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1250_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
|
||||
};
|
||||
|
||||
/// The windows-1250 encoding.
|
||||
@ -1774,7 +1767,7 @@ pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1251_INIT: Encoding = Encoding {
|
||||
name: "windows-1251",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1251_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
|
||||
};
|
||||
|
||||
/// The windows-1251 encoding.
|
||||
@ -1806,7 +1799,7 @@ pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1252_INIT: Encoding = Encoding {
|
||||
name: "windows-1252",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1252_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
|
||||
};
|
||||
|
||||
/// The windows-1252 encoding.
|
||||
@ -1839,7 +1832,7 @@ pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1253_INIT: Encoding = Encoding {
|
||||
name: "windows-1253",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1253_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
|
||||
};
|
||||
|
||||
/// The windows-1253 encoding.
|
||||
@ -1873,7 +1866,7 @@ pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1254_INIT: Encoding = Encoding {
|
||||
name: "windows-1254",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1254_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
|
||||
};
|
||||
|
||||
/// The windows-1254 encoding.
|
||||
@ -1906,7 +1899,7 @@ pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1255_INIT: Encoding = Encoding {
|
||||
name: "windows-1255",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1255_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
|
||||
};
|
||||
|
||||
/// The windows-1255 encoding.
|
||||
@ -1940,7 +1933,7 @@ pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1256_INIT: Encoding = Encoding {
|
||||
name: "windows-1256",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1256_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
|
||||
};
|
||||
|
||||
/// The windows-1256 encoding.
|
||||
@ -1972,7 +1965,7 @@ pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1257_INIT: Encoding = Encoding {
|
||||
name: "windows-1257",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1257_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
|
||||
};
|
||||
|
||||
/// The windows-1257 encoding.
|
||||
@ -2005,7 +1998,7 @@ pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_1258_INIT: Encoding = Encoding {
|
||||
name: "windows-1258",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_1258_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
|
||||
};
|
||||
|
||||
/// The windows-1258 encoding.
|
||||
@ -2042,7 +2035,7 @@ pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
|
||||
/// items.
|
||||
pub static WINDOWS_874_INIT: Encoding = Encoding {
|
||||
name: "windows-874",
|
||||
variant: VariantEncoding::SingleByte(data::WINDOWS_874_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
|
||||
};
|
||||
|
||||
/// The windows-874 encoding.
|
||||
@ -2075,7 +2068,7 @@ pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
|
||||
/// items.
|
||||
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
|
||||
name: "x-mac-cyrillic",
|
||||
variant: VariantEncoding::SingleByte(data::X_MAC_CYRILLIC_DATA),
|
||||
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
|
||||
};
|
||||
|
||||
/// The x-mac-cyrillic encoding.
|
||||
@ -2848,6 +2841,20 @@ impl Encoding {
|
||||
        !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
    }

    /// Checks whether this encoding maps one byte to one Basic Multilingual
    /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
    /// vice versa (for mappable characters).
    ///
    /// `true` iff this encoding is on the list of [Legacy single-byte
    /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
    /// in the spec or x-user-defined.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn is_single_byte(&'static self) -> bool {
        self.variant.is_single_byte()
    }
|
||||
|
||||
/// Checks whether the bytes 0x00...0x7F map mostly to the characters
|
||||
/// U+0000...U+007F and vice versa.
|
||||
#[inline]
|
||||
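As an illustrative aside (standalone Rust, not part of the vendored diff), the new predicate can be exercised directly on the crate's encoding statics:

use encoding_rs::{ISO_8859_5, UTF_8};

fn main() {
    // Legacy single-byte encodings (and x-user-defined) report true;
    // multi-byte and UTF encodings report false.
    assert!(ISO_8859_5.is_single_byte());
    assert!(!UTF_8.is_single_byte());
}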
@ -3002,7 +3009,7 @@ impl Encoding {
|
||||
ascii_valid_up_to(bytes)
|
||||
};
|
||||
if valid_up_to == bytes.len() {
|
||||
let str: &str = unsafe { std::mem::transmute(bytes) };
|
||||
let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
|
||||
return (Cow::Borrowed(str), false);
|
||||
}
|
||||
let decoder = self.new_decoder_without_bom_handling();
|
||||
@ -3094,7 +3101,7 @@ impl Encoding {
|
||||
if self == UTF_8 {
|
||||
let valid_up_to = utf8_valid_up_to(bytes);
|
||||
if valid_up_to == bytes.len() {
|
||||
let str: &str = unsafe { std::mem::transmute(bytes) };
|
||||
let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
|
||||
return Some(Cow::Borrowed(str));
|
||||
}
|
||||
return None;
|
||||
@ -3106,7 +3113,7 @@ impl Encoding {
|
||||
ascii_valid_up_to(bytes)
|
||||
};
|
||||
if valid_up_to == bytes.len() {
|
||||
let str: &str = unsafe { std::mem::transmute(bytes) };
|
||||
let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
|
||||
return Some(Cow::Borrowed(str));
|
||||
}
|
||||
let decoder = self.new_decoder_without_bom_handling();
|
||||
@ -3114,7 +3121,8 @@ impl Encoding {
|
||||
checked_add(
|
||||
valid_up_to,
|
||||
decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
|
||||
).unwrap(),
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
unsafe {
|
||||
let vec = string.as_mut_vec();
|
||||
@ -3201,8 +3209,9 @@ impl Encoding {
|
||||
(checked_add(
|
||||
valid_up_to,
|
||||
encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
|
||||
)).unwrap()
|
||||
.next_power_of_two(),
|
||||
))
|
||||
.unwrap()
|
||||
.next_power_of_two(),
|
||||
);
|
||||
unsafe {
|
||||
vec.set_len(valid_up_to);
|
||||
@ -3394,7 +3403,7 @@ impl<'de> Deserialize<'de> for &'static Encoding {
|
||||
}
|
||||
|
||||
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
|
||||
#[derive(PartialEq, Debug)]
|
||||
#[derive(PartialEq, Debug, Copy, Clone)]
|
||||
enum DecoderLifeCycle {
|
||||
/// The decoder has seen no input yet.
|
||||
AtStart,
|
||||
@ -3423,6 +3432,7 @@ enum DecoderLifeCycle {
|
||||
}
|
||||
|
||||
/// Communicate the BOM handling mode.
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
enum BomHandling {
|
||||
/// Don't handle the BOM
|
||||
Off,
|
||||
@ -3887,7 +3897,7 @@ impl Decoder {
|
||||
dst: &mut str,
|
||||
last: bool,
|
||||
) -> (CoderResult, usize, usize, bool) {
|
||||
let bytes: &mut [u8] = unsafe { std::mem::transmute(dst) };
|
||||
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
|
||||
let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
|
||||
let len = bytes.len();
|
||||
let mut trail = written;
|
||||
@ -3977,7 +3987,7 @@ impl Decoder {
|
||||
dst: &mut str,
|
||||
last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let bytes: &mut [u8] = unsafe { std::mem::transmute(dst) };
|
||||
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
|
||||
let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
|
||||
let len = bytes.len();
|
||||
let mut trail = written;
|
||||
@ -4217,7 +4227,7 @@ pub enum EncoderResult {
|
||||
|
||||
impl EncoderResult {
|
||||
fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
|
||||
EncoderResult::Unmappable(::std::char::from_u32(bmp as u32).unwrap())
|
||||
EncoderResult::Unmappable(::std::char::from_u32(u32::from(bmp)).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
@ -4688,13 +4698,13 @@ fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
|
||||
// len is the number of decimal digits needed to represent unmappable plus
|
||||
// 3 (the length of "&#" and ";").
|
||||
let mut number = unmappable as u32;
|
||||
let len = if number >= 1000000u32 {
|
||||
let len = if number >= 1_000_000u32 {
|
||||
10usize
|
||||
} else if number >= 100000u32 {
|
||||
} else if number >= 100_000u32 {
|
||||
9usize
|
||||
} else if number >= 10000u32 {
|
||||
} else if number >= 10_000u32 {
|
||||
8usize
|
||||
} else if number >= 1000u32 {
|
||||
} else if number >= 1_000u32 {
|
||||
7usize
|
||||
} else if number >= 100u32 {
|
||||
6usize
|
||||
@ -5635,4 +5645,48 @@ mod tests {
|
||||
assert_eq!(debincoded, demo);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_single_byte() {
|
||||
assert!(!BIG5.is_single_byte());
|
||||
assert!(!EUC_JP.is_single_byte());
|
||||
assert!(!EUC_KR.is_single_byte());
|
||||
assert!(!GB18030.is_single_byte());
|
||||
assert!(!GBK.is_single_byte());
|
||||
assert!(!REPLACEMENT.is_single_byte());
|
||||
assert!(!SHIFT_JIS.is_single_byte());
|
||||
assert!(!UTF_8.is_single_byte());
|
||||
assert!(!UTF_16BE.is_single_byte());
|
||||
assert!(!UTF_16LE.is_single_byte());
|
||||
assert!(!ISO_2022_JP.is_single_byte());
|
||||
|
||||
assert!(IBM866.is_single_byte());
|
||||
assert!(ISO_8859_2.is_single_byte());
|
||||
assert!(ISO_8859_3.is_single_byte());
|
||||
assert!(ISO_8859_4.is_single_byte());
|
||||
assert!(ISO_8859_5.is_single_byte());
|
||||
assert!(ISO_8859_6.is_single_byte());
|
||||
assert!(ISO_8859_7.is_single_byte());
|
||||
assert!(ISO_8859_8.is_single_byte());
|
||||
assert!(ISO_8859_10.is_single_byte());
|
||||
assert!(ISO_8859_13.is_single_byte());
|
||||
assert!(ISO_8859_14.is_single_byte());
|
||||
assert!(ISO_8859_15.is_single_byte());
|
||||
assert!(ISO_8859_16.is_single_byte());
|
||||
assert!(ISO_8859_8_I.is_single_byte());
|
||||
assert!(KOI8_R.is_single_byte());
|
||||
assert!(KOI8_U.is_single_byte());
|
||||
assert!(MACINTOSH.is_single_byte());
|
||||
assert!(WINDOWS_874.is_single_byte());
|
||||
assert!(WINDOWS_1250.is_single_byte());
|
||||
assert!(WINDOWS_1251.is_single_byte());
|
||||
assert!(WINDOWS_1252.is_single_byte());
|
||||
assert!(WINDOWS_1253.is_single_byte());
|
||||
assert!(WINDOWS_1254.is_single_byte());
|
||||
assert!(WINDOWS_1255.is_single_byte());
|
||||
assert!(WINDOWS_1256.is_single_byte());
|
||||
assert!(WINDOWS_1257.is_single_byte());
|
||||
assert!(WINDOWS_1258.is_single_byte());
|
||||
assert!(X_MAC_CYRILLIC.is_single_byte());
|
||||
assert!(X_USER_DEFINED.is_single_byte());
|
||||
}
|
||||
}
|
||||
|
2
third_party/rust/encoding_rs/src/macros.rs
vendored
@ -361,6 +361,7 @@ macro_rules! gb18030_decoder_function {
|
||||
$name:ident,
|
||||
$code_unit:ty,
|
||||
$dest_struct:ident) => (
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
|
||||
pub fn $name(&mut $slf,
|
||||
src: &[u8],
|
||||
dst: &mut [$code_unit],
|
||||
@ -685,6 +686,7 @@ macro_rules! euc_jp_decoder_function {
|
||||
$name:ident,
|
||||
$code_unit:ty,
|
||||
$dest_struct:ident) => (
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
|
||||
pub fn $name(&mut $slf,
|
||||
src: &[u8],
|
||||
dst: &mut [$code_unit],
|
||||
|
465
third_party/rust/encoding_rs/src/mem.rs
vendored
@ -21,6 +21,8 @@
|
||||
//! in-memory encoding is sometimes used as a storage optimization of text
|
||||
//! when UTF-16 indexing and length semantics are exposed.
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
use super::in_inclusive_range16;
|
||||
use super::in_inclusive_range32;
|
||||
use super::in_inclusive_range8;
|
||||
@ -65,11 +67,12 @@ pub enum Latin1Bidi {
|
||||
|
||||
// `as` truncates, so works on 32-bit, too.
|
||||
#[allow(dead_code)]
|
||||
const LATIN1_MASK: usize = 0xFF00FF00_FF00FF00u64 as usize;
|
||||
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
|
||||
|
||||
#[allow(unused_macros)]
|
||||
macro_rules! by_unit_check_alu {
|
||||
($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
|
||||
#[inline(always)]
|
||||
fn $name(buffer: &[$unit]) -> bool {
|
||||
let mut offset = 0usize;
|
||||
@ -84,7 +87,8 @@ macro_rules! by_unit_check_alu {
|
||||
}
|
||||
let src = buffer.as_ptr();
|
||||
let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
|
||||
& ALU_ALIGNMENT_MASK) / unit_size;
|
||||
& ALU_ALIGNMENT_MASK)
|
||||
/ unit_size;
|
||||
if until_alignment + ALU_ALIGNMENT / unit_size <= len {
|
||||
if until_alignment != 0 {
|
||||
accu |= buffer[offset] as usize;
|
||||
@ -103,18 +107,18 @@ macro_rules! by_unit_check_alu {
|
||||
if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
|
||||
let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
|
||||
loop {
|
||||
let unroll_accu = unsafe {
|
||||
*(src.offset(offset as isize) as *const usize)
|
||||
} | unsafe {
|
||||
*(src.offset((offset + (ALU_ALIGNMENT / unit_size)) as isize)
|
||||
as *const usize)
|
||||
} | unsafe {
|
||||
*(src.offset((offset + (2 * (ALU_ALIGNMENT / unit_size))) as isize)
|
||||
as *const usize)
|
||||
} | unsafe {
|
||||
*(src.offset((offset + (3 * (ALU_ALIGNMENT / unit_size))) as isize)
|
||||
as *const usize)
|
||||
};
|
||||
let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
|
||||
| unsafe {
|
||||
*(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
|
||||
}
|
||||
| unsafe {
|
||||
*(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
|
||||
as *const usize)
|
||||
}
|
||||
| unsafe {
|
||||
*(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
|
||||
as *const usize)
|
||||
};
|
||||
if unroll_accu & $mask != 0 {
|
||||
return false;
|
||||
}
|
||||
@ -125,7 +129,7 @@ macro_rules! by_unit_check_alu {
|
||||
}
|
||||
}
|
||||
while offset <= len_minus_stride {
|
||||
accu |= unsafe { *(src.offset(offset as isize) as *const usize) };
|
||||
accu |= unsafe { *(src.add(offset) as *const usize) };
|
||||
offset += ALU_ALIGNMENT / unit_size;
|
||||
}
|
||||
}
|
||||
@ -154,8 +158,10 @@ macro_rules! by_unit_check_simd {
|
||||
return false;
|
||||
}
|
||||
let src = buffer.as_ptr();
|
||||
let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK))
|
||||
& SIMD_ALIGNMENT_MASK) / unit_size;
|
||||
let mut until_alignment = ((SIMD_ALIGNMENT
|
||||
- ((src as usize) & SIMD_ALIGNMENT_MASK))
|
||||
& SIMD_ALIGNMENT_MASK)
|
||||
/ unit_size;
|
||||
if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
|
||||
if until_alignment != 0 {
|
||||
accu |= buffer[offset] as usize;
|
||||
@ -174,20 +180,19 @@ macro_rules! by_unit_check_simd {
|
||||
if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
|
||||
let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
|
||||
loop {
|
||||
let unroll_accu = unsafe {
|
||||
*(src.offset(offset as isize) as *const $simd_ty)
|
||||
} | unsafe {
|
||||
*(src.offset((offset + (SIMD_STRIDE_SIZE / unit_size)) as isize)
|
||||
as *const $simd_ty)
|
||||
} | unsafe {
|
||||
*(src.offset(
|
||||
(offset + (2 * (SIMD_STRIDE_SIZE / unit_size))) as isize,
|
||||
) as *const $simd_ty)
|
||||
} | unsafe {
|
||||
*(src.offset(
|
||||
(offset + (3 * (SIMD_STRIDE_SIZE / unit_size))) as isize,
|
||||
) as *const $simd_ty)
|
||||
};
|
||||
let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
|
||||
| unsafe {
|
||||
*(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
|
||||
as *const $simd_ty)
|
||||
}
|
||||
| unsafe {
|
||||
*(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
|
||||
as *const $simd_ty)
|
||||
}
|
||||
| unsafe {
|
||||
*(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
|
||||
as *const $simd_ty)
|
||||
};
|
||||
if !$func(unroll_accu) {
|
||||
return false;
|
||||
}
|
||||
@ -199,8 +204,7 @@ macro_rules! by_unit_check_simd {
|
||||
}
|
||||
let mut simd_accu = $splat;
|
||||
while offset <= len_minus_stride {
|
||||
simd_accu = simd_accu
|
||||
| unsafe { *(src.offset(offset as isize) as *const $simd_ty) };
|
||||
simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
|
||||
offset += SIMD_STRIDE_SIZE / unit_size;
|
||||
}
|
||||
if !$func(simd_accu) {
|
||||
@ -241,7 +245,7 @@ cfg_if!{
|
||||
let len = buffer.len();
|
||||
let mut offset = 0usize;
|
||||
'outer: loop {
|
||||
let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.offset(offset as isize) } as usize) & SIMD_ALIGNMENT_MASK)) &
|
||||
let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
|
||||
SIMD_ALIGNMENT_MASK) / unit_size;
|
||||
if until_alignment == 0 {
|
||||
if offset + SIMD_STRIDE_SIZE / unit_size > len {
|
||||
@ -266,7 +270,7 @@ cfg_if!{
|
||||
let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
|
||||
'inner: loop {
|
||||
let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
|
||||
if contains_surrogates(unsafe { *(src.offset(offset as isize) as *const u16x8) }) {
|
||||
if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
|
||||
if offset_plus_stride == len {
|
||||
break 'outer;
|
||||
}
|
||||
@ -304,6 +308,7 @@ cfg_if!{
|
||||
|
||||
/// The second return value is true iff the last code unit of the slice was
|
||||
/// reached and turned out to be a low surrogate that is part of a valid pair.
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
|
||||
#[inline(always)]
|
||||
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
|
||||
let len = buffer.len();
|
||||
@ -368,7 +373,7 @@ cfg_if!{
|
||||
}
|
||||
let len_minus_stride = len - SIMD_STRIDE_SIZE;
|
||||
loop {
|
||||
if !simd_is_str_latin1(unsafe { *(src.offset(offset as isize) as *const u8x16) }) {
|
||||
if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
|
||||
// TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
|
||||
while bytes[offset] & 0xC0 == 0x80 {
|
||||
offset += 1;
|
||||
@ -456,7 +461,7 @@ cfg_if!{
|
||||
}
|
||||
let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
|
||||
loop {
|
||||
if is_u16x8_bidi(unsafe { *(src.offset(offset as isize) as *const u16x8) }) {
|
||||
if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
|
||||
return true;
|
||||
}
|
||||
offset += SIMD_STRIDE_SIZE / 2;
|
||||
@ -511,7 +516,7 @@ cfg_if!{
|
||||
}
|
||||
let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
|
||||
loop {
|
||||
let mut s = unsafe { *(src.offset(offset as isize) as *const u16x8) };
|
||||
let mut s = unsafe { *(src.add(offset) as *const u16x8) };
|
||||
if !simd_is_latin1(s) {
|
||||
loop {
|
||||
if is_u16x8_bidi(s) {
|
||||
@ -526,7 +531,7 @@ cfg_if!{
|
||||
}
|
||||
return Latin1Bidi::LeftToRight;
|
||||
}
|
||||
s = unsafe { *(src.offset(offset as isize) as *const u16x8) };
|
||||
s = unsafe { *(src.add(offset) as *const u16x8) };
|
||||
}
|
||||
}
|
||||
offset += SIMD_STRIDE_SIZE / 2;
|
||||
@ -558,6 +563,7 @@ cfg_if!{
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
|
||||
#[inline(always)]
|
||||
fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
|
||||
let mut offset = 0usize;
|
||||
@ -579,7 +585,7 @@ cfg_if!{
|
||||
}
|
||||
let len_minus_stride = len - ALU_ALIGNMENT / 2;
|
||||
loop {
|
||||
if unsafe { *(src.offset(offset as isize) as *const usize) } & LATIN1_MASK != 0 {
|
||||
if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
|
||||
if is_utf16_bidi_impl(&buffer[offset..]) {
|
||||
return Latin1Bidi::Bidi;
|
||||
}
|
||||
@ -681,6 +687,10 @@ pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
|
||||
/// Returns `true` if the input is invalid UTF-8 or the input contains an
|
||||
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
|
||||
/// no RTL characters.
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(collapsible_if, cyclomatic_complexity)
|
||||
)]
|
||||
#[inline]
|
||||
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
// As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
|
||||
@ -721,33 +731,33 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
// U+1E800: F0 9E A0 80
|
||||
// U+1EFFF: F0 9E BF BF
|
||||
// U+1F000: F0 9F 80 80
|
||||
let mut bytes = buffer;
|
||||
let mut src = buffer;
|
||||
'outer: loop {
|
||||
if let Some((mut byte, mut read)) = validate_ascii(bytes) {
|
||||
if let Some((mut byte, mut read)) = validate_ascii(src) {
|
||||
// Check for the longest sequence to avoid checking twice for the
|
||||
// multi-byte sequences.
|
||||
if read + 4 <= bytes.len() {
|
||||
if read + 4 <= src.len() {
|
||||
'inner: loop {
|
||||
// At this point, `byte` is not included in `read`.
|
||||
match byte {
|
||||
0...0x7F => {
|
||||
// ASCII: go back to SIMD.
|
||||
read += 1;
|
||||
bytes = &bytes[read..];
|
||||
src = &src[read..];
|
||||
continue 'outer;
|
||||
}
|
||||
0xC2...0xD5 => {
|
||||
// Two-byte
|
||||
let second = bytes[read + 1];
|
||||
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
return true;
|
||||
}
|
||||
read += 2;
|
||||
}
|
||||
0xD6 => {
|
||||
// Two-byte
|
||||
let second = bytes[read + 1];
|
||||
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
return true;
|
||||
}
|
||||
// XXX consider folding the above and below checks
|
||||
@ -759,11 +769,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
// two-byte starting with 0xD7 and above is bidi
|
||||
0xE1 | 0xE3...0xEC | 0xEE => {
|
||||
// Three-byte normal
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
|
||||
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
|
||||
}) | (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -771,11 +782,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
}
|
||||
0xE2 => {
|
||||
// Three-byte normal, potentially bidi
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
|
||||
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
|
||||
}) | (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -792,11 +804,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
}
|
||||
0xEF => {
|
||||
// Three-byte normal, potentially bidi
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
|
||||
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
|
||||
}) | (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -825,12 +838,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
}
|
||||
0xE0 => {
|
||||
// Three-byte special lower bound, potentially bidi
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
|
||||
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
|
||||
}) | (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -842,26 +855,30 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
}
|
||||
0xED => {
|
||||
// Three-byte special upper bound
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
|
||||
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
|
||||
}) | (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
read += 3;
|
||||
}
|
||||
0xF1...0xF3 => {
|
||||
0xF1...0xF4 => {
|
||||
// Four-byte normal
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
let fourth = bytes[read + 3];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
|
||||
if (u16::from(
|
||||
UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe {
|
||||
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
|
||||
},
|
||||
) | u16::from(third >> 6)
|
||||
| (u16::from(fourth & 0xC0) << 2))
|
||||
!= 0x202
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -869,53 +886,41 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
}
|
||||
0xF0 => {
|
||||
// Four-byte special lower bound, potentially bidi
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
let fourth = bytes[read + 3];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
|
||||
if (u16::from(
|
||||
UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe {
|
||||
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
|
||||
},
|
||||
) | u16::from(third >> 6)
|
||||
| (u16::from(fourth & 0xC0) << 2))
|
||||
!= 0x202
|
||||
{
|
||||
return true;
|
||||
}
|
||||
if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
|
||||
let third = bytes[read + 2];
|
||||
let third = src[read + 2];
|
||||
if third >= 0xA0 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
read += 4;
|
||||
}
|
||||
0xF4 => {
|
||||
// Four-byte special upper bound
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
let fourth = bytes[read + 3];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
return true;
|
||||
}
|
||||
read += 4;
|
||||
}
|
||||
_ => {
|
||||
// Invalid lead or bidi-only lead
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if read + 4 > bytes.len() {
|
||||
if read == bytes.len() {
|
||||
if read + 4 > src.len() {
|
||||
if read == src.len() {
|
||||
return false;
|
||||
}
|
||||
byte = bytes[read];
|
||||
byte = src[read];
|
||||
break 'inner;
|
||||
}
|
||||
byte = bytes[read];
|
||||
byte = src[read];
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
@ -927,33 +932,33 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
0...0x7F => {
|
||||
// ASCII: go back to SIMD.
|
||||
read += 1;
|
||||
bytes = &bytes[read..];
|
||||
src = &src[read..];
|
||||
continue 'outer;
|
||||
}
|
||||
0xC2...0xD5 => {
|
||||
// Two-byte
|
||||
let new_read = read + 2;
|
||||
if new_read > bytes.len() {
|
||||
if new_read > src.len() {
|
||||
return true;
|
||||
}
|
||||
let second = bytes[read + 1];
|
||||
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
return true;
|
||||
}
|
||||
read = new_read;
|
||||
// We need to deal with the case where we came here with 3 bytes
|
||||
// left, so we need to take a look at the last one.
|
||||
bytes = &bytes[read..];
|
||||
src = &src[read..];
|
||||
continue 'outer;
|
||||
}
|
||||
0xD6 => {
|
||||
// Two-byte, potentially bidi
|
||||
let new_read = read + 2;
|
||||
if new_read > bytes.len() {
|
||||
if new_read > src.len() {
|
||||
return true;
|
||||
}
|
||||
let second = bytes[read + 1];
|
||||
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
return true;
|
||||
}
|
||||
// XXX consider folding the above and below checks
|
||||
@ -963,21 +968,22 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
read = new_read;
|
||||
// We need to deal with the case where we came here with 3 bytes
|
||||
// left, so we need to take a look at the last one.
|
||||
bytes = &bytes[read..];
|
||||
src = &src[read..];
|
||||
continue 'outer;
|
||||
}
|
||||
// two-byte starting with 0xD7 and above is bidi
|
||||
0xE1 | 0xE3...0xEC | 0xEE => {
|
||||
// Three-byte normal
|
||||
let new_read = read + 3;
|
||||
if new_read > bytes.len() {
|
||||
if new_read > src.len() {
|
||||
return true;
|
||||
}
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -985,14 +991,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
0xE2 => {
|
||||
// Three-byte normal, potentially bidi
|
||||
let new_read = read + 3;
|
||||
if new_read > bytes.len() {
|
||||
if new_read > src.len() {
|
||||
return true;
|
||||
}
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -1009,14 +1016,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
0xEF => {
|
||||
// Three-byte normal, potentially bidi
|
||||
let new_read = read + 3;
|
||||
if new_read > bytes.len() {
|
||||
if new_read > src.len() {
|
||||
return true;
|
||||
}
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -1045,15 +1053,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
0xE0 => {
|
||||
// Three-byte special lower bound, potentially bidi
|
||||
let new_read = read + 3;
|
||||
if new_read > bytes.len() {
|
||||
if new_read > src.len() {
|
||||
return true;
|
||||
}
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -1065,15 +1073,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
0xED => {
|
||||
// Three-byte special upper bound
|
||||
let new_read = read + 3;
|
||||
if new_read > bytes.len() {
|
||||
if new_read > src.len() {
|
||||
return true;
|
||||
}
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@ -1102,6 +1110,7 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
|
||||
/// cause right-to-left behavior without the presence of right-to-left
|
||||
/// characters or right-to-left controls are not checked for. As a special
|
||||
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
|
||||
#[inline]
|
||||
pub fn is_str_bidi(buffer: &str) -> bool {
|
||||
// U+058F: D6 8F
|
||||
@ -1299,7 +1308,7 @@ pub fn is_char_bidi(c: char) -> bool {
|
||||
// https://www.unicode.org/roadmaps/smp/
|
||||
// U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
|
||||
// U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
|
||||
let code_point = c as u32;
|
||||
let code_point = u32::from(c);
|
||||
if code_point < 0x0590 {
|
||||
// Below Hebrew
|
||||
return false;
|
||||
@ -1457,8 +1466,9 @@ pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
|
||||
/// Panics if the destination buffer is shorter than stated above.
|
||||
#[inline]
|
||||
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
|
||||
// TODO: Can the + 1 be eliminated?
|
||||
assert!(dst.len() >= src.len() + 1);
|
||||
// TODO: Can the requirement for dst to be at least one unit longer
|
||||
// be eliminated?
|
||||
assert!(dst.len() > src.len());
|
||||
let mut decoder = Utf8Decoder::new_inner();
|
||||
let mut total_read = 0usize;
|
||||
let mut total_written = 0usize;
|
||||
@ -1528,13 +1538,13 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
|
||||
if byte >= 0x80 {
|
||||
// Two-byte
|
||||
let second = bytes[read + 1];
|
||||
let point = (((byte as u32) & 0x1Fu32) << 6) | (second as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
|
||||
dst[written] = point;
|
||||
read += 2;
|
||||
written += 1;
|
||||
} else {
|
||||
// ASCII: write and go back to SIMD.
|
||||
dst[written] = byte as u16;
|
||||
dst[written] = u16::from(byte);
|
||||
read += 1;
|
||||
written += 1;
|
||||
// Intuitively, we should go back to the outer loop only
|
||||
@ -1548,10 +1558,10 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
|
||||
// Three-byte
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
let point = (((byte as u32) & 0xFu32) << 12)
|
||||
| ((second as u32 & 0x3Fu32) << 6)
|
||||
| (third as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
let point = ((u16::from(byte) & 0xF) << 12)
|
||||
| ((u16::from(second) & 0x3F) << 6)
|
||||
| (u16::from(third) & 0x3F);
|
||||
dst[written] = point;
|
||||
read += 3;
|
||||
written += 1;
|
||||
} else {
|
||||
@ -1559,10 +1569,10 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
|
||||
let second = bytes[read + 1];
|
||||
let third = bytes[read + 2];
|
||||
let fourth = bytes[read + 3];
|
||||
let point = (((byte as u32) & 0x7u32) << 18)
|
||||
| ((second as u32 & 0x3Fu32) << 12)
|
||||
| ((third as u32 & 0x3Fu32) << 6)
|
||||
| (fourth as u32 & 0x3Fu32);
|
||||
let point = ((u32::from(byte) & 0x7) << 18)
|
||||
| ((u32::from(second) & 0x3F) << 12)
|
||||
| ((u32::from(third) & 0x3F) << 6)
|
||||
| (u32::from(fourth) & 0x3F);
|
||||
dst[written] = (0xD7C0 + (point >> 10)) as u16;
|
||||
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
|
||||
read += 4;
|
||||
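As an illustrative aside (standalone Rust, not part of the vendored diff), the surrogate-pair arithmetic in the four-byte branch of convert_str_to_utf16 above can be checked by hand for U+1F600:

fn main() {
    // 0xD7C0 is 0xD800 - 0x40, which folds the 0x10000 bias into the shift.
    let point: u32 = 0x1F600;
    let high = (0xD7C0 + (point >> 10)) as u16;
    let low = (0xDC00 + (point & 0x3FF)) as u16;
    assert_eq!((high, low), (0xD83D, 0xDE00)); // UTF-16 for U+1F600
}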
@ -1627,7 +1637,7 @@ pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usi
|
||||
/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
|
||||
#[inline]
|
||||
pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
|
||||
assert!(dst.len() >= src.len() * 3 + 1);
|
||||
assert!(dst.len() > src.len() * 3);
|
||||
let (read, written) = convert_utf16_to_utf8_partial(src, dst);
|
||||
debug_assert_eq!(read, src.len());
|
||||
written
|
||||
@ -1648,7 +1658,7 @@ pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
|
||||
/// replaced with the REPLACEMENT CHARACTER.
|
||||
#[inline]
|
||||
pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
|
||||
let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
|
||||
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
|
||||
let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
|
||||
let len = bytes.len();
|
||||
let mut trail = written;
|
||||
@ -1678,7 +1688,7 @@ pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize
|
||||
/// Panics if the destination buffer is shorter than stated above.
|
||||
#[inline]
|
||||
pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
|
||||
assert!(dst.len() >= src.len() * 3 + 1);
|
||||
assert!(dst.len() > src.len() * 3);
|
||||
let (read, written) = convert_utf16_to_str_partial(src, dst);
|
||||
debug_assert_eq!(read, src.len());
|
||||
written
|
||||
@ -1738,8 +1748,8 @@ pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usi
|
||||
let min_left = ::std::cmp::min(src_left, dst_left);
|
||||
if let Some((non_ascii, consumed)) = unsafe {
|
||||
ascii_to_ascii(
|
||||
src_ptr.offset(total_read as isize),
|
||||
dst_ptr.offset(total_written as isize),
|
||||
src_ptr.add(total_read),
|
||||
dst_ptr.add(total_written),
|
||||
min_left,
|
||||
)
|
||||
} {
|
||||
@ -1751,10 +1761,9 @@ pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usi
|
||||
|
||||
total_read += 1; // consume `non_ascii`
|
||||
|
||||
let code_point = non_ascii as u32;
|
||||
dst[total_written] = ((code_point >> 6) | 0xC0u32) as u8;
|
||||
dst[total_written] = (non_ascii >> 6) | 0xC0;
|
||||
total_written += 1;
|
||||
dst[total_written] = ((code_point as u32 & 0x3Fu32) | 0x80u32) as u8;
|
||||
dst[total_written] = (non_ascii & 0x3F) | 0x80;
|
||||
total_written += 1;
|
||||
continue;
|
||||
}
|
||||
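As an illustrative aside (standalone Rust, not part of the vendored diff), the simplified two-byte UTF-8 math in convert_latin1_to_utf8_partial above works out as follows for the Latin1 byte 0xE9 (U+00E9):

fn main() {
    let non_ascii: u8 = 0xE9;
    let lead = (non_ascii >> 6) | 0xC0;    // 0xC3
    let trail = (non_ascii & 0x3F) | 0x80; // 0xA9
    assert_eq!([lead, trail], [0xC3, 0xA9]); // UTF-8 for U+00E9
}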
@ -1801,7 +1810,7 @@ pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
|
||||
/// If the output isn't large enough, not all input is consumed.
|
||||
#[inline]
|
||||
pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
|
||||
let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
|
||||
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
|
||||
let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
|
||||
let len = bytes.len();
|
||||
let mut trail = written;
|
||||
@ -1880,8 +1889,8 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
|
||||
let src_left = src_len - total_read;
|
||||
if let Some((non_ascii, consumed)) = unsafe {
|
||||
ascii_to_ascii(
|
||||
src_ptr.offset(total_read as isize),
|
||||
dst_ptr.offset(total_written as isize),
|
||||
src_ptr.add(total_read),
|
||||
dst_ptr.add(total_written),
|
||||
src_left,
|
||||
)
|
||||
} {
|
||||
@ -1895,8 +1904,7 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
|
||||
let trail = src[total_read];
|
||||
total_read += 1;
|
||||
|
||||
dst[total_written] =
|
||||
(((non_ascii as u32 & 0x1Fu32) << 6) | (trail as u32 & 0x3Fu32)) as u8;
|
||||
dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
|
||||
total_written += 1;
|
||||
continue;
|
||||
}
|
||||
@ -1939,6 +1947,65 @@ pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        let s: &str = unsafe { ::std::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    let capacity = head.len() + tail.len() * 2;
    let mut vec = Vec::with_capacity(capacity);
    unsafe {
        vec.set_len(capacity);
    }
    (&mut vec[..up_to]).copy_from_slice(head);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}

/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    let capacity = bytes.len();
    let mut vec = Vec::with_capacity(capacity);
    unsafe {
        vec.set_len(capacity);
    }
    (&mut vec[..up_to]).copy_from_slice(head);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
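As an illustrative aside (standalone Rust, not part of the vendored diff; it assumes these helpers are reachable as encoding_rs::mem::{decode_latin1, encode_latin1_lossy}, as in the upstream crate), minimal usage of the two new functions might look like:

use encoding_rs::mem::{decode_latin1, encode_latin1_lossy};

fn main() {
    // Non-ASCII input triggers the single allocation; ASCII-only input borrows.
    let utf8 = decode_latin1(b"caf\xE9");
    assert_eq!(utf8, "caf\u{E9}");
    let bytes = encode_latin1_lossy("caf\u{E9}");
    assert_eq!(&bytes[..], &b"caf\xE9"[..]);
}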
|
||||
|
||||
/// Returns the index of the first unpaired surrogate or, if the input is
|
||||
/// valid UTF-16 in its entirety, the length of the input.
|
||||
#[inline]
|
||||
@ -2321,6 +2388,7 @@ mod tests {
|
||||
assert_eq!(dst, reference);
|
||||
}
|
||||
|
||||
#[cfg(all(debug_assertions, not(fuzzing)))]
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_convert_utf8_to_latin1_lossy_panics() {
|
||||
@ -3035,11 +3103,11 @@ mod tests {
|
||||
#[test]
|
||||
fn test_is_char_bidi_thoroughly() {
|
||||
for i in 0..0xD800u32 {
|
||||
let c: char = unsafe { ::std::mem::transmute(i) };
|
||||
let c: char = ::std::char::from_u32(i).unwrap();
|
||||
assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
|
||||
}
|
||||
for i in 0xE000..0x110000u32 {
|
||||
let c: char = unsafe { ::std::mem::transmute(i) };
|
||||
let c: char = ::std::char::from_u32(i).unwrap();
|
||||
assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
|
||||
}
|
||||
}
|
||||
@ -3059,14 +3127,14 @@ mod tests {
|
||||
fn test_is_str_bidi_thoroughly() {
|
||||
let mut buf = [0; 4];
|
||||
for i in 0..0xD800u32 {
|
||||
let c: char = unsafe { ::std::mem::transmute(i) };
|
||||
let c: char = ::std::char::from_u32(i).unwrap();
|
||||
assert_eq!(
|
||||
is_str_bidi(c.encode_utf8(&mut buf[..])),
|
||||
reference_is_char_bidi(c)
|
||||
);
|
||||
}
|
||||
for i in 0xE000..0x110000u32 {
|
||||
let c: char = unsafe { ::std::mem::transmute(i) };
|
||||
let c: char = ::std::char::from_u32(i).unwrap();
|
||||
assert_eq!(
|
||||
is_str_bidi(c.encode_utf8(&mut buf[..])),
|
||||
reference_is_char_bidi(c)
|
||||
@ -3078,7 +3146,7 @@ mod tests {
|
||||
fn test_is_utf8_bidi_thoroughly() {
|
||||
let mut buf = [0; 8];
|
||||
for i in 0..0xD800u32 {
|
||||
let c: char = unsafe { ::std::mem::transmute(i) };
|
||||
let c: char = ::std::char::from_u32(i).unwrap();
|
||||
let expect = reference_is_char_bidi(c);
|
||||
{
|
||||
let len = {
|
||||
@ -3096,7 +3164,7 @@ mod tests {
|
||||
assert_eq!(is_utf8_bidi(&buf[..]), expect);
|
||||
}
|
||||
for i in 0xE000..0x110000u32 {
|
||||
let c: char = unsafe { ::std::mem::transmute(i) };
|
||||
let c: char = ::std::char::from_u32(i).unwrap();
|
||||
let expect = reference_is_char_bidi(c);
|
||||
{
|
||||
let len = {
|
||||
@ -3137,4 +3205,31 @@ mod tests {
|
||||
assert!(is_utf8_bidi(b"\xD6\x80\xC2"));
|
||||
assert!(is_utf8_bidi(b"ab\xC2"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_latin1() {
|
||||
match decode_latin1(b"ab") {
|
||||
Cow::Borrowed(s) => {
|
||||
assert_eq!(s, "ab");
|
||||
}
|
||||
Cow::Owned(_) => {
|
||||
unreachable!("Should have borrowed");
|
||||
}
|
||||
}
|
||||
assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_latin1_lossy() {
|
||||
match encode_latin1_lossy("ab") {
|
||||
Cow::Borrowed(s) => {
|
||||
assert_eq!(s, b"ab");
|
||||
}
|
||||
Cow::Owned(_) => {
|
||||
unreachable!("Should have borrowed");
|
||||
}
|
||||
}
|
||||
assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -41,7 +41,7 @@ impl ReplacementDecoder {
|
||||
// https://github.com/whatwg/encoding/issues/33
|
||||
if self.emitted || src.is_empty() {
|
||||
(DecoderResult::InputEmpty, src.len(), 0)
|
||||
} else if dst.len() < 1 {
|
||||
} else if dst.is_empty() {
|
||||
// Make sure there's room for the replacement character.
|
||||
(DecoderResult::OutputFull, 0, 0)
|
||||
} else {
|
||||
|
61
third_party/rust/encoding_rs/src/shift_jis.rs
vendored
@ -68,7 +68,7 @@ impl ShiftJisDecoder {
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + non_ascii_minus_half_with_katakana_start as u16);
|
||||
handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start));
|
||||
// Not caring about optimizing subsequent non-ASCII
|
||||
continue 'outermost;
|
||||
}
|
||||
@ -89,7 +89,7 @@ impl ShiftJisDecoder {
|
||||
let trail_minus_hiragana = byte.wrapping_sub(0x9F);
|
||||
if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + trail_minus_hiragana as u16)
|
||||
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana))
|
||||
} else {
|
||||
let mut trail_minus_offset =
|
||||
byte.wrapping_sub(0x40);
|
||||
@ -111,7 +111,7 @@ impl ShiftJisDecoder {
|
||||
if lead_minus_offset == 0x02 &&
|
||||
trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16)
|
||||
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
|
||||
} else {
|
||||
let pointer = lead_minus_offset as usize *
|
||||
188usize +
|
||||
@ -167,6 +167,35 @@ impl ShiftJisDecoder {
|
||||
false);
|
||||
}
|
||||
|
||||
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_shift_jis_encode(bmp)
}

#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
        return Some((lead, trail));
    }
    let pointer = if 0x4EDD == bmp {
        // Ideograph on the symbol row!
        23
    } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
        4418 + pos
    } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
        10744 + pos
    } else {
        return None;
    };
    let lead = pointer / 188;
    let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
    let trail = pointer % 188;
    let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
    Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8))
}
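As an illustrative aside (standalone Rust, not part of the vendored diff), the pointer-to-byte-pair arithmetic in encode_kanji above can be checked for the special-cased pointer 23 (U+4EDD, the ideograph on the symbol row):

fn main() {
    let pointer: usize = 23;
    let lead = pointer / 188;  // 0
    let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
    let trail = pointer % 188; // 23
    let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
    assert_eq!(
        ((lead + lead_offset) as u8, (trail + trail_offset) as u8),
        (0x81, 0x57) // Shift_JIS byte pair for U+4EDD
    );
}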
|
||||
|
||||
pub struct ShiftJisEncoder;
|
||||
|
||||
impl ShiftJisEncoder {
|
||||
@ -195,28 +224,14 @@ impl ShiftJisEncoder {
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
|
||||
if let Some((lead, trail)) = encode_kanji(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
let pointer = if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
23
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
4418 + pos
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
10744 + pos
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
|
||||
handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
|
12
third_party/rust/encoding_rs/src/simd_funcs.rs
vendored
@ -286,7 +286,7 @@ pub fn is_u16x8_bidi(s: u16x8) -> bool {
|
||||
| s.eq(u16x8::splat(0x202B))
|
||||
| s.eq(u16x8::splat(0x202E))
|
||||
| s.eq(u16x8::splat(0x2067)))
|
||||
.any()
|
||||
.any()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@ -360,7 +360,7 @@ mod tests {
|
||||
let ptr = vec.as_mut_ptr();
|
||||
unsafe {
|
||||
store8_unaligned(ptr, first);
|
||||
store8_unaligned(ptr.offset(8), second);
|
||||
store8_unaligned(ptr.add(8), second);
|
||||
}
|
||||
assert_eq!(&vec[..], &basic_latin[..]);
|
||||
}
|
||||
@ -376,7 +376,7 @@ mod tests {
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(basic_latin.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(basic_latin.as_ptr().offset(8)) };
|
||||
let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) };
|
||||
let mut vec = Vec::with_capacity(16);
|
||||
vec.resize(16, 0u8);
|
||||
let ptr = vec.as_mut_ptr();
|
||||
@ -394,7 +394,7 @@ mod tests {
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
|
||||
assert!(!simd_is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
@ -405,7 +405,7 @@ mod tests {
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
|
||||
assert!(!simd_is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
@ -416,7 +416,7 @@ mod tests {
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
|
||||
assert!(!simd_is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
|
236
third_party/rust/encoding_rs/src/single_byte.rs
vendored
@ -9,6 +9,7 @@
|
||||
|
||||
use super::*;
|
||||
use ascii::*;
|
||||
use data::position;
|
||||
use handles::*;
|
||||
use variant::*;
|
||||
|
||||
@ -154,8 +155,8 @@ impl SingleByteDecoder {
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
ascii_to_basic_latin(
|
||||
src.as_ptr().offset(converted as isize),
|
||||
dst.as_mut_ptr().offset(converted as isize),
|
||||
src.as_ptr().add(converted),
|
||||
dst.as_mut_ptr().add(converted),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
@ -207,7 +208,7 @@ impl SingleByteDecoder {
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = b as u16;
|
||||
*(dst.get_unchecked_mut(converted)) = u16::from(b);
|
||||
}
|
||||
converted += 1;
|
||||
if b < 60 {
|
||||
@ -230,13 +231,27 @@ impl SingleByteDecoder {
|
||||
|
||||
pub struct SingleByteEncoder {
|
||||
table: &'static [u16; 128],
|
||||
run_bmp_offset: usize,
|
||||
run_byte_offset: usize,
|
||||
run_length: usize,
|
||||
}
|
||||
|
||||
impl SingleByteEncoder {
|
||||
pub fn new(encoding: &'static Encoding, data: &'static [u16; 128]) -> Encoder {
|
||||
pub fn new(
|
||||
encoding: &'static Encoding,
|
||||
data: &'static [u16; 128],
|
||||
run_bmp_offset: u16,
|
||||
run_byte_offset: u8,
|
||||
run_length: u8,
|
||||
) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::SingleByte(SingleByteEncoder { table: data }),
|
||||
VariantEncoder::SingleByte(SingleByteEncoder {
|
||||
table: data,
|
||||
run_bmp_offset: run_bmp_offset as usize,
|
||||
run_byte_offset: run_byte_offset as usize,
|
||||
run_length: run_length as usize,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
@ -254,54 +269,64 @@ impl SingleByteEncoder {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn encode_u16(&self, code_unit: u16) -> Option<u8> {
|
||||
// We search the quadrants in reverse order, but we search forward
|
||||
// within each quadrant. For Windows and ISO encodings, this is
|
||||
// generally faster than just searching the whole table backwards.
|
||||
// (Exceptions: English, German, Czech.) This order is also OK for
|
||||
// KOI encodings. For IBM and Mac encodings, this order is bad,
|
||||
// but we don't really need to optimize for those encodings anyway.
|
||||
// First, we see if the code unit falls into a run of consecutive
|
||||
// code units that can be mapped by offset. This is very efficient
|
||||
// for most non-Latin encodings as well as Latin1-ish encodings.
|
||||
//
|
||||
// For encodings that don't fit this pattern, the run (which may
|
||||
// have the length of just one) just establishes the starting point
|
||||
// for the next rule.
|
||||
//
|
||||
// Next, we do a forward linear search in the part of the index
|
||||
// after the run. Even in non-Latin1-ish Latin encodings (except
|
||||
// macintosh), the lower case letters are here.
|
||||
//
|
||||
// Next, we search the third quadrant up to the start of the run
|
||||
// (upper case letters in Latin encodings except macintosh, in
|
||||
// Greek and in KOI encodings) and then the second quadrant,
|
||||
// except if the run started before the third quadrant, we search
|
||||
// the second quadrant up to the run.
|
||||
//
|
||||
// Last, we search the first quadrant, which has unused controls
|
||||
// or punctuation in most encodings. This is bad for macintosh
|
||||
// and IBM866, but those are rare.
|
||||
|
||||
// In Windows and ISO encodings, the fourth quadrant holds most of the
|
||||
// lower-case letters for bicameral scripts as well as the Hebrew
|
||||
// letters. There are some Thai letters and combining marks as well as
|
||||
// Thai numerals here. (In KOI8-R, the upper-case letters are here.)
|
||||
for i in 96..128 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
}
|
||||
// Run of consecutive units
|
||||
let unit_as_usize = code_unit as usize;
|
||||
let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
|
||||
if offset < self.run_length {
|
||||
return Some((128 + self.run_byte_offset + offset) as u8);
|
||||
}
|
||||
|
||||
// In Windows and ISO encodings, the third quadrant holds most of the
|
||||
// upper-case letters for bicameral scripts as well as most of the
|
||||
// Arabic letters. Searching this quadrant first would be better for
|
||||
// Arabic. There are a number of Thai letters and combining marks here.
|
||||
// (In KOI8-R, the lower-case letters are here.)
|
||||
for i in 64..96 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
}
|
||||
// Search after the run
|
||||
let tail_start = self.run_byte_offset + self.run_length;
|
||||
if let Some(pos) = position(&self.table[tail_start..], code_unit) {
|
||||
return Some((128 + tail_start + pos) as u8);
|
||||
}
|
||||
|
||||
// In Windows and ISO encodings, the second quadrant holds most of the
|
||||
// Thai letters. In other scripts, there tends to be symbols here.
|
||||
// Even though the two quadrants above are relevant for Thai, for Thai
|
||||
// it would likely be optimal to search this quadrant first. :-(
|
||||
for i in 32..64 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
if self.run_byte_offset >= 64 {
|
||||
// Search third quadrant before the run
|
||||
if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
|
||||
return Some(((128 + 64) + pos) as u8);
|
||||
}
|
||||
|
||||
// Search second quadrant
|
||||
if let Some(pos) = position(&self.table[32..64], code_unit) {
|
||||
return Some(((128 + 32) + pos) as u8);
|
||||
}
|
||||
} else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
|
||||
// windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
|
||||
// Search second quadrant before the run
|
||||
return Some(((128 + 32) + pos) as u8);
|
||||
}
|
||||
|
||||
// The first quadrant is useless in ISO encodings. In Windows encodings,
|
||||
// there is useful punctuation here that might warrant searching
|
||||
// before the symbols in the second quadrant, but the second quadrant
|
||||
// is searched before this one for the benefit of Thai.
|
||||
for i in 0..32 {
|
||||
if self.table[i] == code_unit {
|
||||
return Some((i + 128) as u8);
|
||||
}
|
||||
// Search first quadrant
|
||||
if let Some(pos) = position(&self.table[..32], code_unit) {
|
||||
return Some((128 + pos) as u8);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
@@ -345,8 +370,8 @@ impl SingleByteEncoder {
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
basic_latin_to_ascii(
|
||||
src.as_ptr().offset(converted as isize),
|
||||
dst.as_mut_ptr().offset(converted as isize),
|
||||
src.as_ptr().add(converted),
|
||||
dst.as_mut_ptr().add(converted),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
@@ -379,7 +404,7 @@ impl SingleByteEncoder {
|
||||
);
|
||||
}
|
||||
let second =
|
||||
unsafe { *src.get_unchecked(converted + 1) } as u32;
|
||||
u32::from(unsafe { *src.get_unchecked(converted + 1) });
|
||||
if second & 0xFC00u32 != 0xDC00u32 {
|
||||
return (
|
||||
EncoderResult::Unmappable('\u{FFFD}'),
|
||||
@@ -389,9 +414,9 @@ impl SingleByteEncoder {
|
||||
}
|
||||
// The next code unit is a low surrogate.
|
||||
let astral: char = unsafe {
|
||||
::std::mem::transmute(
|
||||
((non_ascii as u32) << 10) + second
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
|
||||
::std::char::from_u32_unchecked(
|
||||
(u32::from(non_ascii) << 10) + second
|
||||
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
|
||||
)
|
||||
};
|
||||
return (
|
||||
@@ -408,10 +433,8 @@ impl SingleByteEncoder {
|
||||
converted,
|
||||
);
|
||||
}
|
||||
let thirty_two = non_ascii as u32;
|
||||
let bmp: char = unsafe { ::std::mem::transmute(thirty_two) };
|
||||
return (
|
||||
EncoderResult::Unmappable(bmp),
|
||||
EncoderResult::unmappable_from_bmp(non_ascii),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
@@ -464,7 +487,6 @@ impl SingleByteEncoder {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::super::data::*;
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
@@ -603,64 +625,64 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_single_byte_decode() {
|
||||
decode_single_byte(IBM866, IBM866_DATA);
|
||||
decode_single_byte(ISO_8859_10, ISO_8859_10_DATA);
|
||||
decode_single_byte(ISO_8859_13, ISO_8859_13_DATA);
|
||||
decode_single_byte(ISO_8859_14, ISO_8859_14_DATA);
|
||||
decode_single_byte(ISO_8859_15, ISO_8859_15_DATA);
|
||||
decode_single_byte(ISO_8859_16, ISO_8859_16_DATA);
|
||||
decode_single_byte(ISO_8859_2, ISO_8859_2_DATA);
|
||||
decode_single_byte(ISO_8859_3, ISO_8859_3_DATA);
|
||||
decode_single_byte(ISO_8859_4, ISO_8859_4_DATA);
|
||||
decode_single_byte(ISO_8859_5, ISO_8859_5_DATA);
|
||||
decode_single_byte(ISO_8859_6, ISO_8859_6_DATA);
|
||||
decode_single_byte(ISO_8859_7, ISO_8859_7_DATA);
|
||||
decode_single_byte(ISO_8859_8, ISO_8859_8_DATA);
|
||||
decode_single_byte(KOI8_R, KOI8_R_DATA);
|
||||
decode_single_byte(KOI8_U, KOI8_U_DATA);
|
||||
decode_single_byte(MACINTOSH, MACINTOSH_DATA);
|
||||
decode_single_byte(WINDOWS_1250, WINDOWS_1250_DATA);
|
||||
decode_single_byte(WINDOWS_1251, WINDOWS_1251_DATA);
|
||||
decode_single_byte(WINDOWS_1252, WINDOWS_1252_DATA);
|
||||
decode_single_byte(WINDOWS_1253, WINDOWS_1253_DATA);
|
||||
decode_single_byte(WINDOWS_1254, WINDOWS_1254_DATA);
|
||||
decode_single_byte(WINDOWS_1255, WINDOWS_1255_DATA);
|
||||
decode_single_byte(WINDOWS_1256, WINDOWS_1256_DATA);
|
||||
decode_single_byte(WINDOWS_1257, WINDOWS_1257_DATA);
|
||||
decode_single_byte(WINDOWS_1258, WINDOWS_1258_DATA);
|
||||
decode_single_byte(WINDOWS_874, WINDOWS_874_DATA);
|
||||
decode_single_byte(X_MAC_CYRILLIC, X_MAC_CYRILLIC_DATA);
|
||||
decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
|
||||
decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
|
||||
decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
|
||||
decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
|
||||
decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
|
||||
decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
|
||||
decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
|
||||
decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
|
||||
decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
|
||||
decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
|
||||
decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
|
||||
decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
|
||||
decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
|
||||
decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
|
||||
decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
|
||||
decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
|
||||
decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
|
||||
decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
|
||||
decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
|
||||
decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
|
||||
decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
|
||||
decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
|
||||
decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
|
||||
decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
|
||||
decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
|
||||
decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
|
||||
decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_single_byte_encode() {
|
||||
encode_single_byte(IBM866, IBM866_DATA);
|
||||
encode_single_byte(ISO_8859_10, ISO_8859_10_DATA);
|
||||
encode_single_byte(ISO_8859_13, ISO_8859_13_DATA);
|
||||
encode_single_byte(ISO_8859_14, ISO_8859_14_DATA);
|
||||
encode_single_byte(ISO_8859_15, ISO_8859_15_DATA);
|
||||
encode_single_byte(ISO_8859_16, ISO_8859_16_DATA);
|
||||
encode_single_byte(ISO_8859_2, ISO_8859_2_DATA);
|
||||
encode_single_byte(ISO_8859_3, ISO_8859_3_DATA);
|
||||
encode_single_byte(ISO_8859_4, ISO_8859_4_DATA);
|
||||
encode_single_byte(ISO_8859_5, ISO_8859_5_DATA);
|
||||
encode_single_byte(ISO_8859_6, ISO_8859_6_DATA);
|
||||
encode_single_byte(ISO_8859_7, ISO_8859_7_DATA);
|
||||
encode_single_byte(ISO_8859_8, ISO_8859_8_DATA);
|
||||
encode_single_byte(KOI8_R, KOI8_R_DATA);
|
||||
encode_single_byte(KOI8_U, KOI8_U_DATA);
|
||||
encode_single_byte(MACINTOSH, MACINTOSH_DATA);
|
||||
encode_single_byte(WINDOWS_1250, WINDOWS_1250_DATA);
|
||||
encode_single_byte(WINDOWS_1251, WINDOWS_1251_DATA);
|
||||
encode_single_byte(WINDOWS_1252, WINDOWS_1252_DATA);
|
||||
encode_single_byte(WINDOWS_1253, WINDOWS_1253_DATA);
|
||||
encode_single_byte(WINDOWS_1254, WINDOWS_1254_DATA);
|
||||
encode_single_byte(WINDOWS_1255, WINDOWS_1255_DATA);
|
||||
encode_single_byte(WINDOWS_1256, WINDOWS_1256_DATA);
|
||||
encode_single_byte(WINDOWS_1257, WINDOWS_1257_DATA);
|
||||
encode_single_byte(WINDOWS_1258, WINDOWS_1258_DATA);
|
||||
encode_single_byte(WINDOWS_874, WINDOWS_874_DATA);
|
||||
encode_single_byte(X_MAC_CYRILLIC, X_MAC_CYRILLIC_DATA);
|
||||
encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
|
||||
encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
|
||||
encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
|
||||
encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
|
||||
encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
|
||||
encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
|
||||
encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
|
||||
encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
|
||||
encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
|
||||
encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
|
||||
encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
|
||||
encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
|
||||
encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
|
||||
encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
|
||||
encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
|
||||
encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
|
||||
encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
|
||||
encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
|
||||
encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
|
||||
encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
|
||||
encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
|
||||
encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
|
||||
encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
|
||||
encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
|
||||
encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
|
||||
encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
|
||||
encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
|
||||
}
|
||||
// END GENERATED CODE
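The single_byte.rs changes above replace the quadrant-by-quadrant table search in `encode_u16` with a fast path over a run of consecutive code units, driven by the new `run_bmp_offset`, `run_byte_offset` and `run_length` parameters. A simplified sketch of that lookup shape, with a made-up toy table rather than the crate's generated data:

fn encode_u16_sketch(
    table: &[u16; 128],
    run_bmp_offset: usize,  // first scalar value covered by the consecutive run
    run_byte_offset: usize, // index in `table` where that run starts
    run_length: usize,      // number of consecutive scalar values in the run
    code_unit: u16,
) -> Option<u8> {
    // Fast path: inside the run, the byte falls out of plain arithmetic.
    let offset = (code_unit as usize).wrapping_sub(run_bmp_offset);
    if offset < run_length {
        return Some((128 + run_byte_offset + offset) as u8);
    }
    // Slow path: linear search (the real code orders this search carefully).
    table
        .iter()
        .position(|&mapped| mapped == code_unit)
        .map(|pos| (128 + pos) as u8)
}

fn main() {
    // Toy table: bytes 0xC0..=0xFF map to U+0410..=U+044F, a 64-unit run.
    let mut table = [0u16; 128];
    for (i, slot) in table.iter_mut().enumerate().skip(64) {
        *slot = 0x0410 + (i as u16 - 64);
    }
    assert_eq!(encode_u16_sketch(&table, 0x0410, 64, 64, 0x0430), Some(0xE0));
}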
|
||||
|
||||
|
7 third_party/rust/encoding_rs/src/utf_16.rs (vendored)
@@ -29,8 +29,7 @@ impl Utf16Decoder {
|
||||
}
|
||||
|
||||
pub fn additional_from_state(&self) -> usize {
|
||||
1
|
||||
+ if self.lead_byte.is_some() { 1 } else { 0 }
|
||||
1 + if self.lead_byte.is_some() { 1 } else { 0 }
|
||||
+ if self.lead_surrogate == 0 { 0 } else { 2 }
|
||||
}
|
||||
|
||||
@@ -120,9 +119,9 @@ impl Utf16Decoder {
|
||||
Some(lead) => {
|
||||
self.lead_byte = None;
|
||||
let code_unit = if self.be {
|
||||
(lead as u16) << 8 | b as u16
|
||||
u16::from(lead) << 8 | u16::from(b)
|
||||
} else {
|
||||
(b as u16) << 8 | (lead as u16)
|
||||
u16::from(b) << 8 | u16::from(lead)
|
||||
};
|
||||
let high_bits = code_unit & 0xFC00u16;
|
||||
if high_bits == 0xD800u16 {
|
||||
|
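The utf_16.rs hunk above also switches the byte-to-code-unit assembly from `as u16` casts to `u16::from`. As a small hedged illustration of what that assembly computes for the two byte orders (example bytes are arbitrary, not taken from the crate):

fn assemble_code_unit(first: u8, second: u8, big_endian: bool) -> u16 {
    // `first` is the earlier byte in the stream, `second` the later one.
    if big_endian {
        u16::from(first) << 8 | u16::from(second)
    } else {
        u16::from(second) << 8 | u16::from(first)
    }
}

fn main() {
    // U+20AC EURO SIGN is 0x20 0xAC in UTF-16BE and 0xAC 0x20 in UTF-16LE.
    assert_eq!(assemble_code_unit(0x20, 0xAC, true), 0x20AC);
    assert_eq!(assemble_code_unit(0xAC, 0x20, false), 0x20AC);
}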
705 third_party/rust/encoding_rs/src/utf_8.rs (vendored)
@@ -7,12 +7,10 @@
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
#[cfg(feature = "parallel-utf8")]
|
||||
extern crate rayon;
|
||||
|
||||
use super::*;
|
||||
use ascii::ascii_to_basic_latin;
|
||||
use ascii::basic_latin_to_ascii;
|
||||
use ascii::validate_ascii;
|
||||
use handles::*;
|
||||
use variant::*;
|
||||
|
||||
@@ -34,111 +32,211 @@ cfg_if!{
|
||||
}
|
||||
}
|
||||
|
||||
// Keep this cfg_if in sync with whether the utf_8_core module is defined in lib.rs.
|
||||
cfg_if! {
|
||||
// When running 32-bit ARM code on Raspberry Pi 3, which has a 64-bit CPU,
|
||||
// this is a pessimization for non-Latin, non-CJK scripts. However, this
|
||||
// optimization seems to work across scripts when running 32-bit ARM code
|
||||
// on a 32-bit ARM CPU (particularly good on Exynos 5) and when running
|
||||
// 64-bit ARM code on a 64-bit ARM CPU.
|
||||
if #[cfg(any(all(feature = "simd-accel", target_feature = "sse2"), all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_arch = "arm")))] {
|
||||
use utf_8_core::run_utf8_validation;
|
||||
} else {
|
||||
use ::std::str::Utf8Error;
|
||||
#[inline(always)]
|
||||
fn run_utf8_validation(v: &[u8]) -> Result<&str, Utf8Error> {
|
||||
::std::str::from_utf8(v)
|
||||
}
|
||||
}
|
||||
#[repr(align(64))] // Align to cache lines
|
||||
pub struct Utf8Data {
|
||||
pub table: [u8; 384],
|
||||
}
|
||||
|
||||
pub const UTF8_NORMAL_TRAIL: u8 = 1 << 3;
|
||||
|
||||
pub const UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 4;
|
||||
|
||||
pub const UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 5;
|
||||
|
||||
pub const UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 6;
|
||||
|
||||
pub const UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 7;
|
||||
|
||||
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
/// Bit is 1 if the trail is invalid.
|
||||
pub static UTF8_TRAIL_INVALID: [u8; 256] = [
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 80, 80, 80, 80, 80, 80,
|
||||
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144,
|
||||
144, 144, 144, 144, 144, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160,
|
||||
160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
|
||||
248, 248, 248, 248, 248, 248,
|
||||
];
|
||||
pub static UTF8_DATA: Utf8Data = Utf8Data {
|
||||
table: [
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
|
||||
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
|
||||
164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
|
||||
164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
|
||||
252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
],
|
||||
};
|
||||
|
||||
// END GENERATED CODE
|
||||
|
||||
#[cfg(feature = "parallel-utf8")]
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
|
||||
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
|
||||
let mut len = bytes.len();
|
||||
// The purpose of the outer loop is to avoid recursion when the attempt
|
||||
// to find the split point discovers an over-long sequence.
|
||||
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
|
||||
// This algorithm differs from the UTF-8 validation algorithm, but making
|
||||
// this one consistent with that one makes this slower for reasons I don't
|
||||
// understand.
|
||||
let mut read = 0;
|
||||
'outer: loop {
|
||||
// This magic number has been determined on i7-4770 with SSE2 enabled.
|
||||
// It's very likely that the number should be different when different
|
||||
// ISA is used for ASCII acceleration. The number has been chosen
|
||||
// to optimize the all-ASCII case. With mostly non-ASCII, the number
|
||||
// should be much smaller, but that would pessimize the all-ASCII case,
|
||||
// which we are trying to optimize here.
|
||||
if len < 290000 {
|
||||
return match run_utf8_validation(&bytes[..len]) {
|
||||
Ok(_) => bytes.len(),
|
||||
Err(e) => e.valid_up_to(),
|
||||
};
|
||||
}
|
||||
let mid = len >> 1;
|
||||
let mut adjusted = mid;
|
||||
let mut i = 0;
|
||||
'inner: loop {
|
||||
// No need to check for `adjusted` reaching `len` because we
|
||||
// already know that `len` is way larger than `(len / 2) + 4`.
|
||||
if i == 3 {
|
||||
// `mid` landed inside an overlong sequence.
|
||||
len = mid;
|
||||
continue 'outer;
|
||||
let mut byte = {
|
||||
let src_remaining = &src[read..];
|
||||
match validate_ascii(src_remaining) {
|
||||
None => {
|
||||
return src.len();
|
||||
}
|
||||
Some((non_ascii, consumed)) => {
|
||||
read += consumed;
|
||||
non_ascii
|
||||
}
|
||||
}
|
||||
if (bytes[adjusted] & 0xC0) != 0x80 {
|
||||
};
|
||||
// Check for the longest sequence to avoid checking twice for the
|
||||
// multi-byte sequences. This can't overflow with 64-bit address space,
|
||||
// because full 64 bits aren't in use. In the 32-bit PAE case, for this
|
||||
// to overflow would mean that the source slice would be so large that
|
||||
// the address space of the process would not have space for any code.
|
||||
// Therefore, the slice cannot be so long that this would overflow.
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
'inner: loop {
|
||||
// At this point, `byte` is not included in `read`, because we
|
||||
// don't yet know that a) the UTF-8 sequence is valid and b) that there
|
||||
// is output space if it is an astral sequence.
|
||||
// We know, thanks to `ascii_to_basic_latin` that there is output
|
||||
// space for at least one UTF-16 code unit, so no need to check
|
||||
// for output space in the BMP cases.
|
||||
// Inspecting the lead byte directly is faster than what the
|
||||
// std lib does!
|
||||
if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
|
||||
// Two-byte
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
break 'outer;
|
||||
}
|
||||
read += 2;
|
||||
|
||||
// Next lead (manually inlined)
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
byte = unsafe { *(src.get_unchecked(read)) };
|
||||
if byte < 0x80 {
|
||||
read += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
break 'inner;
|
||||
}
|
||||
if unsafe { likely(byte < 0xF0) } {
|
||||
'three: loop {
|
||||
// Three-byte
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
read += 3;
|
||||
|
||||
// Next lead (manually inlined)
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
byte = unsafe { *(src.get_unchecked(read)) };
|
||||
if in_inclusive_range8(byte, 0xE0, 0xEF) {
|
||||
continue 'three;
|
||||
}
|
||||
if unsafe { likely(byte < 0x80) } {
|
||||
read += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
break 'inner;
|
||||
}
|
||||
}
|
||||
// Four-byte
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
|
||||
if (u16::from(
|
||||
UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
|
||||
) | u16::from(third >> 6)
|
||||
| (u16::from(fourth & 0xC0) << 2))
|
||||
!= 0x202
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
read += 4;
|
||||
|
||||
// Next lead
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
byte = unsafe { *(src.get_unchecked(read)) };
|
||||
if byte < 0x80 {
|
||||
read += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
break 'inner;
|
||||
}
|
||||
adjusted += 1;
|
||||
i += 1;
|
||||
}
|
||||
let (head, tail) = bytes[..len].split_at(adjusted);
|
||||
let (head_valid_up_to, tail_valid_up_to) =
|
||||
rayon::join(|| utf8_valid_up_to(head), || utf8_valid_up_to(tail));
|
||||
if head_valid_up_to == adjusted {
|
||||
return adjusted + tail_valid_up_to;
|
||||
// We can't have a complete 4-byte sequence, but we could still have
|
||||
// one to three shorter sequences.
|
||||
'tail: loop {
|
||||
// >= is better for bound check elision than ==
|
||||
if read >= src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
byte = src[read];
|
||||
// At this point, `byte` is not included in `read`, because we
|
||||
// don't yet know that a) the UTF-8 sequence is valid and b) that there
|
||||
// is output space if it is an astral sequence.
|
||||
// Inspecting the lead byte directly is faster than what the
|
||||
// std lib does!
|
||||
if byte < 0x80 {
|
||||
read += 1;
|
||||
continue 'tail;
|
||||
}
|
||||
if in_inclusive_range8(byte, 0xC2, 0xDF) {
|
||||
// Two-byte
|
||||
let new_read = read + 2;
|
||||
if new_read > src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
break 'outer;
|
||||
}
|
||||
read += 2;
|
||||
continue 'tail;
|
||||
}
|
||||
// We need to exclude valid four byte lead bytes, because
|
||||
// `UTF8_DATA.second_mask` covers
|
||||
if byte < 0xF0 {
|
||||
// Three-byte
|
||||
let new_read = read + 3;
|
||||
if new_read > src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
read += 3;
|
||||
// `'tail` handles sequences shorter than 4, so
|
||||
// there can't be another sequence after this one.
|
||||
break 'outer;
|
||||
}
|
||||
break 'outer;
|
||||
}
|
||||
return head_valid_up_to;
|
||||
}
|
||||
read
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "parallel-utf8"))]
|
||||
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
|
||||
match run_utf8_validation(bytes) {
|
||||
Ok(_) => bytes.len(),
|
||||
Err(e) => e.valid_up_to(),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
|
||||
#[cfg_attr(
|
||||
feature = "cargo-clippy",
|
||||
allow(never_loop, cyclomatic_complexity)
|
||||
)]
|
||||
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
|
||||
// This algorithm differs from the UTF-8 validation algorithm, but making
|
||||
// this one consistent with that one makes this slower for reasons I don't
|
||||
@@ -166,8 +264,12 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz
|
||||
}
|
||||
};
|
||||
// Check for the longest sequence to avoid checking twice for the
|
||||
// multi-byte sequences.
|
||||
if read + 4 <= src.len() {
|
||||
// multi-byte sequences. This can't overflow with 64-bit address space,
|
||||
// because full 64 bits aren't in use. In the 32-bit PAE case, for this
|
||||
// to overflow would mean that the source slice would be so large that
|
||||
// the address space of the process would not have space for any code.
|
||||
// Therefore, the slice cannot be so long that this would overflow.
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
'inner: loop {
|
||||
// At this point, `byte` is not included in `read`, because we
|
||||
// don't yet know that a) the UTF-8 sequence is valid and b) that there
|
||||
@@ -175,278 +277,183 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz
|
||||
// We know, thanks to `ascii_to_basic_latin` that there is output
|
||||
// space for at least one UTF-16 code unit, so no need to check
|
||||
// for output space in the BMP cases.
|
||||
// Matching directly on the lead byte is faster than what the
|
||||
// Inspecting the lead byte directly is faster than what the
|
||||
// std lib does!
|
||||
match byte {
|
||||
0...0x7F => {
|
||||
// ASCII: write and go back to SIMD.
|
||||
dst[written] = byte as u16;
|
||||
if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
|
||||
// Two-byte
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
break 'outer;
|
||||
}
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(written)) =
|
||||
((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
|
||||
};
|
||||
read += 2;
|
||||
written += 1;
|
||||
|
||||
// Next lead (manually inlined)
|
||||
if written == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
byte = unsafe { *(src.get_unchecked(read)) };
|
||||
if byte < 0x80 {
|
||||
unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
|
||||
read += 1;
|
||||
written += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
break 'inner;
|
||||
}
|
||||
if unsafe { likely(byte < 0xF0) } {
|
||||
'three: loop {
|
||||
// Three-byte
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = ((u16::from(byte) & 0xF) << 12)
|
||||
| ((u16::from(second) & 0x3F) << 6)
|
||||
| (u16::from(third) & 0x3F);
|
||||
unsafe { *(dst.get_unchecked_mut(written)) = point };
|
||||
read += 3;
|
||||
written += 1;
|
||||
|
||||
// Next lead (manually inlined)
|
||||
if written == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
byte = unsafe { *(src.get_unchecked(read)) };
|
||||
if in_inclusive_range8(byte, 0xE0, 0xEF) {
|
||||
continue 'three;
|
||||
}
|
||||
if unsafe { likely(byte < 0x80) } {
|
||||
unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
|
||||
read += 1;
|
||||
written += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
break 'inner;
|
||||
}
|
||||
}
|
||||
// Four-byte
|
||||
if written + 1 == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = unsafe { *(src.get_unchecked(read + 1)) };
|
||||
let third = unsafe { *(src.get_unchecked(read + 2)) };
|
||||
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
|
||||
if (u16::from(
|
||||
UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
|
||||
) | u16::from(third >> 6)
|
||||
| (u16::from(fourth & 0xC0) << 2))
|
||||
!= 0x202
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = ((u32::from(byte) & 0x7) << 18)
|
||||
| ((u32::from(second) & 0x3F) << 12)
|
||||
| ((u32::from(third) & 0x3F) << 6)
|
||||
| (u32::from(fourth) & 0x3F);
|
||||
unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
|
||||
};
|
||||
read += 4;
|
||||
written += 2;
|
||||
|
||||
// Next lead
|
||||
if written == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
if unsafe { likely(read + 4 <= src.len()) } {
|
||||
byte = unsafe { *(src.get_unchecked(read)) };
|
||||
if byte < 0x80 {
|
||||
unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
|
||||
read += 1;
|
||||
written += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
0xC2...0xDF => {
|
||||
// Two-byte
|
||||
let second = src[read + 1];
|
||||
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0x1Fu32) << 6) | (second as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read += 2;
|
||||
written += 1;
|
||||
}
|
||||
0xE1...0xEC | 0xEE...0xEF => {
|
||||
// Three-byte normal
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0xFu32) << 12)
|
||||
| ((second as u32 & 0x3Fu32) << 6)
|
||||
| (third as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read += 3;
|
||||
written += 1;
|
||||
}
|
||||
0xE0 => {
|
||||
// Three-byte special lower bound
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0xFu32) << 12)
|
||||
| ((second as u32 & 0x3Fu32) << 6)
|
||||
| (third as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read += 3;
|
||||
written += 1;
|
||||
}
|
||||
0xED => {
|
||||
// Three-byte special upper bound
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0xFu32) << 12)
|
||||
| ((second as u32 & 0x3Fu32) << 6)
|
||||
| (third as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read += 3;
|
||||
written += 1;
|
||||
}
|
||||
0xF1...0xF3 => {
|
||||
// Four-byte normal
|
||||
if written + 1 == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
let fourth = src[read + 3];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0x7u32) << 18)
|
||||
| ((second as u32 & 0x3Fu32) << 12)
|
||||
| ((third as u32 & 0x3Fu32) << 6)
|
||||
| (fourth as u32 & 0x3Fu32);
|
||||
dst[written] = (0xD7C0 + (point >> 10)) as u16;
|
||||
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
|
||||
read += 4;
|
||||
written += 2;
|
||||
}
|
||||
0xF0 => {
|
||||
// Four-byte special lower bound
|
||||
if written + 1 == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
let fourth = src[read + 3];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0x7u32) << 18)
|
||||
| ((second as u32 & 0x3Fu32) << 12)
|
||||
| ((third as u32 & 0x3Fu32) << 6)
|
||||
| (fourth as u32 & 0x3Fu32);
|
||||
dst[written] = (0xD7C0 + (point >> 10)) as u16;
|
||||
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
|
||||
read += 4;
|
||||
written += 2;
|
||||
}
|
||||
0xF4 => {
|
||||
// Four-byte special upper bound
|
||||
if written + 1 == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
let fourth = src[read + 3];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0x7u32) << 18)
|
||||
| ((second as u32 & 0x3Fu32) << 12)
|
||||
| ((third as u32 & 0x3Fu32) << 6)
|
||||
| (fourth as u32 & 0x3Fu32);
|
||||
dst[written] = (0xD7C0 + (point >> 10)) as u16;
|
||||
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
|
||||
read += 4;
|
||||
written += 2;
|
||||
}
|
||||
_ => {
|
||||
// Invalid lead
|
||||
break 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
if written == dst.len() {
|
||||
break 'outer;
|
||||
}
|
||||
if read + 4 > src.len() {
|
||||
if read == src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
byte = src[read];
|
||||
break 'inner;
|
||||
}
|
||||
byte = src[read];
|
||||
continue 'inner;
|
||||
break 'inner;
|
||||
}
|
||||
}
|
||||
// We can't have a complete 4-byte sequence, but we could still have
|
||||
// a complete shorter sequence.
|
||||
|
||||
// At this point, `byte` is not included in `read`, because we
|
||||
// don't yet know that a) the UTF-8 sequence is valid and b) that there
|
||||
// is output space if it is an astral sequence.
|
||||
// We know, thanks to `ascii_to_basic_latin` that there is output
|
||||
// space for at least one UTF-16 code unit, so no need to check
|
||||
// for output space in the BMP cases.
|
||||
// Matching directly on the lead byte is faster than what the
|
||||
// std lib does!
|
||||
match byte {
|
||||
0...0x7F => {
|
||||
// ASCII: write and go back to SIMD.
|
||||
dst[written] = byte as u16;
|
||||
// one to three shorter sequences.
|
||||
'tail: loop {
|
||||
// >= is better for bound check elision than ==
|
||||
if read >= src.len() || written >= src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
byte = src[read];
|
||||
// At this point, `byte` is not included in `read`, because we
|
||||
// don't yet know that a) the UTF-8 sequence is valid and b) that there
|
||||
// is output space if it is an astral sequence.
|
||||
// Inspecting the lead byte directly is faster than what the
|
||||
// std lib does!
|
||||
if byte < 0x80 {
|
||||
dst[written] = u16::from(byte);
|
||||
read += 1;
|
||||
written += 1;
|
||||
continue 'outer;
|
||||
continue 'tail;
|
||||
}
|
||||
0xC2...0xDF => {
|
||||
if in_inclusive_range8(byte, 0xC2, 0xDF) {
|
||||
// Two-byte
|
||||
let new_read = read + 2;
|
||||
if new_read > src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
|
||||
if !in_inclusive_range8(second, 0x80, 0xBF) {
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0x1Fu32) << 6) | (second as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read = new_read;
|
||||
dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
|
||||
read += 2;
|
||||
written += 1;
|
||||
continue 'tail;
|
||||
}
|
||||
0xE1...0xEC | 0xEE...0xEF => {
|
||||
// Three-byte normal
|
||||
// We need to exclude valid four byte lead bytes, because
|
||||
// `UTF8_DATA.second_mask` covers
|
||||
if byte < 0xF0 {
|
||||
// Three-byte
|
||||
let new_read = read + 3;
|
||||
if new_read > src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
if ((UTF8_DATA.table[usize::from(second)]
|
||||
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
|
||||
| (third >> 6))
|
||||
!= 2
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0xFu32) << 12)
|
||||
| ((second as u32 & 0x3Fu32) << 6)
|
||||
| (third as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read = new_read;
|
||||
let point = ((u16::from(byte) & 0xF) << 12)
|
||||
| ((u16::from(second) & 0x3F) << 6)
|
||||
| (u16::from(third) & 0x3F);
|
||||
dst[written] = point;
|
||||
read += 3;
|
||||
written += 1;
|
||||
}
|
||||
0xE0 => {
|
||||
// Three-byte special lower bound
|
||||
let new_read = read + 3;
|
||||
if new_read > src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0xFu32) << 12)
|
||||
| ((second as u32 & 0x3Fu32) << 6)
|
||||
| (third as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read = new_read;
|
||||
written += 1;
|
||||
}
|
||||
0xED => {
|
||||
// Three-byte special upper bound
|
||||
let new_read = read + 3;
|
||||
if new_read > src.len() {
|
||||
break 'outer;
|
||||
}
|
||||
let second = src[read + 1];
|
||||
let third = src[read + 2];
|
||||
if ((UTF8_TRAIL_INVALID[second as usize]
|
||||
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
|
||||
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
|
||||
!= 0
|
||||
{
|
||||
break 'outer;
|
||||
}
|
||||
let point = (((byte as u32) & 0xFu32) << 12)
|
||||
| ((second as u32 & 0x3Fu32) << 6)
|
||||
| (third as u32 & 0x3Fu32);
|
||||
dst[written] = point as u16;
|
||||
read = new_read;
|
||||
written += 1;
|
||||
}
|
||||
_ => {
|
||||
// Invalid lead or 4-byte lead
|
||||
// `'tail` handles sequences shorter than 4, so
|
||||
// there can't be another sequence after this one.
|
||||
break 'outer;
|
||||
}
|
||||
break 'outer;
|
||||
}
|
||||
break 'outer;
|
||||
}
|
||||
(read, written)
|
||||
}
|
||||
@@ -534,7 +541,7 @@ impl Utf8Decoder {
|
||||
}
|
||||
if b < 0xE0u8 {
|
||||
self.bytes_needed = 1;
|
||||
self.code_point = b as u32 & 0x1F;
|
||||
self.code_point = u32::from(b) & 0x1F;
|
||||
continue;
|
||||
}
|
||||
if b < 0xF0u8 {
|
||||
@@ -544,7 +551,7 @@ impl Utf8Decoder {
|
||||
self.upper_boundary = 0x9Fu8;
|
||||
}
|
||||
self.bytes_needed = 2;
|
||||
self.code_point = b as u32 & 0xF;
|
||||
self.code_point = u32::from(b) & 0xF;
|
||||
continue;
|
||||
}
|
||||
if b < 0xF5u8 {
|
||||
@@ -554,7 +561,7 @@ impl Utf8Decoder {
|
||||
self.upper_boundary = 0x8Fu8;
|
||||
}
|
||||
self.bytes_needed = 3;
|
||||
self.code_point = b as u32 & 0x7;
|
||||
self.code_point = u32::from(b) & 0x7;
|
||||
continue;
|
||||
}
|
||||
return (
|
||||
@@ -579,7 +586,7 @@ impl Utf8Decoder {
|
||||
}
|
||||
self.lower_boundary = 0x80u8;
|
||||
self.upper_boundary = 0xBFu8;
|
||||
self.code_point = (self.code_point << 6) | (b as u32 & 0x3F);
|
||||
self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
|
||||
self.bytes_seen += 1;
|
||||
if self.bytes_seen != self.bytes_needed {
|
||||
continue;
|
||||
@@ -683,7 +690,8 @@ impl Utf8Encoder {
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
|
||||
written += 1;
|
||||
*(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
|
||||
*(dst.get_unchecked_mut(written)) =
|
||||
((unit & 0xFC0) >> 6) as u8 | 0x80u8;
|
||||
written += 1;
|
||||
*(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
|
||||
written += 1;
|
||||
@@ -709,19 +717,22 @@ impl Utf8Encoder {
|
||||
}
|
||||
let second = src[read];
|
||||
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
|
||||
if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) } {
|
||||
if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) }
|
||||
{
|
||||
// The next code unit is a low surrogate. Advance position.
|
||||
read += 1;
|
||||
let astral = ((unit as u32) << 10) + second as u32
|
||||
let astral = (u32::from(unit) << 10) + u32::from(second)
|
||||
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
|
||||
written += 1;
|
||||
*(dst.get_unchecked_mut(written)) = ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
|
||||
*(dst.get_unchecked_mut(written)) =
|
||||
((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
|
||||
written += 1;
|
||||
*(dst.get_unchecked_mut(written)) = ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
|
||||
*(dst.get_unchecked_mut(written)) =
|
||||
((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
|
||||
written += 1;
|
||||
*(dst.get_unchecked_mut(written)) = (astral & 0x3Fu32) as u8 | 0x80u8;
|
||||
*(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
|
||||
written += 1;
|
||||
}
|
||||
break;
|
||||
@@ -774,22 +785,18 @@ impl Utf8Encoder {
|
||||
dst: &mut [u8],
|
||||
_last: bool,
|
||||
) -> (EncoderResult, usize, usize) {
|
||||
let mut to_write = src.len();
|
||||
let bytes = src.as_bytes();
|
||||
let mut to_write = bytes.len();
|
||||
if to_write <= dst.len() {
|
||||
unsafe {
|
||||
::std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr(), to_write);
|
||||
}
|
||||
(&mut dst[..to_write]).copy_from_slice(bytes);
|
||||
return (EncoderResult::InputEmpty, to_write, to_write);
|
||||
}
|
||||
to_write = dst.len();
|
||||
// Move back until we find a UTF-8 sequence boundary.
|
||||
let bytes = src.as_bytes();
|
||||
while (bytes[to_write] & 0xC0) == 0x80 {
|
||||
to_write -= 1;
|
||||
}
|
||||
unsafe {
|
||||
::std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr(), to_write);
|
||||
}
|
||||
(&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
|
||||
(EncoderResult::OutputFull, to_write, to_write)
|
||||
}
|
||||
}
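Several utf_8.rs hunks above split an astral code point into a UTF-16 surrogate pair with the `0xD7C0 + (point >> 10)` / `0xDC00 + (point & 0x3FF)` arithmetic. A minimal sketch of why that works (the example code point is arbitrary):

fn to_surrogate_pair(point: u32) -> (u16, u16) {
    debug_assert!(point >= 0x1_0000 && point <= 0x10_FFFF);
    // 0xD7C0 is 0xD800 - (0x1_0000 >> 10), so adding the top bits of the
    // untranslated code point lands directly in the high-surrogate range.
    let high = (0xD7C0 + (point >> 10)) as u16;
    let low = (0xDC00 + (point & 0x3FF)) as u16;
    (high, low)
}

fn main() {
    // U+1F4A9 encodes as the surrogate pair 0xD83D 0xDCA9.
    assert_eq!(to_surrogate_pair(0x1F4A9), (0xD83D, 0xDCA9));
}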
|
||||
|
430 third_party/rust/encoding_rs/src/utf_8_core.rs (vendored)
@@ -1,430 +0,0 @@
|
||||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// The initial revision of this file was extracted from the "UTF-8 validation"
|
||||
// section of the file src/libcore/str/mod.rs from Rust project at revision
|
||||
// 7ad7232422f7e5bbfa0e52dabe36c12677df19e2. The Utf8Error struct also comes
|
||||
// from that file. Subsequently, changes from the mentioned file at revision
|
||||
// 85eadf84f3945dc431643ea43d34f15193fdafb4 were merged into this file.
|
||||
|
||||
use ascii::validate_ascii;
|
||||
|
||||
/// Errors which can occur when attempting to interpret a sequence of `u8`
|
||||
/// as a string.
|
||||
///
|
||||
/// As such, the `from_utf8` family of functions and methods for both `String`s
|
||||
/// and `&str`s make use of this error, for example.
|
||||
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
|
||||
pub struct Utf8Error {
|
||||
valid_up_to: usize,
|
||||
}
|
||||
|
||||
impl Utf8Error {
|
||||
/// Returns the index in the given string up to which valid UTF-8 was
|
||||
/// verified.
|
||||
///
|
||||
/// It is the maximum index such that `from_utf8(input[..index])`
|
||||
/// would return `Ok(_)`.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Basic usage:
|
||||
///
|
||||
/// ```
|
||||
/// use std::str;
|
||||
///
|
||||
/// // some invalid bytes, in a vector
|
||||
/// let sparkle_heart = vec![0, 159, 146, 150];
|
||||
///
|
||||
/// // std::str::from_utf8 returns a Utf8Error
|
||||
/// let error = str::from_utf8(&sparkle_heart).unwrap_err();
|
||||
///
|
||||
/// // the second byte is invalid here
|
||||
/// assert_eq!(1, error.valid_up_to());
|
||||
/// ```
|
||||
pub fn valid_up_to(&self) -> usize {
|
||||
self.valid_up_to
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(eval_order_dependence))]
|
||||
#[inline(always)]
|
||||
pub fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
|
||||
let mut index = 0;
|
||||
let len = v.len();
|
||||
|
||||
'outer: loop {
|
||||
let mut first = {
|
||||
let remaining = &v[index..];
|
||||
match validate_ascii(remaining) {
|
||||
None => {
|
||||
// offset += remaining.len();
|
||||
break 'outer;
|
||||
}
|
||||
Some((non_ascii, consumed)) => {
|
||||
index += consumed;
|
||||
non_ascii
|
||||
}
|
||||
}
|
||||
};
|
||||
let old_offset = index;
|
||||
macro_rules! err {
|
||||
($error_len:expr) => {
|
||||
return Err(Utf8Error {
|
||||
valid_up_to: old_offset,
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
macro_rules! next {
|
||||
() => {{
|
||||
index += 1;
|
||||
// we needed data, but there was none: error!
|
||||
if index >= len {
|
||||
err!(None)
|
||||
}
|
||||
v[index]
|
||||
}};
|
||||
}
|
||||
|
||||
'inner: loop {
|
||||
let w = UTF8_CHAR_WIDTH[first as usize];
|
||||
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
|
||||
// first C2 80 last DF BF
|
||||
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
|
||||
// first E0 A0 80 last EF BF BF
|
||||
// excluding surrogates codepoints \u{d800} to \u{dfff}
|
||||
// ED A0 80 to ED BF BF
|
||||
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
|
||||
// first F0 90 80 80 last F4 8F BF BF
|
||||
//
|
||||
// Use the UTF-8 syntax from the RFC
|
||||
//
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
// UTF8-1 = %x00-7F
|
||||
// UTF8-2 = %xC2-DF UTF8-tail
|
||||
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||||
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||||
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||
// %xF4 %x80-8F 2( UTF8-tail )
|
||||
match w {
|
||||
2 => {
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(1))
|
||||
}
|
||||
}
|
||||
3 => {
|
||||
match (first, next!()) {
|
||||
(0xE0, 0xA0...0xBF)
|
||||
| (0xE1...0xEC, 0x80...0xBF)
|
||||
| (0xED, 0x80...0x9F)
|
||||
| (0xEE...0xEF, 0x80...0xBF) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(2))
|
||||
}
|
||||
}
|
||||
4 => {
|
||||
match (first, next!()) {
|
||||
(0xF0, 0x90...0xBF) | (0xF1...0xF3, 0x80...0xBF) | (0xF4, 0x80...0x8F) => {}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(2))
|
||||
}
|
||||
if next!() & !CONT_MASK != TAG_CONT_U8 {
|
||||
err!(Some(3))
|
||||
}
|
||||
}
|
||||
_ => err!(Some(1)),
|
||||
}
|
||||
index += 1;
|
||||
if index == len {
|
||||
break 'outer;
|
||||
}
|
||||
first = v[index];
|
||||
// This check is separate from the above `match`, because merging
|
||||
// this check into it causes a notable performance drop.
|
||||
if first < 0x80 {
|
||||
index += 1;
|
||||
continue 'outer;
|
||||
}
|
||||
continue 'inner;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// https://tools.ietf.org/html/rfc3629
|
||||
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF
];
|
||||
|
||||
/// Mask of the value bits of a continuation byte
|
||||
const CONT_MASK: u8 = 0b0011_1111;
|
||||
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
||||
const TAG_CONT_U8: u8 = 0b1000_0000;
|
15 third_party/rust/encoding_rs/src/variant.rs (vendored)
@@ -289,7 +289,7 @@ impl VariantEncoder {
|
||||
}
|
||||
|
||||
pub enum VariantEncoding {
|
||||
SingleByte(&'static [u16; 128]),
|
||||
SingleByte(&'static [u16; 128], u16, u8, u8),
|
||||
Utf8,
|
||||
Gbk,
|
||||
Gb18030,
|
||||
@@ -307,7 +307,7 @@ pub enum VariantEncoding {
|
||||
impl VariantEncoding {
|
||||
pub fn new_variant_decoder(&self) -> VariantDecoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table) => SingleByteDecoder::new(table),
|
||||
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
|
||||
VariantEncoding::Utf8 => Utf8Decoder::new(),
|
||||
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
|
||||
VariantEncoding::Big5 => Big5Decoder::new(),
|
||||
@@ -324,7 +324,9 @@ impl VariantEncoding {
|
||||
|
||||
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table) => SingleByteEncoder::new(encoding, table),
|
||||
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => {
|
||||
SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length)
|
||||
}
|
||||
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
|
||||
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
|
||||
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
|
||||
@@ -339,4 +341,11 @@ impl VariantEncoding {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_single_byte(&self) -> bool {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -56,7 +56,7 @@ impl UserDefinedDecoder {
|
||||
destination_handle.write_ascii(b);
|
||||
continue;
|
||||
}
|
||||
destination_handle.write_upper_bmp((b as usize + 0xF700usize) as u16);
|
||||
destination_handle.write_upper_bmp(u16::from(b) + 0xF700);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
@@ -93,9 +93,9 @@ impl UserDefinedDecoder {
|
||||
*to = {
|
||||
let unit = *from;
|
||||
if unit < 0x80 {
|
||||
unit as u16
|
||||
u16::from(unit)
|
||||
} else {
|
||||
(unit as u16) + 0xF700
|
||||
u16::from(unit) + 0xF700
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -120,11 +120,11 @@ impl UserDefinedDecoder {
|
||||
let src_ptr = src.as_ptr();
|
||||
let dst_ptr = dst.as_mut_ptr();
|
||||
for i in 0..simd_iterations {
|
||||
let input = unsafe { load16_unaligned(src_ptr.offset((i * 16) as isize)) };
|
||||
let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) };
|
||||
let (first, second) = simd_unpack(input);
|
||||
unsafe {
|
||||
store8_unaligned(dst_ptr.offset((i * 16) as isize), shift_upper(first));
|
||||
store8_unaligned(dst_ptr.offset(((i * 16) + 8) as isize), shift_upper(second));
|
||||
store8_unaligned(dst_ptr.add(i * 16), shift_upper(first));
|
||||
store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second));
|
||||
}
|
||||
}
|
||||
let src_tail = &src[tail_start..length];
|
||||
@@ -136,9 +136,9 @@ impl UserDefinedDecoder {
|
||||
*to = {
|
||||
let unit = *from;
|
||||
if unit < 0x80 {
|
||||
unit as u16
|
||||
u16::from(unit)
|
||||
} else {
|
||||
(unit as u16) + 0xF700
|
||||
u16::from(unit) + 0xF700
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -182,7 +182,7 @@ impl UserDefinedEncoder {
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
destination_handle.write_one((c as usize - 0xF700usize) as u8);
|
||||
destination_handle.write_one((u32::from(c) - 0xF700) as u8);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
|