Bug 1507726 - Update encoding_rs to 0.8.12. r=m_kato

* Improves UTF-8 validation performance.
 * Improves UTF-8 to UTF-16 decode performance.
 * Improves non-Latin and Latin1-ish Latin single-byte encode performance.
 * Improves code quality by addressing some clippy lints.

The optional legacy CJK encoder changes are not used by Firefox.

Differential Revision: https://phabricator.services.mozilla.com/D12514

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Henri Sivonen 2018-11-22 01:41:51 +00:00
parent 2e676cc49d
commit 60fabe50a9
29 changed files with 98214 additions and 2917 deletions

10
Cargo.lock generated
View File

@ -799,21 +799,21 @@ name = "encoding_c"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding_glue"
version = "0.1.0"
dependencies = [
"encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)",
"nserror 0.1.0",
"nsstring 0.1.0",
]
[[package]]
name = "encoding_rs"
version = "0.8.9"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1704,7 +1704,7 @@ name = "nsstring"
version = "0.1.0"
dependencies = [
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -3201,7 +3201,7 @@ dependencies = [
"checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a"
"checksum ena 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "88dc8393b3c7352f94092497f6b52019643e493b6b890eb417cdb7c46117e621"
"checksum encoding_c 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "769ecb8b33323998e482b218c0d13cd64c267609023b4b7ec3ee740714c318ee"
"checksum encoding_rs 0.8.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f1a79fa56c329a5b087de13748054fb3b974c4a672c12c71f0b66e35c5addec5"
"checksum encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)" = "ca20350a7cb5aab5b9034731123d6d412caf3e92d4985e739e411ba0955fd0eb"
"checksum env_logger 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0561146661ae44c579e993456bc76d11ce1e0c7d745e57b2fa7146b6e49fa2ad"
"checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3"
"checksum euclid 0.19.3 (registry+https://github.com/rust-lang/crates.io-index)" = "600657e7e5c03bfbccdc68721bc3b5abcb761553973387124eae9c9e4f02c210"

File diff suppressed because one or more lines are too long

View File

@ -28,7 +28,9 @@ taken as a waiver of copyright notice.
Please do not contribute implementations of encodings that are not specified
in the [Encoding Standard](https://encoding.spec.whatwg.org/).
For example, an implementation of UTF-7 would be explicitly not welcome.
For example, an implementation of UTF-7 is explicitly out of scope for this
crate and is, therefore, provided by the [charset](https://crates.io/crates/charset)
crate instead.
## Compatibility with Stable Rust

View File

@ -10,17 +10,3 @@ according to those terms.
Test code within encoding_rs is dedicated to the Public Domain when so
designated (see the individual files for PD/CC0-dedicated sections).
The file utf_8_core.rs was extracted from the Rust project at revision
7ad7232422f7e5bbfa0e52dabe36c12677df19e2, whose COPYRIGHT file said (in part):
The Rust Project is copyright 2010, The Rust Project
Developers.
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.

View File

@ -12,14 +12,14 @@
[package]
name = "encoding_rs"
version = "0.8.9"
version = "0.8.12"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "A Gecko-oriented implementation of the Encoding Standard"
homepage = "https://docs.rs/encoding_rs/"
documentation = "https://docs.rs/encoding_rs/"
readme = "README.md"
keywords = ["encoding", "web", "unicode", "charset"]
categories = ["text-processing", "encoding", "web-programming", "email"]
categories = ["text-processing", "encoding", "web-programming", "internationalization"]
license = "MIT/Apache-2.0"
repository = "https://github.com/hsivonen/encoding_rs"
[profile.release]
@ -44,6 +44,12 @@ version = "1.0"
version = "1.0"
[features]
fast-big5-hanzi-encode = []
fast-gb-hanzi-encode = []
fast-hangul-encode = []
fast-hanja-encode = []
fast-kanji-encode = []
fast-legacy-encode = ["fast-hangul-encode", "fast-hanja-encode", "fast-kanji-encode", "fast-gb-hanzi-encode", "fast-big5-hanzi-encode"]
less-slow-big5-hanzi-encode = []
less-slow-gb-hanzi-encode = []
less-slow-kanji-encode = []

View File

@ -76,3 +76,31 @@ On [Cortex-A57](https://stackoverflow.com/questions/45714535/performance-of-unal
Currently, Aarch64 runs the generic ALU UTF-8 validation code that aligns
reads. That's probably unnecessary on Aarch64. (SIMD was slower than ALU!)
## Table-driven UTF-8 validation
When there are at least four bytes left, read all four. With each byte
index into tables corresponding to magic values indexable by byte in
each position.
In the value read from the table indexed by lead byte, encode the
following in 16 bits: advance 2 bits (2, 3 or 4 bytes), 9 positional
bits one of which is set to indicate the type of lead byte (8 valid
types, in the 8 lowest bits, and invalid, ASCII would be tenth type),
and the mask for extracting the payload bits from the lead byte
(for conversion to UTF-16 or UTF-32).
In the tables indexable by the trail bytes, in each bit position
corresponding to a lead byte type, store 1 if the trail is
invalid given the lead and 0 if valid given the lead.
Use the low 8 bits of the 16 bits read from the first
table to mask (bitwise AND) one positional bit from each of the
three other values. Bitwise OR the results together with the
bit that is 1 if the lead is invalid. If the result is zero,
the sequence is valid. Otherwise it's invalid.
Use the advance to advance. In the conversion to UTF-16 or
UTF-32 case, use the mask for extracting the meaningful
bits from the lead byte to mask them from the lead. Shift
left by 6 as many times as the advance indicates, etc.

View File

@ -23,32 +23,3 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
The file utf_8_core.rs was extracted from the Rust project at revision
7ad7232422f7e5bbfa0e52dabe36c12677df19e2, whose LICENSE-MIT file said:
Copyright (c) 2010 The Rust Project Developers
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -75,6 +75,12 @@ a `std::io::Read`, decode it into UTF-8 and presenting the result via
`std::io::Read`. The [`encoding_rs_io`](https://crates.io/crates/encoding_rs_io)
crate provides that capability.
## Decoding Email
For decoding character encodings that occur in email, use the
[`charset`](https://crates.io/crates/charset) crate instead of using this
one directly. (It wraps this crate and adds UTF-7 decoding.)
## Licensing
Please see the file named
@ -105,7 +111,7 @@ These bindings do not cover the `mem` module.
## Optional features
There are currently three optional cargo features:
There are currently these optional cargo features:
### `simd-accel`
@ -121,6 +127,8 @@ Enabling this feature breaks the build unless the target is x86 with SSE2
use an x86 target without SSE2, i.e. `i586` in `rustup` terms), ARMv7 or
thumbv7 with NEON (`-C target_feature=+neon`), x86_64 or Aarch64.
Used by Firefox.
### `serde`
Enables support for serializing and deserializing `&'static Encoding`-typed
@ -128,27 +136,134 @@ struct fields using [Serde][1].
[1]: https://serde.rs/
Not used by Firefox.
### `fast-legacy-encode`
A catch-all option for enabling the fastest legacy encode options. _Does not
affect decode speed or UTF-8 encode speed._
At present, this option is equivalent to enabling the following options:
* `fast-hangul-encode`
* `fast-hanja-encode`
* `fast-kanji-encode`
* `fast-gb-hanzi-encode`
* `fast-big5-hanzi-encode`
Adds 176 KB to the binary size.
Not used by Firefox.
### `fast-hangul-encode`
Changes encoding precomposed Hangul syllables into EUC-KR from binary
search over the decode-optimized tables to lookup by index making Korean
plain-text encode about 4 times as fast as without this option.
Adds 20 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-hanja-encode`
Changes encoding of Hanja into EUC-KR from linear search over the
decode-optimized table to lookup by index. Since Hanja is practically absent
in modern Korean text, this option doesn't affect performance in the common
case and mainly makes sense if you want to make your application resilient
against denial of service by someone intentionally feeding it a lot of Hanja
to encode into EUC-KR.
Adds 40 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-kanji-encode`
Changes encoding of Kanji into Shift_JIS, EUC-JP and ISO-2022-JP from linear
search over the decode-optimized tables to lookup by index making Japanese
plain-text encode to legacy encodings 30 to 50 times as fast as without this
option (about 2 times as fast as with `less-slow-kanji-encode`).
Takes precedence over `less-slow-kanji-encode`.
Adds 36 KB to the binary size (24 KB compared to `less-slow-kanji-encode`).
Does _not_ affect decode speed.
Not used by Firefox.
### `less-slow-kanji-encode`
Makes JIS X 0208 Level 1 Kanji (the most common Kanji in Shift_JIS, EUC-JP and
ISO-2022-JP) encode less slow (binary search instead of linear search) at the
expense of binary size. (Does _not_ affect decode speed.)
ISO-2022-JP) encode less slow (binary search instead of linear search) making
Japanese plain-text encode to legacy encodings 14 to 23 times as fast as
without this option.
Adds 12 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-gb-hanzi-encode`
Changes encoding of Hanzi in the CJK Unified Ideographs block into GBK and
gb18030 from linear search over a part of the decode-optimized tables followed
by a binary search over another part of the decode-optimized tables to lookup
by index making Simplified Chinese plain-text encode to the legacy encodings
100 to 110 times as fast as without this option (about 2.5 times as fast as
with `less-slow-gb-hanzi-encode`).
Takes precedence over `less-slow-gb-hanzi-encode`.
Adds 36 KB to the binary size (24 KB compared to `less-slow-gb-hanzi-encode`).
Does _not_ affect decode speed.
Not used by Firefox.
### `less-slow-gb-hanzi-encode`
Makes GB2312 Level 1 Hanzi (the most common Hanzi in gb18030 and GBK) encode
less slow (binary search instead of linear search) at the expense of binary
size. (Does _not_ affect decode speed.)
less slow (binary search instead of linear search) making Simplified Chinese
plain-text encode to the legacy encodings about 40 times as fast as without
this option.
Adds 12 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-big5-hanzi-encode`
Changes encoding of Hanzi in the CJK Unified Ideographs block into Big5 from
linear search over a part of the decode-optimized tables to lookup by index
making Traditional Chinese plain-text encode to Big5 105 to 125 times as fast
as without this option (about 3 times as fast as with
`less-slow-big5-hanzi-encode`).
Takes precedence over `less-slow-big5-hanzi-encode`.
Adds 40 KB to the binary size (20 KB compared to `less-slow-big5-hanzi-encode`).
Does _not_ affect decode speed.
Not used by Firefox.
### `less-slow-big5-hanzi-encode`
Makes Big5 Level 1 Hanzi (the most common Hanzi in Big5) encode less slow
(binary search instead of linear search) at the expense of binary size. (Does
_not_ affect decode speed.)
(binary search instead of linear search) making Traditional Chinese
plain-text encode to Big5 about 36 times as fast as without this option.
Adds 20 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
@ -156,29 +271,26 @@ Not used by Firefox.
For decoding to UTF-16, the goal is to perform at least as well as Gecko's old
uconv. For decoding to UTF-8, the goal is to perform at least as well as
rust-encoding.
rust-encoding. These goals have been achieved.
Encoding to UTF-8 should be fast. (UTF-8 to UTF-8 encode should be equivalent
to `memcpy` and UTF-16 to UTF-8 should be fast.)
Speed is a non-goal when encoding to legacy encodings. Encoding to legacy
encodings should not be optimized for speed at the expense of code size as long
as form submission and URL parsing in Gecko don't become noticeably too slow
in real-world use.
Speed is a non-goal when encoding to legacy encodings. By default, encoding to
legacy encodings should not be optimized for speed at the expense of code size
as long as form submission and URL parsing in Gecko don't become noticeably
too slow in real-world use.
In the interest of binary size, by default, encoding_rs does not have any
encode-specific data tables. Therefore, encoders search the decode-optimized
data tables. This is a linear search in most cases. As a result, encode to
legacy encodings varies from slow to extremely slow relative to other
libraries. Still, with realistic work loads, this seemed fast enough
not to be user-visibly slow on Raspberry Pi 3 (which stood in for a phone
for testing) in the Web-exposed encoder use cases.
In the interest of binary size, by default, encoding_rs does not have
encode-specific data tables beyond 32 bits of encode-specific data for each
single-byte encoding. Therefore, encoders search the decode-optimized data
tables. This is a linear search in most cases. As a result, by default, encode
to legacy encodings varies from slow to extremely slow relative to other
libraries. Still, with realistic work loads, this seemed fast enough not to be
user-visibly slow on Raspberry Pi 3 (which stood in for a phone for testing)
in the Web-exposed encoder use cases.
See the cargo features above for optionally making Kanji and Hanzi legacy
encode a bit less slow.
Actually fast options for legacy encode may be added in the future, but there
do not appear to be pressing use cases.
See the cargo features above for optionally making CJK legacy encode fast.
A framework for measuring performance is [available separately][2].
@ -187,15 +299,15 @@ A framework for measuring performance is [available separately][2].
## Rust Version Compatibility
It is a goal to support the latest stable Rust, the latest nightly Rust and
the version of Rust that's used for Firefox Nightly (currently 1.25.0).
the version of Rust that's used for Firefox Nightly (currently 1.29.0).
These are tested on Travis.
Additionally, beta and the oldest known to work Rust version (currently
1.21.0) are tested on Travis. The oldest Rust known to work is tested as
1.29.0) are tested on Travis. The oldest Rust known to work is tested as
a canary so that when the oldest known to work no longer works, the change
can be documented here. At this time, there is no firm commitment to support
a version older than what's required by Firefox. The oldest supported Rust
is expected to move forward rapidly when `stdsimd` can replace the `simd`
is expected to move forward rapidly when `packed_simd` can replace the `simd`
crate without performance regression.
## Compatibility with rust-encoding
@ -207,6 +319,19 @@ encoding_rs is
written with the assumption that Firefox would need it, but it is not currently
used in Firefox.
## Regenerating Generated Code
To regenerate the generated code:
* Have Python 2 installed.
* Clone [`https://github.com/hsivonen/encoding_c`](https://github.com/hsivonen/encoding_c)
next to the `encoding_rs` directory.
* Clone [`https://github.com/whatwg/encoding`](https://github.com/whatwg/encoding)
next to the `encoding_rs` directory.
* Checkout revision `f381389` of the `encoding` repo.
* With the `encoding_rs` directory as the working directory, run
`python generate-encoding-data.py`.
## Roadmap
- [x] Design the low-level API.
@ -231,18 +356,53 @@ used in Firefox.
- [ ] ~Parallelize UTF-8 validation using [Rayon](https://github.com/nikomatsakis/rayon).~
(This turned out to be a pessimization in the ASCII case due to memory bandwidth reasons.)
- [x] Provide an XPCOM/MFBT-flavored C++ API.
- [ ] Investigate accelerating single-byte encode with a single fast-tracked
- [x] Investigate accelerating single-byte encode with a single fast-tracked
range per encoding.
- [x] Replace uconv with encoding_rs in Gecko.
- [x] Implement the rust-encoding API in terms of encoding_rs.
- [x] Add SIMD acceleration for Aarch64.
- [x] Investigate the use of NEON on 32-bit ARM.
- [ ] Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
adapted to Rust in rust-encoding.
- [ ] Add actually fast CJK encode options.
- [ ] ~Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
adapted to Rust in rust-encoding.~
- [x] Add actually fast CJK encode options.
- [ ] Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).
## Release Notes
### 0.8.12
* Removed the `clippy::` prefix from clippy lint names.
### 0.8.11
* Changed minimum Rust requirement to 1.29.0 (for the ability to refer
to the interior of a `static` when defining another `static`).
* Explicitly aligned the lookup tables for single-byte encodings and
UTF-8 to cache lines in the hope of freeing up one cache line for
other data. (Perhaps the tables were already aligned and this is
placebo.)
* Added 32 bits of encode-oriented data for each single-byte encoding.
The change was performance-neutral for non-Latin1-ish Latin legacy
encodings, improved Latin1-ish and Arabic legacy encode speed
somewhat (new speed is 2.4x the old speed for German, 2.3x for
Arabic, 1.7x for Portuguese and 1.4x for French) and improved
non-Latin1, non-Arabic legacy single-byte encode a lot (7.2x for
Thai, 6x for Greek, 5x for Russian, 4x for Hebrew).
* Added compile-time options for fast CJK legacy encode options (at
the cost of binary size (up to 176 KB) and run-time memory usage).
These options still retain the overall code structure instead of
rewriting the CJK encoders totally, so the speed isn't as good as
what could be achieved by using even more memory / making the
binary even larger.
* Made UTF-8 decode and validation faster.
* Added method `is_single_byte()` on `Encoding`.
* Added `mem::decode_latin1()` and `mem::encode_latin1_lossy()`.
### 0.8.10
* Disabled a unit test that tests a panic condition when the assertion
being tested is disabled.
### 0.8.9
* Made `--features simd-accel` work with stable-channel compiler to

View File

@ -12,6 +12,15 @@
import json
import subprocess
import sys
import os.path
if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n");
sys.exit(-1)
if not os.path.isfile("../encoding_c/src/lib.rs"):
sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n");
sys.exit(-1)
def cmp_from_end(one, other):
c = cmp(len(one), len(other))
@ -52,9 +61,12 @@ def static_u16_table(name, data):
''')
def static_u16_table_from_indexable(name, data, item, feature):
data_file.write('''#[cfg(feature = "%s")]
data_file.write('''#[cfg(all(
feature = "less-slow-%s",
not(feature = "fast-%s")
))]
static %s: [u16; %d] = [
''' % (feature, name, len(data)))
''' % (feature, feature, name, len(data)))
for i in xrange(len(data)):
data_file.write('0x%04X,\n' % data[i][item])
@ -64,12 +76,30 @@ static %s: [u16; %d] = [
''')
def static_u8_pair_table_from_indexable(name, data, item, feature):
data_file.write('''#[cfg(all(
feature = "less-slow-%s",
not(feature = "fast-%s")
))]
static %s: [[u8; 2]; %d] = [
''' % (feature, feature, name, len(data)))
for i in xrange(len(data)):
data_file.write('[0x%02X, 0x%02X],\n' % data[i][item])
data_file.write('''];
''')
def static_u8_pair_table(name, data, feature):
data_file.write('''#[cfg(feature = "%s")]
static %s: [[u8; 2]; %d] = [
''' % (feature, name, len(data)))
for i in xrange(len(data)):
data_file.write('[0x%02X, 0x%02X],\n' % data[i][item])
pair = data[i]
if not pair:
pair = (0, 0)
data_file.write('[0x%02X, 0x%02X],\n' % pair)
data_file.write('''];
@ -167,6 +197,46 @@ encoding_by_alias_code_page = {
51949: "EUC-KR",
}
# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
start_of_longest_run_in_single_byte = {
"IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant
"windows-874": 33,
"windows-1250": 92,
"windows-1251": 64,
"windows-1252": 32,
"windows-1253": 83,
"windows-1254": 95,
"windows-1255": 96,
"windows-1256": 65,
"windows-1257": 95, # not actually longest
"windows-1258": 95, # not actually longest
"macintosh": 106, # useless
"x-mac-cyrillic": 96,
"KOI8-R": 64, # not actually longest
"KOI8-U": 64, # not actually longest
"ISO-8859-2": 95, # not actually longest
"ISO-8859-3": 95, # not actually longest
"ISO-8859-4": 95, # not actually longest
"ISO-8859-5": 46,
"ISO-8859-6": 65,
"ISO-8859-7": 83,
"ISO-8859-8": 96,
"ISO-8859-10": 90, # not actually longest
"ISO-8859-13": 95, # not actually longest
"ISO-8859-14": 95,
"ISO-8859-15": 63,
"ISO-8859-16": 95, # not actually longest
}
#
for group in data:
@ -201,6 +271,25 @@ for label in labels:
longest_label_length = len(label.label)
longest_label = label.label
# Measures the run of consecutive code points that begins at the index
# position configured in start_of_longest_run_in_single_byte for the given
# single-byte encoding.
# Returns a tuple (run_bmp_offset, run_byte_offset, run_length): the code
# point stored at the start position, the start position itself, and the
# number of index entries from there on that each increase by exactly one.
def longest_run_for_single_byte(name):
# ISO-8859-8-I is looked up under the ISO-8859-8 name, both in the index
# and in the start-position table.
if name == u"ISO-8859-8-I":
name = u"ISO-8859-8"
index = indexes[name.lower()]
run_byte_offset = start_of_longest_run_in_single_byte[name]
run_bmp_offset = index[run_byte_offset]
previous_code_point = run_bmp_offset
run_length = 1
# Extend the run one index entry at a time.
while True:
i = run_byte_offset + run_length
# Stop when the end of the index is reached.
if i == len(index):
break
code_point = index[i]
# Stop at the first entry that does not continue the sequence.
if previous_code_point + 1 != code_point:
break
previous_code_point = code_point
run_length += 1
return (run_bmp_offset, run_byte_offset, run_length)
def is_single_byte(name):
for encoding in single_byte:
if name == encoding["name"]:
@ -217,11 +306,11 @@ def read_non_generated(path):
generated_begin_index = full.find(generated_begin)
if generated_begin_index < 0:
print "Can't find generated code start marker in %s. Exiting." % path
sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path)
sys.exit(-1)
generated_end_index = full.find(generated_end)
if generated_end_index < 0:
print "Can't find generated code end marker in %s. Exiting." % path
sys.stderr.write("Can't find generated code end marker in %s. Exiting.\n" % path)
sys.exit(-1)
return (full[0:generated_begin_index + len(generated_begin)],
@ -242,7 +331,8 @@ const LONGEST_LABEL_LENGTH: usize = %d; // %s
for name in preferred:
variant = None
if is_single_byte(name):
variant = "SingleByte(data::%s_DATA)" % to_constant_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name)
(run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name)
variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length)
else:
variant = to_camel_name(name)
@ -323,19 +413,15 @@ def null_to_zero(code_point):
code_point = 0
return code_point
data_file = open("src/data.rs", "w")
data_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs")
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
data_file = open("src/data.rs", "w")
data_file.write(data_rs_begin)
data_file.write('''
// Instead, please regenerate using generate-encoding-data.py
#[repr(align(64))] // Align to cache lines
pub struct SingleByteData {
''')
# Single-byte
@ -345,13 +431,29 @@ for encoding in single_byte:
if name == u"ISO-8859-8-I":
continue
data_file.write('''pub const %s_DATA: &'static [u16; 128] = &[
''' % to_constant_name(name))
data_file.write(''' pub %s: [u16; 128],
''' % to_snake_name(name))
data_file.write('''}
pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData {
''')
for encoding in single_byte:
name = encoding["name"]
if name == u"ISO-8859-8-I":
continue
data_file.write(''' %s: [
''' % to_snake_name(name))
for code_point in indexes[name.lower()]:
data_file.write('0x%04X,\n' % null_to_zero(code_point))
data_file.write('''];
data_file.write('''],
''')
data_file.write('''};
''')
@ -374,7 +476,8 @@ for code_point in index[942:19782]:
for j in xrange(32 - (len(astralness) % 32)):
astralness.append(0)
data_file.write('''static BIG5_ASTRALNESS: [u32; %d] = [
data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))]
static BIG5_ASTRALNESS: [u32; %d] = [
''' % (len(astralness) / 32))
i = 0
@ -408,8 +511,23 @@ level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "less-slow-big5-hanzi-encode")
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "less-slow-big5-hanzi-encode")
static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode")
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode")
# Fast Unified Ideograph encode
big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00)
for row in xrange(0x7E - 0x20):
for column in xrange(157):
pointer = 5024 + column + (row * 157)
code_point = index[pointer]
if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB:
unified_offset = code_point - 0x4E00
unified_lead = 0xA1 + row
unified_trail = (0x40 if column < 0x3F else 0x62) + column
if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]:
big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail)
static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")
# JIS0208
@ -550,8 +668,23 @@ for i in xrange(len(level1_kanji_index)):
level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
level1_kanji_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "less-slow-kanji-encode")
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "less-slow-kanji-encode")
static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode")
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode")
# Fast encoder table for Kanji
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
for pointer in xrange(len(index)):
code_point = index[pointer]
if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
(lead, trail) = divmod(pointer, 188)
lead += 0x81 if lead < 0x1F else 0xC1
trail += 0x40 if trail < 0x3F else 0x41
# unset the high bit of lead if IBM Kanji
if pointer >= 8272:
lead = lead & 0x7F
kanji_bytes[code_point - 0x4E00] = (lead, trail)
static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode")
# ISO-2022-JP half-width katakana
@ -728,6 +861,28 @@ static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])
# Fast Hangul and Hanja encode
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
for row in xrange(0x7D):
for column in xrange(190):
pointer = column + (row * 190)
code_point = index[pointer]
if code_point:
lead = 0x81 + row
trail = 0x41 + column
if code_point >= 0xAC00 and code_point < 0xD7A4:
hangul_bytes[code_point - 0xAC00] = (lead, trail)
elif code_point >= 0x4E00 and code_point < 0x9F9D:
hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
elif code_point >= 0xF900 and code_point < 0xFA0C:
hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)
static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode")
static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode")
static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode")
# JIS 0212
index = indexes["jis0212"]
@ -927,502 +1082,23 @@ for i in xrange(len(level1_hanzi_index)):
level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.sort(key=lambda x: x[0])
static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "less-slow-gb-hanzi-encode")
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "less-slow-gb-hanzi-encode")
static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")
data_file.write('''#[inline(always)]
fn map_with_ranges(haystack: &[u16], other: &[u16], needle: u16) -> u16 {
debug_assert_eq!(haystack.len(), other.len());
match haystack.binary_search(&needle) {
Ok(i) => other[i],
Err(i) => other[i - 1] + (needle - haystack[i - 1]),
}
}
# Fast Hanzi encoder table
hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
for row in xrange(126):
for column in xrange(190):
pointer = column + (row * 190)
code_point = index[pointer]
if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6:
hanzi_lead = 0x81 + row
hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)
#[inline(always)]
fn map_with_unsorted_ranges(haystack: &[u16], other: &[u16], needle: u16) -> Option<u16> {
debug_assert_eq!(haystack.len() + 1, other.len());
for i in 0..haystack.len() {
let start = other[i];
let end = other[i + 1];
let length = end - start;
let offset = needle.wrapping_sub(haystack[i]);
if offset < length {
return Some(start + offset);
}
}
None
}
static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")
#[inline(always)]
pub fn position(haystack: &[u16], needle: u16) -> Option<usize> {
haystack.iter().position(|&x| x == needle)
}
#[inline(always)]
pub fn gb18030_range_decode(pointer: u16) -> u16 {
map_with_ranges(&GB18030_RANGE_POINTERS[..],
&GB18030_RANGE_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn gb18030_range_encode(bmp: u16) -> usize {
if bmp == 0xE7C7 {
return 7457;
}
map_with_ranges(&GB18030_RANGE_OFFSETS[..], &GB18030_RANGE_POINTERS[..], bmp) as usize
}
#[inline(always)]
pub fn gbk_top_ideograph_decode(pointer: u16) -> u16 {
map_with_ranges(&GBK_TOP_IDEOGRAPH_POINTERS[..],
&GBK_TOP_IDEOGRAPH_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn gbk_top_ideograph_encode(bmp: u16) -> u16 {
map_with_ranges(&GBK_TOP_IDEOGRAPH_OFFSETS[..],
&GBK_TOP_IDEOGRAPH_POINTERS[..],
bmp)
}
#[inline(always)]
pub fn gbk_left_ideograph_decode(pointer: u16) -> u16 {
map_with_ranges(&GBK_LEFT_IDEOGRAPH_POINTERS[..],
&GBK_LEFT_IDEOGRAPH_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn gbk_left_ideograph_encode(bmp: u16) -> u16 {
map_with_ranges(&GBK_LEFT_IDEOGRAPH_OFFSETS[..],
&GBK_LEFT_IDEOGRAPH_POINTERS[..],
bmp)
}
#[inline(always)]
pub fn cp949_top_hangul_decode(pointer: u16) -> u16 {
map_with_ranges(&CP949_TOP_HANGUL_POINTERS[..],
&CP949_TOP_HANGUL_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn cp949_top_hangul_encode(bmp: u16) -> u16 {
map_with_ranges(&CP949_TOP_HANGUL_OFFSETS[..],
&CP949_TOP_HANGUL_POINTERS[..],
bmp)
}
#[inline(always)]
pub fn cp949_left_hangul_decode(pointer: u16) -> u16 {
map_with_ranges(&CP949_LEFT_HANGUL_POINTERS[..],
&CP949_LEFT_HANGUL_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn cp949_left_hangul_encode(bmp: u16) -> u16 {
map_with_ranges(&CP949_LEFT_HANGUL_OFFSETS[..],
&CP949_LEFT_HANGUL_POINTERS[..],
bmp)
}
#[inline(always)]
pub fn gbk_other_decode(pointer: u16) -> u16 {
map_with_ranges(&GBK_OTHER_POINTERS[..GBK_OTHER_POINTERS.len() - 1],
&GBK_OTHER_UNSORTED_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn gbk_other_encode(bmp: u16) -> Option<u16> {
map_with_unsorted_ranges(&GBK_OTHER_UNSORTED_OFFSETS[..],
&GBK_OTHER_POINTERS[..],
bmp)
}
#[inline(always)]
pub fn gb2312_other_decode(pointer: u16) -> u16 {
map_with_ranges(&GB2312_OTHER_POINTERS[..GB2312_OTHER_POINTERS.len() - 1],
&GB2312_OTHER_UNSORTED_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn gb2312_other_encode(bmp: u16) -> Option<u16> {
map_with_unsorted_ranges(&GB2312_OTHER_UNSORTED_OFFSETS[..],
&GB2312_OTHER_POINTERS[..],
bmp)
}
#[cfg(not(feature = "less-slow-gb-hanzi-encode"))]
#[inline(always)]
pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
position(&GB2312_HANZI[..(94 * (0xD8 - 0xB0) - 5)], bmp).map(|hanzi_pointer| {
let hanzi_lead = (hanzi_pointer / 94) + 0xB0;
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
(hanzi_lead as u8, hanzi_trail as u8)
})
}
#[cfg(feature = "less-slow-gb-hanzi-encode")]
#[inline(always)]
pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
match GB2312_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) {
Ok(i) => {
let pair = &GB2312_LEVEL1_HANZI_BYTES[i];
Some((pair[0], pair[1]))
}
Err(_) => None,
}
}
#[inline(always)]
pub fn gb2312_level2_hanzi_encode(bmp: u16) -> Option<usize> {
// TODO: optimize
position(&GB2312_HANZI[(94 * (0xD8 - 0xB0))..], bmp)
}
#[inline(always)]
pub fn ksx1001_other_decode(pointer: u16) -> u16 {
map_with_ranges(&KSX1001_OTHER_POINTERS[..KSX1001_OTHER_POINTERS.len() - 1],
&KSX1001_OTHER_UNSORTED_OFFSETS[..],
pointer)
}
#[inline(always)]
pub fn ksx1001_other_encode(bmp: u16) -> Option<u16> {
map_with_unsorted_ranges(&KSX1001_OTHER_UNSORTED_OFFSETS[..],
&KSX1001_OTHER_POINTERS[..],
bmp)
}
#[cfg(not(feature = "less-slow-kanji-encode"))]
#[inline(always)]
pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> {
position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
let pointer = 1410 + kanji_pointer;
let lead = pointer / 188;
let lead_offset = if lead < 0x1F {
0x81
} else {
0xC1
};
let trail = pointer % 188;
let trail_offset = if trail < 0x3F {
0x40
} else {
0x41
};
((lead + lead_offset) as u8, (trail + trail_offset) as u8)
})
}
#[cfg(feature = "less-slow-kanji-encode")]
#[inline(always)]
pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> {
match JIS0208_LEVEL1_KANJI_CODE_POINTS.binary_search(&bmp) {
Ok(i) => {
let pair = &JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES[i];
Some((pair[0], pair[1]))
}
Err(_) => None,
}
}
#[cfg(not(feature = "less-slow-kanji-encode"))]
#[inline(always)]
pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> {
position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
let lead = (kanji_pointer / 94) + 0xB0;
let trail = (kanji_pointer % 94) + 0xA1;
(lead as u8, trail as u8)
})
}
#[cfg(feature = "less-slow-kanji-encode")]
#[inline(always)]
pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> {
jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| {
let mut lead = shift_jis_lead as usize;
if shift_jis_lead >= 0xA0 {
lead -= 0xC1 - 0x81;
}
// The next line would overflow u8. Letting it go over allows us to
// subtract fewer times.
lead <<= 1;
// Bring it back to u8 range
lead -= 0x61;
let trail = if shift_jis_trail >= 0x9F {
lead += 1;
shift_jis_trail + (0xA1 - 0x9F)
} else if shift_jis_trail < 0x7F {
shift_jis_trail + (0xA1 - 0x40)
} else {
shift_jis_trail + (0xA1 - 0x41)
};
(lead as u8, trail)
})
}
#[cfg(not(feature = "less-slow-kanji-encode"))]
#[inline(always)]
pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> {
position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
let lead = (kanji_pointer / 94) + (0xB0 - 0x80);
let trail = (kanji_pointer % 94) + 0x21;
(lead as u8, trail as u8)
})
}
#[cfg(feature = "less-slow-kanji-encode")]
#[inline(always)]
pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> {
jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| {
let mut lead = shift_jis_lead as usize;
if shift_jis_lead >= 0xA0 {
lead -= 0xC1 - 0x81;
}
// The next line would overflow u8. Letting it go over allows us to
// subtract fewer times.
lead <<= 1;
// Bring it back to u8 range
lead -= 0xE1;
let trail = if shift_jis_trail >= 0x9F {
lead += 1;
shift_jis_trail - (0x9F - 0x21)
} else if shift_jis_trail < 0x7F {
shift_jis_trail - (0x40 - 0x21)
} else {
shift_jis_trail - (0x41 - 0x21)
};
(lead as u8, trail)
})
}
#[inline(always)]
pub fn jis0208_level2_and_additional_kanji_encode(bmp: u16) -> Option<usize> {
// TODO: optimize
position(&JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[..], bmp)
}
pub fn jis0208_symbol_decode(pointer: usize) -> Option<u16> {
let mut i = 0;
while i < JIS0208_SYMBOL_TRIPLES.len() {
let start = JIS0208_SYMBOL_TRIPLES[i] as usize;
let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize;
let pointer_minus_start = pointer.wrapping_sub(start);
if pointer_minus_start < length {
let offset = JIS0208_SYMBOL_TRIPLES[i + 2] as usize;
return Some(JIS0208_SYMBOLS[pointer_minus_start + offset]);
}
i += 3;
}
None
}
/// Prefers Shift_JIS pointers for the three symbols that are in both ranges.
#[inline(always)]
pub fn jis0208_symbol_encode(bmp: u16) -> Option<usize> {
let mut i = 0;
while i < JIS0208_SYMBOL_TRIPLES.len() {
let pointer_start = JIS0208_SYMBOL_TRIPLES[i] as usize;
let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize;
let symbol_start = JIS0208_SYMBOL_TRIPLES[i + 2] as usize;
let symbol_end = symbol_start + length;
let mut symbol_pos = symbol_start;
while symbol_pos < symbol_end {
if JIS0208_SYMBOLS[symbol_pos] == bmp {
return Some(symbol_pos - symbol_start + pointer_start);
}
symbol_pos += 1;
}
i += 3;
}
None
}
#[inline(always)]
pub fn ibm_symbol_encode(bmp: u16) -> Option<usize> {
position(&JIS0208_SYMBOLS[IBM_SYMBOL_START..IBM_SYMBOL_END], bmp)
.map(|x| x + IBM_SYMBOL_POINTER_START)
}
#[inline(always)]
pub fn jis0208_range_decode(pointer: usize) -> Option<u16> {
let mut i = 0;
while i < JIS0208_RANGE_TRIPLES.len() {
let start = JIS0208_RANGE_TRIPLES[i] as usize;
let length = JIS0208_RANGE_TRIPLES[i + 1] as usize;
let pointer_minus_start = pointer.wrapping_sub(start);
if pointer_minus_start < length {
let offset = JIS0208_RANGE_TRIPLES[i + 2] as usize;
return Some((pointer_minus_start + offset) as u16);
}
i += 3;
}
None
}
#[inline(always)]
pub fn jis0208_range_encode(bmp: u16) -> Option<usize> {
let mut i = 0;
while i < JIS0208_RANGE_TRIPLES.len() {
let start = JIS0208_RANGE_TRIPLES[i + 2] as usize;
let length = JIS0208_RANGE_TRIPLES[i + 1] as usize;
let bmp_minus_start = (bmp as usize).wrapping_sub(start);
if bmp_minus_start < length {
let offset = JIS0208_RANGE_TRIPLES[i] as usize;
return Some(bmp_minus_start + offset);
}
i += 3;
}
None
}
pub fn jis0212_accented_decode(pointer: usize) -> Option<u16> {
let mut i = 0;
while i < JIS0212_ACCENTED_TRIPLES.len() {
let start = JIS0212_ACCENTED_TRIPLES[i] as usize;
let length = JIS0212_ACCENTED_TRIPLES[i + 1] as usize;
let pointer_minus_start = pointer.wrapping_sub(start);
if pointer_minus_start < length {
let offset = JIS0212_ACCENTED_TRIPLES[i + 2] as usize;
let candidate = JIS0212_ACCENTED[pointer_minus_start + offset];
if candidate == 0 {
return None;
}
return Some(candidate);
}
i += 3;
}
None
}
#[inline(always)]
pub fn big5_is_astral(rebased_pointer: usize) -> bool {
(BIG5_ASTRALNESS[rebased_pointer >> 5] & (1 << (rebased_pointer & 0x1F))) != 0
}
#[inline(always)]
pub fn big5_low_bits(rebased_pointer: usize) -> u16 {
if rebased_pointer < BIG5_LOW_BITS.len() {
BIG5_LOW_BITS[rebased_pointer]
} else {
0
}
}
#[inline(always)]
pub fn big5_astral_encode(low_bits: u16) -> Option<usize> {
match low_bits {
0x00CC => Some(11205 - 942),
0x008A => Some(11207 - 942),
0x7607 => Some(11213 - 942),
_ => {
let mut i = 18997 - 942;
while i < BIG5_LOW_BITS.len() - 1 {
if BIG5_LOW_BITS[i] == low_bits && big5_is_astral(i) {
return Some(i);
}
i += 1;
}
None
}
}
}
#[cfg(not(feature = "less-slow-big5-hanzi-encode"))]
#[inline(always)]
pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) {
if let Some(hanzi_pointer) = position(&BIG5_LOW_BITS[(5495 - 942)..(10951 - 942)], bmp) {
let lead = hanzi_pointer / 157 + 0xA4;
let remainder = hanzi_pointer % 157;
let trail = if remainder < 0x3F {
remainder + 0x40
} else {
remainder + 0x62
};
return Some((lead as u8, trail as u8));
}
match bmp {
0x4E5A => {
return Some((0xC8, 0x7B));
}
0x5202 => {
return Some((0xC8, 0x7D));
}
0x9FB0 => {
return Some((0xC8, 0xA1));
}
0x5188 => {
return Some((0xC8, 0xA2));
}
0x9FB1 => {
return Some((0xC8, 0xA3));
}
_ => {
return None;
}
}
}
None
}
#[cfg(feature = "less-slow-big5-hanzi-encode")]
#[inline(always)]
pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) {
match BIG5_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) {
Ok(i) => {
let pair = &BIG5_LEVEL1_HANZI_BYTES[i];
Some((pair[0], pair[1]))
}
Err(_) => None,
}
} else {
None
}
}
#[inline(always)]
pub fn big5_box_encode(bmp: u16) -> Option<usize> {
position(&BIG5_LOW_BITS[(18963 - 942)..(18992 - 942)], bmp).map(|x| x + 18963)
}
#[inline(always)]
pub fn big5_other_encode(bmp: u16) -> Option<usize> {
if 0x4491 == bmp {
return Some(11209);
}
if let Some(pos) = position(&BIG5_LOW_BITS[(5024 - 942)..(5466 - 942)], bmp) {
return Some(pos + 5024);
}
if let Some(pos) = position(&BIG5_LOW_BITS[(10896 - 942)..(11205 - 942)], bmp) {
return Some(pos + 10896);
}
if let Some(pos) = position(&BIG5_LOW_BITS[(11254 - 942)..(18963 - 942)], bmp) {
return Some(pos + 11254);
}
let mut i = 18996 - 942;
while i < BIG5_LOW_BITS.len() {
if BIG5_LOW_BITS[i] == bmp && !big5_is_astral(i) {
return Some(i + 942);
}
i += 1;
}
None
}
#[inline(always)]
pub fn mul_94(lead: u8) -> usize {
lead as usize * 94
}
''')
data_file.write(data_rs_end)
data_file.close()
@ -1568,7 +1244,7 @@ write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
variant_file.write('''}
pub enum VariantEncoding {
SingleByte(&'static [u16; 128]),''')
SingleByte(&'static [u16; 128], u16, u8, u8),''')
for encoding in multi_byte:
variant_file.write("%s,\n" % to_camel_name(encoding["name"]))
@ -1578,7 +1254,7 @@ variant_file.write('''}
impl VariantEncoding {
pub fn new_variant_decoder(&self) -> VariantDecoder {
match *self {
VariantEncoding::SingleByte(table) => SingleByteDecoder::new(table),
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
VariantEncoding::Utf8 => Utf8Decoder::new(),
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
VariantEncoding::Big5 => Big5Decoder::new(),
@ -1595,7 +1271,7 @@ impl VariantEncoding {
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
match *self {
VariantEncoding::SingleByte(table) => SingleByteEncoder::new(encoding, table),
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
@ -1609,6 +1285,13 @@ impl VariantEncoding {
VariantEncoding::Utf16Le => unreachable!(),
}
}
pub fn is_single_byte(&self) -> bool {
match *self {
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
_ => false,
}
}
}
''')
@ -1653,7 +1336,7 @@ for name in preferred:
continue;
if is_single_byte(name):
single_byte_file.write("""
decode_single_byte(%s, %s_DATA);""" % (to_constant_name(name), to_constant_name(name)))
decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
single_byte_file.write("""
}
@ -1666,7 +1349,7 @@ for name in preferred:
continue;
if is_single_byte(name):
single_byte_file.write("""
encode_single_byte(%s, %s_DATA);""" % (to_constant_name(name), to_constant_name(name)))
encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
single_byte_file.write("""
@ -1748,25 +1431,48 @@ utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py
/// Bit is 1 if the trail is invalid.
pub static UTF8_TRAIL_INVALID: [u8; 256] = [""")
pub static UTF8_DATA: Utf8Data = Utf8Data {
table: [
""")
for i in range(256):
combined = 0
combined = (1 << 2) # invalid lead
if i < 0x80 or i > 0xBF:
combined |= (1 << 3)
combined |= (1 << 3) # normal trail
if i < 0xA0 or i > 0xBF:
combined |= (1 << 4)
combined |= (1 << 4) # three-byte special lower bound
if i < 0x80 or i > 0x9F:
combined |= (1 << 5)
combined |= (1 << 5) # three-byte special upper bound
if i < 0x90 or i > 0xBF:
combined |= (1 << 6)
combined |= (1 << 6) # four-byte special lower bound
if i < 0x80 or i > 0x8F:
combined |= (1 << 7)
combined |= (1 << 7) # four-byte special upper bound
utf_8_file.write("%d," % combined)
for i in range(128, 256):
lane = (1 << 2) # invalid lead
if i >= 0xC2 and i <= 0xDF:
lane = (1 << 3) # normal trail
elif i == 0xE0:
lane = (1 << 4) # three-byte special lower bound
elif i >= 0xE1 and i <= 0xEC:
lane = (1 << 3) # normal trail
elif i == 0xED:
lane = (1 << 5) # three-byte special upper bound
elif i >= 0xEE and i <= 0xEF:
lane = (1 << 3) # normal trail
elif i == 0xF0:
lane = (1 << 6) # four-byte special lower bound
elif i >= 0xF1 and i <= 0xF3:
lane = (1 << 3) # normal trail
elif i == 0xF4:
lane = (1 << 7) # four-byte special upper bound
utf_8_file.write("%d," % lane)
utf_8_file.write("""
];
],
};
""")
utf_8_file.write(utf_8_rs_end)

File diff suppressed because it is too large Load Diff

View File

@ -141,7 +141,7 @@ impl Big5Decoder {
}
}
} else if big5_is_astral(rebased_pointer) {
handle.write_astral(low_bits as u32 |
handle.write_astral(u32::from(low_bits) |
0x20000u32)
} else {
handle.write_bmp_excl_ascii(low_bits)

File diff suppressed because it is too large Load Diff

View File

@ -77,10 +77,10 @@ impl EucJpDecoder {
// and Katakana (10% according to Lunde).
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
// Hiragana
handle.write_upper_bmp(0x3041 + trail_minus_offset as u16)
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
// Katakana
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16)
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
} else if trail_minus_offset > (0xFE - 0xA1) {
if byte < 0x80 {
return (
@ -95,7 +95,7 @@ impl EucJpDecoder {
handle.written(),
);
} else {
let pointer = mul_94(jis0208_lead_minus_offset) + trail_minus_offset as usize;
let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
let level1_pointer = pointer.wrapping_sub(1410);
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
@ -160,7 +160,7 @@ impl EucJpDecoder {
handle.written(),
);
}
let pointer = mul_94(jis0212_lead_minus_offset) + trail_minus_offset as usize;
let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
let pointer_minus_kanji = pointer.wrapping_sub(1410);
if pointer_minus_kanji < JIS0212_KANJI.len() {
handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
@ -202,7 +202,7 @@ impl EucJpDecoder {
handle.written(),
);
}
handle.write_upper_bmp(0xFF61 + trail_minus_offset as u16)
handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
},
self,
non_ascii,
@ -217,6 +217,33 @@ impl EucJpDecoder {
);
}
/// Looks up the two-byte encoding of the BMP code point `bmp`.
///
/// This variant is compiled when the `fast-kanji-encode` feature is enabled
/// and simply forwards to `jis0208_kanji_euc_jp_encode`, returning whatever
/// that helper returns (`None` when it finds no mapping).
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
jis0208_kanji_euc_jp_encode(bmp)
}
/// Looks up the two-byte `(lead, trail)` encoding of the BMP code point `bmp`.
///
/// This variant is compiled when the `fast-kanji-encode` feature is disabled.
/// The lookups are tried in a fixed order and the first hit wins:
///
/// 1. the single ideograph that lives on the symbol row (U+4EDD),
/// 2. `jis0208_level1_kanji_euc_jp_encode`,
/// 3. `jis0208_level2_and_additional_kanji_encode` (leads start at 0xD0),
/// 4. a linear search of `IBM_KANJI` (leads start at 0xF9).
///
/// Returns `None` when none of the lookups finds `bmp`.
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    if bmp == 0x4EDD {
        // Ideograph on the symbol row!
        return Some((0xA1, 0xB8));
    }
    jis0208_level1_kanji_euc_jp_encode(bmp)
        .or_else(|| {
            jis0208_level2_and_additional_kanji_encode(bmp).map(|index| {
                // 94 trail positions per lead byte.
                let lead = (index / 94) + 0xD0;
                let trail = (index % 94) + 0xA1;
                (lead as u8, trail as u8)
            })
        })
        .or_else(|| {
            position(&IBM_KANJI[..], bmp).map(|index| {
                let lead = (index / 94) + 0xF9;
                let trail = (index % 94) + 0xA1;
                (lead as u8, trail as u8)
            })
        })
}
pub struct EucJpEncoder;
impl EucJpEncoder {
@ -245,19 +272,8 @@ impl EucJpEncoder {
if bmp_minus_hiragana < 0x53 {
handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
if 0x4EDD == bmp {
// Ideograph on the symbol row!
handle.write_two(0xA1, 0xB8)
} else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
if let Some((lead, trail)) = encode_kanji(bmp) {
handle.write_two(lead, trail)
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
let lead = (pos / 94) + 0xD0;
let trail = (pos % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
let lead = (pos / 94) + 0xF9;
let trail = (pos % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),

View File

@ -221,6 +221,69 @@ fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
None
}
/// Computes the `(lead, trail)` byte pair for the Hangul code point `bmp`.
///
/// This variant is compiled when the `fast-hangul-encode` feature is
/// disabled. The second argument (the code point rebased to the start of the
/// Hangul range) is only needed by the table-driven variant and is ignored
/// here.
///
/// `KSX1001_HANGUL` is binary-searched first. On a miss, the code point is
/// mapped through one of the two CP949 extension areas: the one "above"
/// KS X 1001 for `bmp < 0xC8A5`, otherwise the one "to the left" of it.
#[cfg(not(feature = "fast-hangul-encode"))]
#[inline(always)]
fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
    if let Ok(ksx_index) = KSX1001_HANGUL.binary_search(&bmp) {
        // 94 trail positions per lead byte.
        let lead = (ksx_index / 94) + (0x81 + 0x2F);
        let trail = (ksx_index % 94) + 0xA1;
        return (lead as u8, trail as u8);
    }
    // Pick the extension area: its pointer, its row width and its first lead.
    let (pointer, row_width, first_lead) = if bmp < 0xC8A5 {
        // Above KS X 1001
        (cp949_top_hangul_encode(bmp) as usize, 190 - 12, 0x81)
    } else {
        // To the left of KS X 1001
        (
            cp949_left_hangul_encode(bmp) as usize,
            190 - 94 - 12,
            0x81 + 0x20,
        )
    };
    let lead = ((pointer / row_width) + first_lead) as u8;
    let compact_trail = (pointer % row_width) as u8;
    // The compact trail index skips two gaps of six byte values each;
    // add them back depending on how far into the row the index is.
    let trail_base = if compact_trail >= (0x40 - 12) {
        0x41 + 12
    } else if compact_trail >= (0x20 - 6) {
        0x41 + 6
    } else {
        0x41
    };
    (lead, compact_trail + trail_base)
}
/// Computes the `(lead, trail)` byte pair for a Hangul code point.
///
/// This variant is compiled when the `fast-hangul-encode` feature is enabled.
/// The first argument (the raw code point) is ignored; the second argument,
/// `bmp_minus_hangul_start`, is passed straight to `cp949_hangul_encode`.
// NOTE(review): the caller appears to compute `bmp_minus_hangul_start` as
// `bmp.wrapping_sub(0xAC00)` and to range-check it first — confirm at the
// call site.
#[cfg(feature = "fast-hangul-encode")]
#[inline(always)]
fn ksx1001_encode_hangul(_: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
cp949_hangul_encode(bmp_minus_hangul_start)
}
/// Looks up the `(lead, trail)` byte pair for the Hanja code point `bmp`.
///
/// This variant is compiled when the `fast-hanja-encode` feature is disabled.
/// It searches `KSX1001_HANJA` with `position` and converts the resulting
/// index into a byte pair; returns `None` if `bmp` is not in the table.
#[cfg(not(feature = "fast-hanja-encode"))]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    position(&KSX1001_HANJA[..], bmp).map(|index| {
        // 94 trail positions per lead byte.
        let lead = (index / 94) + (0x81 + 0x49);
        let trail = (index % 94) + 0xA1;
        (lead as u8, trail as u8)
    })
}
/// Looks up the `(lead, trail)` byte pair for the Hanja code point `bmp`.
///
/// This variant is compiled when the `fast-hanja-encode` feature is enabled.
/// Code points at or above 0xF900 go to the compatibility lookup, whose
/// result is always wrapped in `Some`; everything below goes to the unified
/// lookup, which may itself return `None`.
// NOTE(review): the two helpers have "hangul" in their names although they
// are used for Hanja here — presumably a naming slip at their definition.
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    if bmp >= 0xF900 {
        return Some(ksx1001_compatibility_hangul_encode(bmp));
    }
    ksx1001_unified_hangul_encode(bmp)
}
pub struct EucKrEncoder;
impl EucKrEncoder {
@ -247,36 +310,7 @@ impl EucKrEncoder {
let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
// Hangul
match KSX1001_HANGUL.binary_search(&bmp) {
Ok(ksx_hangul_pointer) => {
let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
(ksx_hangul_lead, ksx_hangul_trail)
}
Err(_) => {
let (lead, cp949_trail) = if bmp < 0xC8A5 {
// Above KS X 1001
let top_pointer = cp949_top_hangul_encode(bmp) as usize;
let top_lead = (top_pointer / (190 - 12)) + 0x81;
let top_trail = top_pointer % (190 - 12);
(top_lead, top_trail)
} else {
// To the left of KS X 1001
let left_pointer = cp949_left_hangul_encode(bmp) as usize;
let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
let left_trail = left_pointer % (190 - 94 - 12);
(left_lead, left_trail)
};
let offset = if cp949_trail >= (0x40 - 12) {
0x41 + 12
} else if cp949_trail >= (0x20 - 6) {
0x41 + 6
} else {
0x41
};
(lead, cp949_trail + offset)
}
}
ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
} else if in_range16(bmp, 0x33DE, 0xFF01) {
// Vast range that includes no other
// mappables except Hangul (already
@ -284,9 +318,7 @@ impl EucKrEncoder {
// Narrow the range further to Unified and
// Compatibility ranges of Hanja.
if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
let hanja_trail = (hanja_pointer % 94) + 0xA1;
if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
(hanja_lead, hanja_trail)
} else {
return (
@ -303,7 +335,7 @@ impl EucKrEncoder {
);
}
} else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
(lead, trail)
(lead as u8, trail as u8)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
@ -311,7 +343,7 @@ impl EucKrEncoder {
handle.written(),
);
};
handle.write_two(lead as u8, trail as u8)
handle.write_two(lead, trail)
},
bmp,
self,

View File

@ -61,19 +61,23 @@ impl Gb18030Decoder {
fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(
self.pending.count() + match self.first {
None => 0,
Some(_) => 1,
} + match self.second {
None => 0,
Some(_) => 1,
} + match self.third {
None => 0,
Some(_) => 1,
} + match self.pending_ascii {
None => 0,
Some(_) => 1,
},
self.pending.count()
+ match self.first {
None => 0,
Some(_) => 1,
}
+ match self.second {
None => 0,
Some(_) => 1,
}
+ match self.third {
None => 0,
Some(_) => 1,
}
+ match self.pending_ascii {
None => 0,
Some(_) => 1,
},
)
}
@ -257,9 +261,9 @@ impl Gb18030Decoder {
} else {
handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
}
} else if pointer >= 189000 && pointer <= 1237575 {
} else if pointer >= 189_000 && pointer <= 1_237_575 {
// Astral
handle.write_astral((pointer - (189000usize - 0x10000usize)) as u32)
handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
} else {
return (DecoderResult::Malformed(4, 0),
unread_handle_fourth.consumed(),
@ -391,6 +395,40 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
None
}
/// Computes the `(lead, trail)` byte pair for the ideograph `bmp`.
///
/// This variant is compiled when the `fast-gb-hanzi-encode` feature is
/// disabled. The second argument is only needed by the table-driven variant
/// and is ignored here.
///
/// Lookups are tried in order: `gb2312_level1_hanzi_encode`, then
/// `gb2312_level2_hanzi_encode`, and finally one of the two GBK ideograph
/// areas: the one "above" GB2312 for `bmp < 0x72DC`, otherwise the one
/// "to the left" of it.
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
#[inline(always)]
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
    if let Some(byte_pair) = gb2312_level1_hanzi_encode(bmp) {
        return byte_pair;
    }
    if let Some(level2_index) = gb2312_level2_hanzi_encode(bmp) {
        // 94 trail positions per lead byte.
        let lead = (level2_index / 94) + (0xD8);
        let trail = (level2_index % 94) + 0xA1;
        return (lead as u8, trail as u8);
    }
    // Pick the GBK area: its pointer, its row width and its first lead.
    let (pointer, row_width, first_lead) = if bmp < 0x72DC {
        // Above GB2312
        (gbk_top_ideograph_encode(bmp) as usize, 190, 0x81)
    } else {
        // To the left of GB2312
        (
            gbk_left_ideograph_encode(bmp) as usize,
            190 - 94,
            0x81 + 0x29,
        )
    };
    let lead = (pointer / row_width) + first_lead;
    let compact_trail = pointer % row_width;
    // The trail index skips one byte value once it reaches 0x3F.
    let trail_base = if compact_trail < 0x3F { 0x40 } else { 0x41 };
    (lead as u8, (compact_trail + trail_base) as u8)
}
/// Computes the `(lead, trail)` byte pair for an ideograph.
///
/// This variant is compiled when the `fast-gb-hanzi-encode` feature is
/// enabled. The first argument (the raw code point) is ignored; the second
/// argument, `bmp_minus_unified_start`, is passed straight to
/// `gbk_hanzi_encode`.
// NOTE(review): presumably `bmp_minus_unified_start` is the code point
// rebased to the start of the CJK Unified Ideographs range and already
// range-checked by the caller — confirm at the call site.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
gbk_hanzi_encode(bmp_minus_unified_start)
}
pub struct Gb18030Encoder {
extended: bool,
}
@ -447,33 +485,8 @@ impl Gb18030Encoder {
// CJK Unified Ideographs
// Can't fail now, since all are
// mapped.
// XXX Can we do something smarter
// than linear search for GB2312
// Level 2 Hanzi, which are almost
// Unicode-ordered?
if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
handle.write_two(lead, trail)
} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
handle.write_two(hanzi_lead as u8, hanzi_trail as u8)
} else {
let (lead, gbk_trail) = if bmp < 0x72DC {
// Above GB2312
let pointer = gbk_top_ideograph_encode(bmp) as usize;
let lead = (pointer / 190) + 0x81;
let gbk_trail = pointer % 190;
(lead, gbk_trail)
} else {
// To the left of GB2312
let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
(lead, gbk_trail)
};
let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
handle.write_two(lead as u8, (gbk_trail + offset) as u8)
}
let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
handle.write_two(lead, trail)
} else if bmp == 0xE5E5 {
// It's not optimal to check for the unmappable
// and for euro at this stage, but getting
@ -522,7 +535,7 @@ impl Gb18030Encoder {
handle.written(),
);
}
let range_pointer = astral as usize + (189000usize - 0x10000usize);
let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
let first = range_pointer / (10 * 126 * 10);
let rem_first = range_pointer % (10 * 126 * 10);
let second = rem_first / (10 * 126);

View File

@ -16,28 +16,24 @@
//! the plan is to replace the internals with unsafe code that omits the
//! bound check at the read/write time.
#[cfg(
all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
#[cfg(all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
)]
))]
use simd_funcs::*;
#[cfg(
all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
#[cfg(all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
)]
))]
use simd::u16x8;
use super::DecoderResult;
@ -92,6 +88,7 @@ impl Endian for LittleEndian {
const OPPOSITE_ENDIAN: bool = true;
}
#[derive(Debug, Copy, Clone)]
struct UnalignedU16Slice {
ptr: *const u8,
len: usize,
@ -114,11 +111,7 @@ impl UnalignedU16Slice {
assert!(i < self.len);
unsafe {
let mut u: u16 = ::std::mem::uninitialized();
::std::ptr::copy_nonoverlapping(
self.ptr.offset((i * 2) as isize),
&mut u as *mut u16 as *mut u8,
2,
);
::std::ptr::copy_nonoverlapping(self.ptr.add(i * 2), &mut u as *mut u16 as *mut u8, 2);
u
}
}
@ -128,7 +121,7 @@ impl UnalignedU16Slice {
pub fn simd_at(&self, i: usize) -> u16x8 {
assert!(i + SIMD_STRIDE_SIZE / 2 <= self.len);
let byte_index = i * 2;
unsafe { to_u16_lanes(load16_unaligned(self.ptr.offset(byte_index as isize))) }
unsafe { to_u16_lanes(load16_unaligned(self.ptr.add(byte_index))) }
}
#[inline(always)]
@ -141,7 +134,7 @@ impl UnalignedU16Slice {
// XXX the return value should be restricted not to
// outlive self.
assert!(from <= self.len);
unsafe { UnalignedU16Slice::new(self.ptr.offset((from * 2) as isize), self.len - from) }
unsafe { UnalignedU16Slice::new(self.ptr.add(from * 2), self.len - from) }
}
#[cfg(feature = "simd-accel")]
@ -157,7 +150,7 @@ impl UnalignedU16Slice {
simd = simd_byte_swap(simd);
}
unsafe {
store8_unaligned(other.as_mut_ptr().offset(offset as isize), simd);
store8_unaligned(other.as_mut_ptr().add(offset), simd);
}
if contains_surrogates(simd) {
break;
@ -183,9 +176,9 @@ impl UnalignedU16Slice {
#[inline(always)]
fn copy_bmp_to<E: Endian>(&self, other: &mut [u16]) -> Option<(u16, usize)> {
assert!(self.len <= other.len());
for i in 0..self.len {
for (i, target) in other.iter_mut().enumerate().take(self.len) {
let unit = swap_if_opposite_endian::<E>(self.at(i));
other[i] = unit;
*target = unit;
if super::in_range16(unit, 0xD800, 0xE000) {
return Some((unit, i));
}
@ -255,7 +248,7 @@ fn copy_unaligned_basic_latin_to_ascii<E: Endian>(
}
let packed = simd_pack(first, second);
unsafe {
store16_unaligned(dst.as_mut_ptr().offset(offset as isize), packed);
store16_unaligned(dst.as_mut_ptr().add(offset), packed);
}
offset += SIMD_STRIDE_SIZE;
if offset > len_minus_stride {
@ -302,16 +295,16 @@ fn convert_unaligned_utf16_to_utf8<E: Endian>(
let non_ascii_minus_surrogate_start = non_ascii.wrapping_sub(0xD800);
if non_ascii_minus_surrogate_start > (0xDFFF - 0xD800) {
if non_ascii < 0x800 {
dst[dst_pos] = ((non_ascii as u32 >> 6) | 0xC0u32) as u8;
dst[dst_pos] = ((non_ascii >> 6) | 0xC0) as u8;
dst_pos += 1;
dst[dst_pos] = ((non_ascii as u32 & 0x3Fu32) | 0x80u32) as u8;
dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8;
dst_pos += 1;
} else {
dst[dst_pos] = ((non_ascii as u32 >> 12) | 0xE0u32) as u8;
dst[dst_pos] = ((non_ascii >> 12) | 0xE0) as u8;
dst_pos += 1;
dst[dst_pos] = (((non_ascii as u32 & 0xFC0u32) >> 6) | 0x80u32) as u8;
dst[dst_pos] = (((non_ascii & 0xFC0) >> 6) | 0x80) as u8;
dst_pos += 1;
dst[dst_pos] = ((non_ascii as u32 & 0x3Fu32) | 0x80u32) as u8;
dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8;
dst_pos += 1;
}
} else if non_ascii_minus_surrogate_start <= (0xDBFF - 0xD800) {
@ -322,7 +315,7 @@ fn convert_unaligned_utf16_to_utf8<E: Endian>(
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
// The next code unit is a low surrogate. Advance position.
src_pos += 1;
let point = ((non_ascii as u32) << 10) + (second as u32)
let point = (u32::from(non_ascii) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
dst[dst_pos] = ((point >> 18) | 0xF0u32) as u8;
@ -613,7 +606,7 @@ impl<'a> Utf16Destination<'a> {
#[inline(always)]
fn write_ascii(&mut self, ascii: u8) {
debug_assert!(ascii < 0x80);
self.write_code_unit(ascii as u16);
self.write_code_unit(u16::from(ascii));
}
#[inline(always)]
fn write_bmp(&mut self, bmp: u16) {
@ -637,7 +630,7 @@ impl<'a> Utf16Destination<'a> {
#[inline(always)]
fn write_astral(&mut self, astral: u32) {
debug_assert!(astral > 0xFFFF);
debug_assert!(astral <= 0x10FFFF);
debug_assert!(astral <= 0x10_FFFF);
self.write_code_unit((0xD7C0 + (astral >> 10)) as u16);
self.write_code_unit((0xDC00 + (astral & 0x3FF)) as u16);
}
@ -962,15 +955,15 @@ impl<'a> Utf8Destination<'a> {
fn write_mid_bmp(&mut self, mid_bmp: u16) {
debug_assert!(mid_bmp >= 0x80);
debug_assert!(mid_bmp < 0x800);
self.write_code_unit(((mid_bmp as u32 >> 6) | 0xC0u32) as u8);
self.write_code_unit(((mid_bmp as u32 & 0x3Fu32) | 0x80u32) as u8);
self.write_code_unit(((mid_bmp >> 6) | 0xC0) as u8);
self.write_code_unit(((mid_bmp & 0x3F) | 0x80) as u8);
}
#[inline(always)]
fn write_upper_bmp(&mut self, upper_bmp: u16) {
debug_assert!(upper_bmp >= 0x800);
self.write_code_unit(((upper_bmp as u32 >> 12) | 0xE0u32) as u8);
self.write_code_unit((((upper_bmp as u32 & 0xFC0u32) >> 6) | 0x80u32) as u8);
self.write_code_unit(((upper_bmp as u32 & 0x3Fu32) | 0x80u32) as u8);
self.write_code_unit(((upper_bmp >> 12) | 0xE0) as u8);
self.write_code_unit((((upper_bmp & 0xFC0) >> 6) | 0x80) as u8);
self.write_code_unit(((upper_bmp & 0x3F) | 0x80) as u8);
}
#[inline(always)]
fn write_bmp_excl_ascii(&mut self, bmp: u16) {
@ -983,16 +976,17 @@ impl<'a> Utf8Destination<'a> {
#[inline(always)]
fn write_astral(&mut self, astral: u32) {
debug_assert!(astral > 0xFFFF);
debug_assert!(astral <= 0x10FFFF);
self.write_code_unit(((astral >> 18) | 0xF0u32) as u8);
self.write_code_unit((((astral & 0x3F000u32) >> 12) | 0x80u32) as u8);
self.write_code_unit((((astral & 0xFC0u32) >> 6) | 0x80u32) as u8);
self.write_code_unit(((astral & 0x3Fu32) | 0x80u32) as u8);
debug_assert!(astral <= 0x10_FFFF);
self.write_code_unit(((astral >> 18) | 0xF0) as u8);
self.write_code_unit((((astral & 0x3F000) >> 12) | 0x80) as u8);
self.write_code_unit((((astral & 0xFC0) >> 6) | 0x80) as u8);
self.write_code_unit(((astral & 0x3F) | 0x80) as u8);
}
#[inline(always)]
pub fn write_surrogate_pair(&mut self, high: u16, low: u16) {
self.write_astral(
((high as u32) << 10) + (low as u32) - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
(u32::from(high) << 10) + u32::from(low)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
);
}
#[inline(always)]
@ -1088,13 +1082,7 @@ impl<'a> Utf8Destination<'a> {
// Validate first, then memcpy to let memcpy do its thing even for
// non-ASCII. (And potentially do something better than SSE2 for ASCII.)
let valid_len = utf8_valid_up_to(&src_remaining[..min_len]);
unsafe {
::std::ptr::copy_nonoverlapping(
src_remaining.as_ptr(),
dst_remaining.as_mut_ptr(),
valid_len,
);
}
(&mut dst_remaining[..valid_len]).copy_from_slice(&src_remaining[..valid_len]);
source.pos += valid_len;
self.pos += valid_len;
}
@ -1162,23 +1150,24 @@ impl<'a> Utf16Source<'a> {
#[inline(always)]
fn read(&mut self) -> char {
self.old_pos = self.pos;
let unit = self.slice[self.pos] as u32;
let unit = self.slice[self.pos];
self.pos += 1;
let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
return unsafe { ::std::mem::transmute(unit) };
return unsafe { ::std::char::from_u32_unchecked(u32::from(unit)) };
}
if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
// high surrogate
if self.pos < self.slice.len() {
let second = self.slice[self.pos] as u32;
let second = self.slice[self.pos];
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
// The next code unit is a low surrogate. Advance position.
self.pos += 1;
return unsafe {
::std::mem::transmute(
(unit << 10) + second - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
::std::char::from_u32_unchecked(
(u32::from(unit) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
)
};
}
@ -1207,14 +1196,14 @@ impl<'a> Utf16Source<'a> {
if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
// high surrogate
if self.pos < self.slice.len() {
let second = self.slice[self.pos] as u32;
let second = self.slice[self.pos];
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
// The next code unit is a low surrogate. Advance position.
self.pos += 1;
return Unicode::NonAscii(NonAscii::Astral(unsafe {
::std::mem::transmute(
((unit as u32) << 10) + (second as u32)
::std::char::from_u32_unchecked(
(u32::from(unit) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
)
}));
@ -1271,14 +1260,14 @@ impl<'a> Utf16Source<'a> {
} else if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
// high surrogate
if self.pos < self.slice.len() {
let second = self.slice[self.pos] as u32;
let second = self.slice[self.pos];
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
// The next code unit is a low surrogate. Advance position.
self.pos += 1;
NonAscii::Astral(unsafe {
::std::mem::transmute(
((unit as u32) << 10) + (second as u32)
::std::char::from_u32_unchecked(
(u32::from(unit) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
)
})
@ -1344,15 +1333,15 @@ impl<'a> Utf16Source<'a> {
// Unpaired surrogate at the end of the buffer.
NonAscii::BmpExclAscii(0xFFFDu16)
} else {
let second = self.slice[self.pos] as u32;
let second = self.slice[self.pos];
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
// The next code unit is a low surrogate. Advance position.
self.pos += 1;
NonAscii::Astral(unsafe {
::std::mem::transmute(
((unit as u32) << 10) + (second as u32)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
::std::char::from_u32_unchecked(
(u32::from(unit) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
)
})
} else {
@ -1469,57 +1458,60 @@ impl<'a> Utf8Source<'a> {
#[inline(always)]
fn read(&mut self) -> char {
self.old_pos = self.pos;
let unit = self.slice[self.pos] as u32;
if unit < 0x80u32 {
let unit = self.slice[self.pos];
if unit < 0x80 {
self.pos += 1;
return unsafe { ::std::mem::transmute(unit) };
return char::from(unit);
}
if unit < 0xE0u32 {
let point = ((unit & 0x1Fu32) << 6) | (self.slice[self.pos + 1] as u32 & 0x3Fu32);
if unit < 0xE0 {
let point =
((u32::from(unit) & 0x1F) << 6) | (u32::from(self.slice[self.pos + 1]) & 0x3F);
self.pos += 2;
return unsafe { ::std::mem::transmute(point) };
return unsafe { ::std::char::from_u32_unchecked(point) };
}
if unit < 0xF0u32 {
let point = ((unit & 0xFu32) << 12)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
if unit < 0xF0 {
let point = ((u32::from(unit) & 0xF) << 12)
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 6)
| (u32::from(self.slice[self.pos + 2]) & 0x3F);
self.pos += 3;
return unsafe { ::std::mem::transmute(point) };
return unsafe { ::std::char::from_u32_unchecked(point) };
}
let point = ((unit & 0x7u32) << 18)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
let point = ((u32::from(unit) & 0x7) << 18)
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
self.pos += 4;
unsafe { ::std::mem::transmute(point) }
unsafe { ::std::char::from_u32_unchecked(point) }
}
#[inline(always)]
fn read_enum(&mut self) -> Unicode {
self.old_pos = self.pos;
let unit = self.slice[self.pos];
if unit < 0x80u8 {
if unit < 0x80 {
self.pos += 1;
return Unicode::Ascii(unit);
}
if unit < 0xE0u8 {
if unit < 0xE0 {
let point =
(((unit as u32) & 0x1Fu32) << 6) | (self.slice[self.pos + 1] as u32 & 0x3Fu32);
((u16::from(unit) & 0x1F) << 6) | (u16::from(self.slice[self.pos + 1]) & 0x3F);
self.pos += 2;
return Unicode::NonAscii(NonAscii::BmpExclAscii(point as u16));
return Unicode::NonAscii(NonAscii::BmpExclAscii(point));
}
if unit < 0xF0u8 {
let point = (((unit as u32) & 0xFu32) << 12)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
if unit < 0xF0 {
let point = ((u16::from(unit) & 0xF) << 12)
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
self.pos += 3;
return Unicode::NonAscii(NonAscii::BmpExclAscii(point as u16));
return Unicode::NonAscii(NonAscii::BmpExclAscii(point));
}
let point = (((unit as u32) & 0x7u32) << 18)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
let point = ((u32::from(unit) & 0x7) << 18)
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
self.pos += 4;
Unicode::NonAscii(NonAscii::Astral(unsafe { ::std::mem::transmute(point) }))
Unicode::NonAscii(NonAscii::Astral(unsafe {
::std::char::from_u32_unchecked(point)
}))
}
#[inline(always)]
fn unread(&mut self) -> usize {
@ -1556,25 +1548,24 @@ impl<'a> Utf8Source<'a> {
dest.pos += consumed;
// We don't need to check space in destination, because
// `ascii_to_ascii()` already did.
let non_ascii32 = non_ascii as u32;
if non_ascii32 < 0xE0u32 {
let point = ((non_ascii32 & 0x1Fu32) << 6)
| (self.slice[self.pos + 1] as u32 & 0x3Fu32);
if non_ascii < 0xE0 {
let point = ((u16::from(non_ascii) & 0x1F) << 6)
| (u16::from(self.slice[self.pos + 1]) & 0x3F);
self.pos += 2;
NonAscii::BmpExclAscii(point as u16)
} else if non_ascii32 < 0xF0u32 {
let point = ((non_ascii32 & 0xFu32) << 12)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
NonAscii::BmpExclAscii(point)
} else if non_ascii < 0xF0 {
let point = ((u16::from(non_ascii) & 0xF) << 12)
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
self.pos += 3;
NonAscii::BmpExclAscii(point as u16)
NonAscii::BmpExclAscii(point)
} else {
let point = ((non_ascii32 & 0x7u32) << 18)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
let point = ((u32::from(non_ascii) & 0x7) << 18)
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
self.pos += 4;
NonAscii::Astral(unsafe { ::std::mem::transmute(point) })
NonAscii::Astral(unsafe { ::std::char::from_u32_unchecked(point) })
}
}
}
@ -1607,25 +1598,24 @@ impl<'a> Utf8Source<'a> {
self.pos += consumed;
dest.pos += consumed;
if dest.pos + 1 < dst_len {
let non_ascii32 = non_ascii as u32;
if non_ascii32 < 0xE0u32 {
let point = ((non_ascii32 & 0x1Fu32) << 6)
| (self.slice[self.pos + 1] as u32 & 0x3Fu32);
if non_ascii < 0xE0 {
let point = ((u16::from(non_ascii) & 0x1F) << 6)
| (u16::from(self.slice[self.pos + 1]) & 0x3F);
self.pos += 2;
NonAscii::BmpExclAscii(point as u16)
} else if non_ascii32 < 0xF0u32 {
let point = ((non_ascii32 & 0xFu32) << 12)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
NonAscii::BmpExclAscii(point)
} else if non_ascii < 0xF0 {
let point = ((u16::from(non_ascii) & 0xF) << 12)
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
self.pos += 3;
NonAscii::BmpExclAscii(point as u16)
NonAscii::BmpExclAscii(point)
} else {
let point = ((non_ascii32 & 0x7u32) << 18)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
let point = ((u32::from(non_ascii) & 0x7) << 18)
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
self.pos += 4;
NonAscii::Astral(unsafe { ::std::mem::transmute(point) })
NonAscii::Astral(unsafe { ::std::char::from_u32_unchecked(point) })
}
} else {
return CopyAsciiResult::Stop((
@ -1665,25 +1655,24 @@ impl<'a> Utf8Source<'a> {
self.pos += consumed;
dest.pos += consumed;
if dest.pos + 3 < dst_len {
let non_ascii32 = non_ascii as u32;
if non_ascii32 < 0xE0u32 {
let point = ((non_ascii32 & 0x1Fu32) << 6)
| (self.slice[self.pos + 1] as u32 & 0x3Fu32);
if non_ascii < 0xE0 {
let point = ((u16::from(non_ascii) & 0x1F) << 6)
| (u16::from(self.slice[self.pos + 1]) & 0x3F);
self.pos += 2;
NonAscii::BmpExclAscii(point as u16)
} else if non_ascii32 < 0xF0u32 {
let point = ((non_ascii32 & 0xFu32) << 12)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
NonAscii::BmpExclAscii(point)
} else if non_ascii < 0xF0 {
let point = ((u16::from(non_ascii) & 0xF) << 12)
| ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6)
| (u16::from(self.slice[self.pos + 2]) & 0x3F);
self.pos += 3;
NonAscii::BmpExclAscii(point as u16)
NonAscii::BmpExclAscii(point)
} else {
let point = ((non_ascii32 & 0x7u32) << 18)
| ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
let point = ((u32::from(non_ascii) & 0x7) << 18)
| ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12)
| ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6)
| (u32::from(self.slice[self.pos + 3]) & 0x3F);
self.pos += 4;
NonAscii::Astral(unsafe { ::std::mem::transmute(point) })
NonAscii::Astral(unsafe { ::std::char::from_u32_unchecked(point) })
}
} else {
return CopyAsciiResult::Stop((

View File

@ -107,7 +107,7 @@ impl Iso2022JpDecoder {
}
Iso2022JpDecoderState::Katakana => {
destination_handle
.write_upper_bmp(self.lead as u16 - 0x21u16 + 0xFF61u16);
.write_upper_bmp(u16::from(self.lead) - 0x21u16 + 0xFF61u16);
self.lead = 0x0u8;
}
Iso2022JpDecoderState::LeadByte => {
@ -183,7 +183,7 @@ impl Iso2022JpDecoder {
}
self.output_flag = false;
if b >= 0x21u8 && b <= 0x5Fu8 {
destination_handle.write_upper_bmp(b as u16 - 0x21u16 + 0xFF61u16);
destination_handle.write_upper_bmp(u16::from(b) - 0x21u16 + 0xFF61u16);
continue;
}
return (
@ -231,11 +231,11 @@ impl Iso2022JpDecoder {
// and Katakana (10% according to Lunde).
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
// Hiragana
handle.write_upper_bmp(0x3041 + trail_minus_offset as u16);
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset));
continue;
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
// Katakana
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16);
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset));
continue;
} else if trail_minus_offset > (0xFE - 0xA1) {
return (
@ -356,7 +356,46 @@ impl Iso2022JpDecoder {
);
}
#[cfg_attr(feature = "cargo-clippy", allow(if_let_redundant_pattern_matching, if_same_then_else))]
/// Returns `true` when `jis0208_kanji_shift_jis_encode` yields a mapping
/// for `bmp`.
///
/// This variant is compiled only when the `fast-kanji-encode` feature is
/// enabled. Only the presence of a mapping is reported; the encoded byte
/// values are discarded.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn is_kanji_mapped(bmp: u16) -> bool {
// Use the shift_jis variant, because we don't care about the
// byte values here.
jis0208_kanji_shift_jis_encode(bmp).is_some()
}
/// Returns `true` when `bmp` is accepted by one of the kanji lookups.
///
/// This variant is compiled only when the `fast-kanji-encode` feature is
/// disabled. The candidates are tried in order and evaluation stops at the
/// first hit: the special case U+4EDD, then
/// `jis0208_level1_kanji_shift_jis_encode`, then
/// `jis0208_level2_and_additional_kanji_encode`, and finally a search of
/// `IBM_KANJI`. Only the presence of a mapping is reported; the encoded
/// values themselves are discarded.
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn is_kanji_mapped(bmp: u16) -> bool {
    // U+4EDD is special-cased before any table is consulted.
    bmp == 0x4EDD
        // Use the shift_jis variant, because we don't care about the
        // byte values here.
        || jis0208_level1_kanji_shift_jis_encode(bmp).is_some()
        || jis0208_level2_and_additional_kanji_encode(bmp).is_some()
        || position(&IBM_KANJI[..], bmp).is_some()
}
#[cfg_attr(
feature = "cargo-clippy",
allow(
if_let_redundant_pattern_matching,
if_same_then_else
)
)]
fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
// The code below uses else after return to
// keep the same structure as in EUC-JP.
@ -365,19 +404,7 @@ fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
if bmp_minus_hiragana < 0x53 {
true
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
if 0x4EDD == bmp {
true
} else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) {
// Use the shift_jis variant, because we don't care about the
// byte values here.
true
} else if let Some(_) = jis0208_level2_and_additional_kanji_encode(bmp) {
true
} else if let Some(_) = position(&IBM_KANJI[..], bmp) {
true
} else {
false
}
is_kanji_mapped(bmp)
} else {
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
if bmp_minus_katakana < 0x56 {
@ -406,6 +433,33 @@ fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
}
}
/// Returns the two-byte pair produced by `jis0208_kanji_iso_2022_jp_encode`
/// for `bmp`, or `None` when that lookup has no mapping.
///
/// This variant is compiled only when the `fast-kanji-encode` feature is
/// enabled; it forwards the lookup result unchanged.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
jis0208_kanji_iso_2022_jp_encode(bmp)
}
/// Returns the two-byte `(lead, trail)` pair for `bmp`, or `None` when no
/// lookup accepts it.
///
/// This variant is compiled only when the `fast-kanji-encode` feature is
/// disabled. Lookups are attempted in a fixed order and the first hit wins:
/// the special case U+4EDD, `jis0208_level1_kanji_iso_2022_jp_encode`,
/// `jis0208_level2_and_additional_kanji_encode`, then a search of
/// `IBM_KANJI`. The last two yield a linear index, which is split into a
/// quotient and remainder by 94 and offset into the lead and trail bytes.
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    if bmp == 0x4EDD {
        // Ideograph on the symbol row!
        return Some((0x21, 0xB8 - 0x80));
    }
    if let Some(pair) = jis0208_level1_kanji_iso_2022_jp_encode(bmp) {
        return Some(pair);
    }
    if let Some(index) = jis0208_level2_and_additional_kanji_encode(bmp) {
        let row = (index / 94) + (0xD0 - 0x80);
        let cell = (index % 94) + 0x21;
        return Some((row as u8, cell as u8));
    }
    position(&IBM_KANJI[..], bmp).map(|index| {
        let row = (index / 94) + (0xF9 - 0x80);
        let cell = (index % 94) + 0x21;
        (row as u8, cell as u8)
    })
}
enum Iso2022JpEncoderState {
Ascii,
Roman,
@ -605,25 +659,9 @@ impl Iso2022JpEncoder {
handle.write_two(0x24, 0x21 + bmp_minus_hiragana as u8);
continue;
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
if 0x4EDD == bmp {
// Ideograph on the symbol row!
handle.write_two(0x21, 0xB8 - 0x80);
continue;
} else if let Some((lead, trail)) =
jis0208_level1_kanji_iso_2022_jp_encode(bmp)
{
if let Some((lead, trail)) = encode_kanji(bmp) {
handle.write_two(lead, trail);
continue;
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
let lead = (pos / 94) + (0xD0 - 0x80);
let trail = (pos % 94) + 0x21;
handle.write_two(lead as u8, trail as u8);
continue;
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
let lead = (pos / 94) + (0xF9 - 0x80);
let trail = (pos % 94) + 0x21;
handle.write_two(lead as u8, trail as u8);
continue;
} else {
self.state = Iso2022JpEncoderState::Ascii;
return (

View File

@ -7,8 +7,15 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#![cfg_attr(feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self))]
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.9")]
#![cfg_attr(
feature = "cargo-clippy",
allow(
doc_markdown,
inline_always,
new_ret_no_self
)
)]
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.12")]
//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
@ -82,10 +89,7 @@
//! // Very short output buffer to demonstrate the output buffer getting full.
//! // Normally, you'd use something like `[0u8; 2048]`.
//! let mut buffer_bytes = [0u8; 8];
//! // Rust doesn't allow us to stack-allocate a `mut str` without `unsafe`.
//! let mut buffer: &mut str = unsafe {
//! std::mem::transmute(&mut buffer_bytes[..])
//! };
//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
//!
//! // How many bytes in the buffer currently hold significant data.
//! let mut bytes_in_buffer = 0usize;
@ -231,16 +235,17 @@
//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
//! of implementation.
//!
//! Despite the focus on the Web, encoding_rs may well be useful for decoding
//! email, although you'll need to implement UTF-7 decoding and label handling
//! by other means. (Due to the Web focus, patches to add UTF-7 are unwelcome
//! in encoding_rs itself.) Also, despite the browser focus, the hope is that
//! non-browser applications that wish to consume Web content or submit Web
//! forms in a Web-compatible way will find encoding_rs useful. While
//! encoding_rs does not try to match Windows behavior, many of the encodings
//! are close enough to legacy encodings implemented by Windows that
//! applications that need to consume data in legacy Windows encodins may
//! find encoding_rs useful.
//! Despite the browser focus, the hope is that non-browser applications
//! that wish to consume Web content or submit Web forms in a Web-compatible
//! way will find encoding_rs useful. While encoding_rs does not try to match
//! Windows behavior, many of the encodings are close enough to legacy
//! encodings implemented by Windows that applications that need to consume
//! data in legacy Windows encodings may find encoding_rs useful.
//!
//! For decoding email, UTF-7 support is needed (unfortunately) in addition
//! to the encodings defined in the Encoding Standard. The
//! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
//! UTF-7 decoding for email purposes.
//!
//! # Streaming & Non-Streaming; Rust & C/C++
//!
@ -660,22 +665,21 @@
//! for discussion about the UTF-16 family.
#![cfg_attr(
feature = "simd-accel", feature(cfg_target_feature, platform_intrinsics, core_intrinsics)
feature = "simd-accel",
feature(platform_intrinsics, core_intrinsics)
)]
#[macro_use]
extern crate cfg_if;
#[cfg(
all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
#[cfg(all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
)]
))]
extern crate simd;
#[cfg(feature = "serde")]
@ -692,26 +696,15 @@ extern crate serde_json;
#[macro_use]
mod macros;
#[cfg(
all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
)
)]
mod simd_funcs;
#[cfg(
#[cfg(all(
feature = "simd-accel",
any(
all(feature = "simd-accel", target_feature = "sse2"),
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_arch = "arm")
all(target_endian = "little", target_feature = "neon")
)
)]
mod utf_8_core;
))]
mod simd_funcs;
#[cfg(test)]
mod testing;
@ -934,7 +927,7 @@ pub static GBK: &'static Encoding = &GBK_INIT;
/// items.
pub static IBM866_INIT: Encoding = Encoding {
name: "IBM866",
variant: VariantEncoding::SingleByte(data::IBM866_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
};
/// The IBM866 encoding.
@ -1004,7 +997,7 @@ pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
/// items.
pub static ISO_8859_10_INIT: Encoding = Encoding {
name: "ISO-8859-10",
variant: VariantEncoding::SingleByte(data::ISO_8859_10_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
};
/// The ISO-8859-10 encoding.
@ -1038,7 +1031,7 @@ pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
/// items.
pub static ISO_8859_13_INIT: Encoding = Encoding {
name: "ISO-8859-13",
variant: VariantEncoding::SingleByte(data::ISO_8859_13_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
};
/// The ISO-8859-13 encoding.
@ -1072,7 +1065,7 @@ pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
/// items.
pub static ISO_8859_14_INIT: Encoding = Encoding {
name: "ISO-8859-14",
variant: VariantEncoding::SingleByte(data::ISO_8859_14_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
};
/// The ISO-8859-14 encoding.
@ -1106,7 +1099,7 @@ pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
/// items.
pub static ISO_8859_15_INIT: Encoding = Encoding {
name: "ISO-8859-15",
variant: VariantEncoding::SingleByte(data::ISO_8859_15_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
};
/// The ISO-8859-15 encoding.
@ -1139,7 +1132,7 @@ pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
/// items.
pub static ISO_8859_16_INIT: Encoding = Encoding {
name: "ISO-8859-16",
variant: VariantEncoding::SingleByte(data::ISO_8859_16_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
};
/// The ISO-8859-16 encoding.
@ -1173,7 +1166,7 @@ pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
/// items.
pub static ISO_8859_2_INIT: Encoding = Encoding {
name: "ISO-8859-2",
variant: VariantEncoding::SingleByte(data::ISO_8859_2_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
};
/// The ISO-8859-2 encoding.
@ -1205,7 +1198,7 @@ pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
/// items.
pub static ISO_8859_3_INIT: Encoding = Encoding {
name: "ISO-8859-3",
variant: VariantEncoding::SingleByte(data::ISO_8859_3_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
};
/// The ISO-8859-3 encoding.
@ -1237,7 +1230,7 @@ pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
/// items.
pub static ISO_8859_4_INIT: Encoding = Encoding {
name: "ISO-8859-4",
variant: VariantEncoding::SingleByte(data::ISO_8859_4_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
};
/// The ISO-8859-4 encoding.
@ -1269,7 +1262,7 @@ pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
/// items.
pub static ISO_8859_5_INIT: Encoding = Encoding {
name: "ISO-8859-5",
variant: VariantEncoding::SingleByte(data::ISO_8859_5_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
};
/// The ISO-8859-5 encoding.
@ -1301,7 +1294,7 @@ pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
/// items.
pub static ISO_8859_6_INIT: Encoding = Encoding {
name: "ISO-8859-6",
variant: VariantEncoding::SingleByte(data::ISO_8859_6_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
};
/// The ISO-8859-6 encoding.
@ -1334,7 +1327,7 @@ pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
/// items.
pub static ISO_8859_7_INIT: Encoding = Encoding {
name: "ISO-8859-7",
variant: VariantEncoding::SingleByte(data::ISO_8859_7_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
};
/// The ISO-8859-7 encoding.
@ -1371,7 +1364,7 @@ pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
/// items.
pub static ISO_8859_8_INIT: Encoding = Encoding {
name: "ISO-8859-8",
variant: VariantEncoding::SingleByte(data::ISO_8859_8_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
/// The ISO-8859-8 encoding.
@ -1406,7 +1399,7 @@ pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
/// items.
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
name: "ISO-8859-8-I",
variant: VariantEncoding::SingleByte(data::ISO_8859_8_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
};
/// The ISO-8859-8-I encoding.
@ -1441,7 +1434,7 @@ pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
/// items.
pub static KOI8_R_INIT: Encoding = Encoding {
name: "KOI8-R",
variant: VariantEncoding::SingleByte(data::KOI8_R_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
};
/// The KOI8-R encoding.
@ -1473,7 +1466,7 @@ pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
/// items.
pub static KOI8_U_INIT: Encoding = Encoding {
name: "KOI8-U",
variant: VariantEncoding::SingleByte(data::KOI8_U_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
};
/// The KOI8-U encoding.
@ -1673,7 +1666,7 @@ pub static GB18030: &'static Encoding = &GB18030_INIT;
/// items.
pub static MACINTOSH_INIT: Encoding = Encoding {
name: "macintosh",
variant: VariantEncoding::SingleByte(data::MACINTOSH_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
};
/// The macintosh encoding.
@ -1742,7 +1735,7 @@ pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
/// items.
pub static WINDOWS_1250_INIT: Encoding = Encoding {
name: "windows-1250",
variant: VariantEncoding::SingleByte(data::WINDOWS_1250_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
};
/// The windows-1250 encoding.
@ -1774,7 +1767,7 @@ pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
/// items.
pub static WINDOWS_1251_INIT: Encoding = Encoding {
name: "windows-1251",
variant: VariantEncoding::SingleByte(data::WINDOWS_1251_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
};
/// The windows-1251 encoding.
@ -1806,7 +1799,7 @@ pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
/// items.
pub static WINDOWS_1252_INIT: Encoding = Encoding {
name: "windows-1252",
variant: VariantEncoding::SingleByte(data::WINDOWS_1252_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
};
/// The windows-1252 encoding.
@ -1839,7 +1832,7 @@ pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
/// items.
pub static WINDOWS_1253_INIT: Encoding = Encoding {
name: "windows-1253",
variant: VariantEncoding::SingleByte(data::WINDOWS_1253_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
};
/// The windows-1253 encoding.
@ -1873,7 +1866,7 @@ pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
name: "windows-1254",
variant: VariantEncoding::SingleByte(data::WINDOWS_1254_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
/// The windows-1254 encoding.
@ -1906,7 +1899,7 @@ pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
name: "windows-1255",
variant: VariantEncoding::SingleByte(data::WINDOWS_1255_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
/// The windows-1255 encoding.
@ -1940,7 +1933,7 @@ pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
name: "windows-1256",
variant: VariantEncoding::SingleByte(data::WINDOWS_1256_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
/// The windows-1256 encoding.
@ -1972,7 +1965,7 @@ pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
name: "windows-1257",
variant: VariantEncoding::SingleByte(data::WINDOWS_1257_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
/// The windows-1257 encoding.
@ -2005,7 +1998,7 @@ pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
name: "windows-1258",
variant: VariantEncoding::SingleByte(data::WINDOWS_1258_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
/// The windows-1258 encoding.
@ -2042,7 +2035,7 @@ pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
name: "windows-874",
variant: VariantEncoding::SingleByte(data::WINDOWS_874_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
/// The windows-874 encoding.
@ -2075,7 +2068,7 @@ pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
name: "x-mac-cyrillic",
variant: VariantEncoding::SingleByte(data::X_MAC_CYRILLIC_DATA),
variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
/// The x-mac-cyrillic encoding.
@ -2848,6 +2841,20 @@ impl Encoding {
!(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
}
/// Reports whether this is a single-byte encoding: one in which every byte
/// corresponds to exactly one Basic Multilingual Plane code point (so the
/// byte length of the input equals the length of the decoded UTF-16) and,
/// for mappable characters, every such code point corresponds to exactly
/// one byte.
///
/// The result is `true` iff the encoding appears on the spec's list of
/// [Legacy single-byte
/// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
/// or is x-user-defined.
///
/// Available via the C wrapper.
#[inline]
pub fn is_single_byte(&'static self) -> bool {
    // The answer depends only on which variant backs this encoding, so the
    // question is forwarded to the variant.
    let variant = &self.variant;
    variant.is_single_byte()
}
/// Checks whether the bytes 0x00...0x7F map mostly to the characters
/// U+0000...U+007F and vice versa.
#[inline]
@ -3002,7 +3009,7 @@ impl Encoding {
ascii_valid_up_to(bytes)
};
if valid_up_to == bytes.len() {
let str: &str = unsafe { std::mem::transmute(bytes) };
let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
return (Cow::Borrowed(str), false);
}
let decoder = self.new_decoder_without_bom_handling();
@ -3094,7 +3101,7 @@ impl Encoding {
if self == UTF_8 {
let valid_up_to = utf8_valid_up_to(bytes);
if valid_up_to == bytes.len() {
let str: &str = unsafe { std::mem::transmute(bytes) };
let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
return Some(Cow::Borrowed(str));
}
return None;
@ -3106,7 +3113,7 @@ impl Encoding {
ascii_valid_up_to(bytes)
};
if valid_up_to == bytes.len() {
let str: &str = unsafe { std::mem::transmute(bytes) };
let str: &str = unsafe { std::str::from_utf8_unchecked(bytes) };
return Some(Cow::Borrowed(str));
}
let decoder = self.new_decoder_without_bom_handling();
@ -3114,7 +3121,8 @@ impl Encoding {
checked_add(
valid_up_to,
decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
).unwrap(),
)
.unwrap(),
);
unsafe {
let vec = string.as_mut_vec();
@ -3201,8 +3209,9 @@ impl Encoding {
(checked_add(
valid_up_to,
encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
)).unwrap()
.next_power_of_two(),
))
.unwrap()
.next_power_of_two(),
);
unsafe {
vec.set_len(valid_up_to);
@ -3394,7 +3403,7 @@ impl<'de> Deserialize<'de> for &'static Encoding {
}
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
#[derive(PartialEq, Debug)]
#[derive(PartialEq, Debug, Copy, Clone)]
enum DecoderLifeCycle {
/// The decoder has seen no input yet.
AtStart,
@ -3423,6 +3432,7 @@ enum DecoderLifeCycle {
}
/// Communicate the BOM handling mode.
#[derive(Debug, Copy, Clone)]
enum BomHandling {
/// Don't handle the BOM
Off,
@ -3887,7 +3897,7 @@ impl Decoder {
dst: &mut str,
last: bool,
) -> (CoderResult, usize, usize, bool) {
let bytes: &mut [u8] = unsafe { std::mem::transmute(dst) };
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
let len = bytes.len();
let mut trail = written;
@ -3977,7 +3987,7 @@ impl Decoder {
dst: &mut str,
last: bool,
) -> (DecoderResult, usize, usize) {
let bytes: &mut [u8] = unsafe { std::mem::transmute(dst) };
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
let len = bytes.len();
let mut trail = written;
@ -4217,7 +4227,7 @@ pub enum EncoderResult {
impl EncoderResult {
fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
EncoderResult::Unmappable(::std::char::from_u32(bmp as u32).unwrap())
EncoderResult::Unmappable(::std::char::from_u32(u32::from(bmp)).unwrap())
}
}
@ -4688,13 +4698,13 @@ fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
// len is the number of decimal digits needed to represent unmappable plus
// 3 (the length of "&#" and ";").
let mut number = unmappable as u32;
let len = if number >= 1000000u32 {
let len = if number >= 1_000_000u32 {
10usize
} else if number >= 100000u32 {
} else if number >= 100_000u32 {
9usize
} else if number >= 10000u32 {
} else if number >= 10_000u32 {
8usize
} else if number >= 1000u32 {
} else if number >= 1_000u32 {
7usize
} else if number >= 100u32 {
6usize
@ -5635,4 +5645,48 @@ mod tests {
assert_eq!(debincoded, demo);
}
#[test]
fn test_is_single_byte() {
    // Multi-byte, UTF-* and special-purpose encodings: each must report
    // `false`.
    let expected_false: &[&'static Encoding] = &[
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for &encoding in expected_false.iter() {
        // The message names the encoding because a loop hides which entry
        // tripped the assertion.
        assert!(
            !encoding.is_single_byte(),
            "{} must not be single-byte",
            encoding.name
        );
    }

    // Legacy single-byte encodings plus x-user-defined: each must report
    // `true`.
    let expected_true: &[&'static Encoding] = &[
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for &encoding in expected_true.iter() {
        assert!(
            encoding.is_single_byte(),
            "{} must be single-byte",
            encoding.name
        );
    }
}
}

View File

@ -361,6 +361,7 @@ macro_rules! gb18030_decoder_function {
$name:ident,
$code_unit:ty,
$dest_struct:ident) => (
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
pub fn $name(&mut $slf,
src: &[u8],
dst: &mut [$code_unit],
@ -685,6 +686,7 @@ macro_rules! euc_jp_decoder_function {
$name:ident,
$code_unit:ty,
$dest_struct:ident) => (
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
pub fn $name(&mut $slf,
src: &[u8],
dst: &mut [$code_unit],

View File

@ -21,6 +21,8 @@
//! in-memory encoding is sometimes used as a storage optimization of text
//! when UTF-16 indexing and length semantics are exposed.
use std::borrow::Cow;
use super::in_inclusive_range16;
use super::in_inclusive_range32;
use super::in_inclusive_range8;
@ -65,11 +67,12 @@ pub enum Latin1Bidi {
// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00FF00_FF00FF00u64 as usize;
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
#[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
#[inline(always)]
fn $name(buffer: &[$unit]) -> bool {
let mut offset = 0usize;
@ -84,7 +87,8 @@ macro_rules! by_unit_check_alu {
}
let src = buffer.as_ptr();
let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
& ALU_ALIGNMENT_MASK) / unit_size;
& ALU_ALIGNMENT_MASK)
/ unit_size;
if until_alignment + ALU_ALIGNMENT / unit_size <= len {
if until_alignment != 0 {
accu |= buffer[offset] as usize;
@ -103,18 +107,18 @@ macro_rules! by_unit_check_alu {
if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
loop {
let unroll_accu = unsafe {
*(src.offset(offset as isize) as *const usize)
} | unsafe {
*(src.offset((offset + (ALU_ALIGNMENT / unit_size)) as isize)
as *const usize)
} | unsafe {
*(src.offset((offset + (2 * (ALU_ALIGNMENT / unit_size))) as isize)
as *const usize)
} | unsafe {
*(src.offset((offset + (3 * (ALU_ALIGNMENT / unit_size))) as isize)
as *const usize)
};
let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
| unsafe {
*(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
}
| unsafe {
*(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
as *const usize)
}
| unsafe {
*(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
as *const usize)
};
if unroll_accu & $mask != 0 {
return false;
}
@ -125,7 +129,7 @@ macro_rules! by_unit_check_alu {
}
}
while offset <= len_minus_stride {
accu |= unsafe { *(src.offset(offset as isize) as *const usize) };
accu |= unsafe { *(src.add(offset) as *const usize) };
offset += ALU_ALIGNMENT / unit_size;
}
}
@ -154,8 +158,10 @@ macro_rules! by_unit_check_simd {
return false;
}
let src = buffer.as_ptr();
let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK))
& SIMD_ALIGNMENT_MASK) / unit_size;
let mut until_alignment = ((SIMD_ALIGNMENT
- ((src as usize) & SIMD_ALIGNMENT_MASK))
& SIMD_ALIGNMENT_MASK)
/ unit_size;
if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
if until_alignment != 0 {
accu |= buffer[offset] as usize;
@ -174,20 +180,19 @@ macro_rules! by_unit_check_simd {
if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
loop {
let unroll_accu = unsafe {
*(src.offset(offset as isize) as *const $simd_ty)
} | unsafe {
*(src.offset((offset + (SIMD_STRIDE_SIZE / unit_size)) as isize)
as *const $simd_ty)
} | unsafe {
*(src.offset(
(offset + (2 * (SIMD_STRIDE_SIZE / unit_size))) as isize,
) as *const $simd_ty)
} | unsafe {
*(src.offset(
(offset + (3 * (SIMD_STRIDE_SIZE / unit_size))) as isize,
) as *const $simd_ty)
};
let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
| unsafe {
*(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
as *const $simd_ty)
}
| unsafe {
*(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
as *const $simd_ty)
}
| unsafe {
*(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
as *const $simd_ty)
};
if !$func(unroll_accu) {
return false;
}
@ -199,8 +204,7 @@ macro_rules! by_unit_check_simd {
}
let mut simd_accu = $splat;
while offset <= len_minus_stride {
simd_accu = simd_accu
| unsafe { *(src.offset(offset as isize) as *const $simd_ty) };
simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
offset += SIMD_STRIDE_SIZE / unit_size;
}
if !$func(simd_accu) {
@ -241,7 +245,7 @@ cfg_if!{
let len = buffer.len();
let mut offset = 0usize;
'outer: loop {
let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.offset(offset as isize) } as usize) & SIMD_ALIGNMENT_MASK)) &
let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
SIMD_ALIGNMENT_MASK) / unit_size;
if until_alignment == 0 {
if offset + SIMD_STRIDE_SIZE / unit_size > len {
@ -266,7 +270,7 @@ cfg_if!{
let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
'inner: loop {
let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
if contains_surrogates(unsafe { *(src.offset(offset as isize) as *const u16x8) }) {
if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
if offset_plus_stride == len {
break 'outer;
}
@ -304,6 +308,7 @@ cfg_if!{
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
let len = buffer.len();
@ -368,7 +373,7 @@ cfg_if!{
}
let len_minus_stride = len - SIMD_STRIDE_SIZE;
loop {
if !simd_is_str_latin1(unsafe { *(src.offset(offset as isize) as *const u8x16) }) {
if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
// TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
while bytes[offset] & 0xC0 == 0x80 {
offset += 1;
@ -456,7 +461,7 @@ cfg_if!{
}
let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
loop {
if is_u16x8_bidi(unsafe { *(src.offset(offset as isize) as *const u16x8) }) {
if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
return true;
}
offset += SIMD_STRIDE_SIZE / 2;
@ -511,7 +516,7 @@ cfg_if!{
}
let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
loop {
let mut s = unsafe { *(src.offset(offset as isize) as *const u16x8) };
let mut s = unsafe { *(src.add(offset) as *const u16x8) };
if !simd_is_latin1(s) {
loop {
if is_u16x8_bidi(s) {
@ -526,7 +531,7 @@ cfg_if!{
}
return Latin1Bidi::LeftToRight;
}
s = unsafe { *(src.offset(offset as isize) as *const u16x8) };
s = unsafe { *(src.add(offset) as *const u16x8) };
}
}
offset += SIMD_STRIDE_SIZE / 2;
@ -558,6 +563,7 @@ cfg_if!{
}
}
} else {
#[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
#[inline(always)]
fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
let mut offset = 0usize;
@ -579,7 +585,7 @@ cfg_if!{
}
let len_minus_stride = len - ALU_ALIGNMENT / 2;
loop {
if unsafe { *(src.offset(offset as isize) as *const usize) } & LATIN1_MASK != 0 {
if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
if is_utf16_bidi_impl(&buffer[offset..]) {
return Latin1Bidi::Bidi;
}
@ -681,6 +687,10 @@ pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
/// no RTL characters.
#[cfg_attr(
feature = "cargo-clippy",
allow(collapsible_if, cyclomatic_complexity)
)]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
// As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
@ -721,33 +731,33 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
// U+1E800: F0 9E A0 80
// U+1EFFF: F0 9E BF BF
// U+1F000: F0 9F 80 80
let mut bytes = buffer;
let mut src = buffer;
'outer: loop {
if let Some((mut byte, mut read)) = validate_ascii(bytes) {
if let Some((mut byte, mut read)) = validate_ascii(src) {
// Check for the longest sequence to avoid checking twice for the
// multi-byte sequences.
if read + 4 <= bytes.len() {
if read + 4 <= src.len() {
'inner: loop {
// At this point, `byte` is not included in `read`.
match byte {
0...0x7F => {
// ASCII: go back to SIMD.
read += 1;
bytes = &bytes[read..];
src = &src[read..];
continue 'outer;
}
0xC2...0xD5 => {
// Two-byte
let second = bytes[read + 1];
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
let second = unsafe { *(src.get_unchecked(read + 1)) };
if !in_inclusive_range8(second, 0x80, 0xBF) {
return true;
}
read += 2;
}
0xD6 => {
// Two-byte
let second = bytes[read + 1];
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
let second = unsafe { *(src.get_unchecked(read + 1)) };
if !in_inclusive_range8(second, 0x80, 0xBF) {
return true;
}
// XXX consider folding the above and below checks
@ -759,11 +769,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
// two-byte starting with 0xD7 and above is bidi
0xE1 | 0xE3...0xEC | 0xEE => {
// Three-byte normal
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
}) | (third >> 6))
!= 2
{
return true;
}
@ -771,11 +782,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
}
0xE2 => {
// Three-byte normal, potentially bidi
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
}) | (third >> 6))
!= 2
{
return true;
}
@ -792,11 +804,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
}
0xEF => {
// Three-byte normal, potentially bidi
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
}) | (third >> 6))
!= 2
{
return true;
}
@ -825,12 +838,12 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
}
0xE0 => {
// Three-byte special lower bound, potentially bidi
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
}) | (third >> 6))
!= 2
{
return true;
}
@ -842,26 +855,30 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
}
0xED => {
// Three-byte special upper bound
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)] & unsafe {
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
}) | (third >> 6))
!= 2
{
return true;
}
read += 3;
}
0xF1...0xF3 => {
0xF1...0xF4 => {
// Four-byte normal
let second = bytes[read + 1];
let third = bytes[read + 2];
let fourth = bytes[read + 3];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
if (u16::from(
UTF8_DATA.table[usize::from(second)]
& unsafe {
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
},
) | u16::from(third >> 6)
| (u16::from(fourth & 0xC0) << 2))
!= 0x202
{
return true;
}
@ -869,53 +886,41 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
}
0xF0 => {
// Four-byte special lower bound, potentially bidi
let second = bytes[read + 1];
let third = bytes[read + 2];
let fourth = bytes[read + 3];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
if (u16::from(
UTF8_DATA.table[usize::from(second)]
& unsafe {
*(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
},
) | u16::from(third >> 6)
| (u16::from(fourth & 0xC0) << 2))
!= 0x202
{
return true;
}
if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
let third = bytes[read + 2];
let third = src[read + 2];
if third >= 0xA0 {
return true;
}
}
read += 4;
}
0xF4 => {
// Four-byte special upper bound
let second = bytes[read + 1];
let third = bytes[read + 2];
let fourth = bytes[read + 3];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
return true;
}
read += 4;
}
_ => {
// Invalid lead or bidi-only lead
return true;
}
}
if read + 4 > bytes.len() {
if read == bytes.len() {
if read + 4 > src.len() {
if read == src.len() {
return false;
}
byte = bytes[read];
byte = src[read];
break 'inner;
}
byte = bytes[read];
byte = src[read];
continue 'inner;
}
}
@ -927,33 +932,33 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
0...0x7F => {
// ASCII: go back to SIMD.
read += 1;
bytes = &bytes[read..];
src = &src[read..];
continue 'outer;
}
0xC2...0xD5 => {
// Two-byte
let new_read = read + 2;
if new_read > bytes.len() {
if new_read > src.len() {
return true;
}
let second = bytes[read + 1];
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
let second = unsafe { *(src.get_unchecked(read + 1)) };
if !in_inclusive_range8(second, 0x80, 0xBF) {
return true;
}
read = new_read;
// We need to deal with the case where we came here with 3 bytes
// left, so we need to take a look at the last one.
bytes = &bytes[read..];
src = &src[read..];
continue 'outer;
}
0xD6 => {
// Two-byte, potentially bidi
let new_read = read + 2;
if new_read > bytes.len() {
if new_read > src.len() {
return true;
}
let second = bytes[read + 1];
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
let second = unsafe { *(src.get_unchecked(read + 1)) };
if !in_inclusive_range8(second, 0x80, 0xBF) {
return true;
}
// XXX consider folding the above and below checks
@ -963,21 +968,22 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
read = new_read;
// We need to deal with the case where we came here with 3 bytes
// left, so we need to take a look at the last one.
bytes = &bytes[read..];
src = &src[read..];
continue 'outer;
}
// two-byte starting with 0xD7 and above is bidi
0xE1 | 0xE3...0xEC | 0xEE => {
// Three-byte normal
let new_read = read + 3;
if new_read > bytes.len() {
if new_read > src.len() {
return true;
}
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
return true;
}
@ -985,14 +991,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
0xE2 => {
// Three-byte normal, potentially bidi
let new_read = read + 3;
if new_read > bytes.len() {
if new_read > src.len() {
return true;
}
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
return true;
}
@ -1009,14 +1016,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
0xEF => {
// Three-byte normal, potentially bidi
let new_read = read + 3;
if new_read > bytes.len() {
if new_read > src.len() {
return true;
}
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
return true;
}
@ -1045,15 +1053,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
0xE0 => {
// Three-byte special lower bound, potentially bidi
let new_read = read + 3;
if new_read > bytes.len() {
if new_read > src.len() {
return true;
}
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
return true;
}
@ -1065,15 +1073,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
0xED => {
// Three-byte special upper bound
let new_read = read + 3;
if new_read > bytes.len() {
if new_read > src.len() {
return true;
}
let second = bytes[read + 1];
let third = bytes[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
return true;
}
@ -1102,6 +1110,7 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
// U+058F: D6 8F
@ -1299,7 +1308,7 @@ pub fn is_char_bidi(c: char) -> bool {
// https://www.unicode.org/roadmaps/smp/
// U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
// U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
let code_point = c as u32;
let code_point = u32::from(c);
if code_point < 0x0590 {
// Below Hebrew
return false;
@ -1457,8 +1466,9 @@ pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
/// Panics if the destination buffer is shorter than stated above.
#[inline]
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
// TODO: Can the + 1 be eliminated?
assert!(dst.len() >= src.len() + 1);
// TODO: Can the requirement for dst to be at least one unit longer
// be eliminated?
assert!(dst.len() > src.len());
let mut decoder = Utf8Decoder::new_inner();
let mut total_read = 0usize;
let mut total_written = 0usize;
@ -1528,13 +1538,13 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
if byte >= 0x80 {
// Two-byte
let second = bytes[read + 1];
let point = (((byte as u32) & 0x1Fu32) << 6) | (second as u32 & 0x3Fu32);
dst[written] = point as u16;
let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
dst[written] = point;
read += 2;
written += 1;
} else {
// ASCII: write and go back to SIMD.
dst[written] = byte as u16;
dst[written] = u16::from(byte);
read += 1;
written += 1;
// Intuitively, we should go back to the outer loop only
@ -1548,10 +1558,10 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
// Three-byte
let second = bytes[read + 1];
let third = bytes[read + 2];
let point = (((byte as u32) & 0xFu32) << 12)
| ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
let point = ((u16::from(byte) & 0xF) << 12)
| ((u16::from(second) & 0x3F) << 6)
| (u16::from(third) & 0x3F);
dst[written] = point;
read += 3;
written += 1;
} else {
@ -1559,10 +1569,10 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
let second = bytes[read + 1];
let third = bytes[read + 2];
let fourth = bytes[read + 3];
let point = (((byte as u32) & 0x7u32) << 18)
| ((second as u32 & 0x3Fu32) << 12)
| ((third as u32 & 0x3Fu32) << 6)
| (fourth as u32 & 0x3Fu32);
let point = ((u32::from(byte) & 0x7) << 18)
| ((u32::from(second) & 0x3F) << 12)
| ((u32::from(third) & 0x3F) << 6)
| (u32::from(fourth) & 0x3F);
dst[written] = (0xD7C0 + (point >> 10)) as u16;
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
read += 4;
@ -1627,7 +1637,7 @@ pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usi
/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
#[inline]
pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
assert!(dst.len() >= src.len() * 3 + 1);
assert!(dst.len() > src.len() * 3);
let (read, written) = convert_utf16_to_utf8_partial(src, dst);
debug_assert_eq!(read, src.len());
written
@ -1648,7 +1658,7 @@ pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
/// replaced with the REPLACEMENT CHARACTER.
#[inline]
pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
let len = bytes.len();
let mut trail = written;
@ -1678,7 +1688,7 @@ pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize
/// Panics if the destination buffer is shorter than stated above.
#[inline]
pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
assert!(dst.len() >= src.len() * 3 + 1);
assert!(dst.len() > src.len() * 3);
let (read, written) = convert_utf16_to_str_partial(src, dst);
debug_assert_eq!(read, src.len());
written
@ -1738,8 +1748,8 @@ pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usi
let min_left = ::std::cmp::min(src_left, dst_left);
if let Some((non_ascii, consumed)) = unsafe {
ascii_to_ascii(
src_ptr.offset(total_read as isize),
dst_ptr.offset(total_written as isize),
src_ptr.add(total_read),
dst_ptr.add(total_written),
min_left,
)
} {
@ -1751,10 +1761,9 @@ pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usi
total_read += 1; // consume `non_ascii`
let code_point = non_ascii as u32;
dst[total_written] = ((code_point >> 6) | 0xC0u32) as u8;
dst[total_written] = (non_ascii >> 6) | 0xC0;
total_written += 1;
dst[total_written] = ((code_point as u32 & 0x3Fu32) | 0x80u32) as u8;
dst[total_written] = (non_ascii & 0x3F) | 0x80;
total_written += 1;
continue;
}
@ -1801,7 +1810,7 @@ pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
/// If the output isn't large enough, not all input is consumed.
#[inline]
pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
let bytes: &mut [u8] = unsafe { ::std::mem::transmute(dst) };
let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
let len = bytes.len();
let mut trail = written;
@ -1880,8 +1889,8 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
let src_left = src_len - total_read;
if let Some((non_ascii, consumed)) = unsafe {
ascii_to_ascii(
src_ptr.offset(total_read as isize),
dst_ptr.offset(total_written as isize),
src_ptr.add(total_read),
dst_ptr.add(total_written),
src_left,
)
} {
@ -1895,8 +1904,7 @@ pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
let trail = src[total_read];
total_read += 1;
dst[total_written] =
(((non_ascii as u32 & 0x1Fu32) << 6) | (trail as u32 & 0x3Fu32)) as u8;
dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
total_written += 1;
continue;
}
@ -1939,6 +1947,65 @@ pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
}
}
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // SAFETY: the whole input was accepted by `ascii_valid_up_to`, and
        // ASCII-only bytes are valid UTF-8.
        let s: &str = unsafe { ::std::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The ASCII prefix is copied verbatim; the buffer reserves two output
    // bytes for every remaining input byte.
    let capacity = head.len() + tail.len() * 2;
    // Zero-initialize instead of `with_capacity` + `set_len`: `Vec::set_len`
    // requires the newly exposed elements to be initialized already, and the
    // buffer is handed out as `&mut [u8]` below.
    let mut vec = vec![0u8; capacity];
    vec[..up_to].copy_from_slice(head);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // SAFETY: `vec` now holds the ASCII prefix followed by exactly the bytes
    // produced by `convert_latin1_to_utf8`; this relies on that function
    // emitting well-formed UTF-8.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // ASCII-only input is already its own Latin1 encoding.
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The output is sized to the UTF-8 input length, which is enough
    // because the conversion never produces more bytes than it consumes.
    // Zero-initialize instead of `with_capacity` + `set_len`, so that no
    // uninitialized memory is ever exposed through a `&mut [u8]`, even on
    // the "garbage in" path. This is still a single heap allocation.
    let mut vec = vec![0u8; bytes.len()];
    vec[..up_to].copy_from_slice(head);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    // Drop the part of the buffer that was not needed.
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
#[inline]
@ -2321,6 +2388,7 @@ mod tests {
assert_eq!(dst, reference);
}
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
@ -3035,11 +3103,11 @@ mod tests {
#[test]
fn test_is_char_bidi_thoroughly() {
for i in 0..0xD800u32 {
let c: char = unsafe { ::std::mem::transmute(i) };
let c: char = ::std::char::from_u32(i).unwrap();
assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
}
for i in 0xE000..0x110000u32 {
let c: char = unsafe { ::std::mem::transmute(i) };
let c: char = ::std::char::from_u32(i).unwrap();
assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
}
}
@ -3059,14 +3127,14 @@ mod tests {
fn test_is_str_bidi_thoroughly() {
let mut buf = [0; 4];
for i in 0..0xD800u32 {
let c: char = unsafe { ::std::mem::transmute(i) };
let c: char = ::std::char::from_u32(i).unwrap();
assert_eq!(
is_str_bidi(c.encode_utf8(&mut buf[..])),
reference_is_char_bidi(c)
);
}
for i in 0xE000..0x110000u32 {
let c: char = unsafe { ::std::mem::transmute(i) };
let c: char = ::std::char::from_u32(i).unwrap();
assert_eq!(
is_str_bidi(c.encode_utf8(&mut buf[..])),
reference_is_char_bidi(c)
@ -3078,7 +3146,7 @@ mod tests {
fn test_is_utf8_bidi_thoroughly() {
let mut buf = [0; 8];
for i in 0..0xD800u32 {
let c: char = unsafe { ::std::mem::transmute(i) };
let c: char = ::std::char::from_u32(i).unwrap();
let expect = reference_is_char_bidi(c);
{
let len = {
@ -3096,7 +3164,7 @@ mod tests {
assert_eq!(is_utf8_bidi(&buf[..]), expect);
}
for i in 0xE000..0x110000u32 {
let c: char = unsafe { ::std::mem::transmute(i) };
let c: char = ::std::char::from_u32(i).unwrap();
let expect = reference_is_char_bidi(c);
{
let len = {
@ -3137,4 +3205,31 @@ mod tests {
assert!(is_utf8_bidi(b"\xD6\x80\xC2"));
assert!(is_utf8_bidi(b"ab\xC2"));
}
/// Checks that ASCII-only input is borrowed and that a non-ASCII Latin1
/// byte is decoded to the corresponding code point.
#[test]
fn test_decode_latin1() {
    // ASCII-only input must come back as a borrow of the input.
    if let Cow::Borrowed(text) = decode_latin1(b"ab") {
        assert_eq!(text, "ab");
    } else {
        unreachable!("Should have borrowed");
    }
    // 0xE4 is U+00E4 in Latin1.
    assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
}
/// Checks that ASCII-only input is borrowed and that U+00E4 is encoded as
/// the single byte 0xE4.
#[test]
fn test_encode_latin1_lossy() {
    // ASCII-only input must come back as a borrow of the input.
    if let Cow::Borrowed(encoded) = encode_latin1_lossy("ab") {
        assert_eq!(encoded, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }
    // A code point in the U+0080..=U+00FF range becomes one byte.
    assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
}
}

View File

@ -41,7 +41,7 @@ impl ReplacementDecoder {
// https://github.com/whatwg/encoding/issues/33
if self.emitted || src.is_empty() {
(DecoderResult::InputEmpty, src.len(), 0)
} else if dst.len() < 1 {
} else if dst.is_empty() {
// Make sure there's room for the replacement character.
(DecoderResult::OutputFull, 0, 0)
} else {

View File

@ -68,7 +68,7 @@ impl ShiftJisDecoder {
source.consumed(),
handle.written());
}
handle.write_upper_bmp(0xFF61 + non_ascii_minus_half_with_katakana_start as u16);
handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start));
// Not caring about optimizing subsequent non-ASCII
continue 'outermost;
}
@ -89,7 +89,7 @@ impl ShiftJisDecoder {
let trail_minus_hiragana = byte.wrapping_sub(0x9F);
if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
// Hiragana
handle.write_upper_bmp(0x3041 + trail_minus_hiragana as u16)
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana))
} else {
let mut trail_minus_offset =
byte.wrapping_sub(0x40);
@ -111,7 +111,7 @@ impl ShiftJisDecoder {
if lead_minus_offset == 0x02 &&
trail_minus_offset < 0x56 {
// Katakana
handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16)
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
} else {
let pointer = lead_minus_offset as usize *
188usize +
@ -167,6 +167,35 @@ impl ShiftJisDecoder {
false);
}
/// Maps a BMP code unit to a Shift_JIS `(lead, trail)` byte pair, or
/// returns `None` if there is no mapping.
///
/// With the `fast-kanji-encode` feature, the whole lookup is delegated to
/// `jis0208_kanji_shift_jis_encode`.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_shift_jis_encode(bmp)
}

/// Maps a BMP code unit to a Shift_JIS `(lead, trail)` byte pair, or
/// returns `None` if there is no mapping.
///
/// Without the `fast-kanji-encode` feature, the level 1 kanji helper is
/// tried first. Otherwise an index pointer is computed from the remaining
/// tables and converted to bytes.
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    // The level 1 helper already yields finished Shift_JIS bytes.
    if let Some(pair) = jis0208_level1_kanji_shift_jis_encode(bmp) {
        return Some(pair);
    }
    let pointer = if bmp == 0x4EDD {
        // Ideograph on the symbol row!
        23
    } else {
        match jis0208_level2_and_additional_kanji_encode(bmp) {
            Some(pos) => 4418 + pos,
            None => match position(&IBM_KANJI[..], bmp) {
                Some(pos) => 10744 + pos,
                None => return None,
            },
        }
    };
    // Each lead byte covers 188 trail positions.
    let lead = pointer / 188;
    let trail = pointer % 188;
    // The byte offsets jump at the thresholds below.
    let lead_byte = lead + if lead < 0x1F { 0x81usize } else { 0xC1usize };
    let trail_byte = trail + if trail < 0x3F { 0x40usize } else { 0x41usize };
    Some((lead_byte as u8, trail_byte as u8))
}
pub struct ShiftJisEncoder;
impl ShiftJisEncoder {
@ -195,28 +224,14 @@ impl ShiftJisEncoder {
if bmp_minus_hiragana < 0x53 {
handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
if let Some((lead, trail)) = encode_kanji(bmp) {
handle.write_two(lead, trail)
} else {
let pointer = if 0x4EDD == bmp {
// Ideograph on the symbol row!
23
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
4418 + pos
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
10744 + pos
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
};
let lead = pointer / 188;
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
let trail = pointer % 188;
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
} else {
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);

View File

@ -286,7 +286,7 @@ pub fn is_u16x8_bidi(s: u16x8) -> bool {
| s.eq(u16x8::splat(0x202B))
| s.eq(u16x8::splat(0x202E))
| s.eq(u16x8::splat(0x2067)))
.any()
.any()
}
#[inline(always)]
@ -360,7 +360,7 @@ mod tests {
let ptr = vec.as_mut_ptr();
unsafe {
store8_unaligned(ptr, first);
store8_unaligned(ptr.offset(8), second);
store8_unaligned(ptr.add(8), second);
}
assert_eq!(&vec[..], &basic_latin[..]);
}
@ -376,7 +376,7 @@ mod tests {
0x75, 0x76,
];
let first = unsafe { load8_unaligned(basic_latin.as_ptr()) };
let second = unsafe { load8_unaligned(basic_latin.as_ptr().offset(8)) };
let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) };
let mut vec = Vec::with_capacity(16);
vec.resize(16, 0u8);
let ptr = vec.as_mut_ptr();
@ -394,7 +394,7 @@ mod tests {
0x75, 0x76,
];
let first = unsafe { load8_unaligned(input.as_ptr()) };
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
assert!(!simd_is_basic_latin(first | second));
}
@ -405,7 +405,7 @@ mod tests {
0x75, 0x76,
];
let first = unsafe { load8_unaligned(input.as_ptr()) };
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
assert!(!simd_is_basic_latin(first | second));
}
@ -416,7 +416,7 @@ mod tests {
0x75, 0x76,
];
let first = unsafe { load8_unaligned(input.as_ptr()) };
let second = unsafe { load8_unaligned(input.as_ptr().offset(8)) };
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
assert!(!simd_is_basic_latin(first | second));
}

View File

@ -9,6 +9,7 @@
use super::*;
use ascii::*;
use data::position;
use handles::*;
use variant::*;
@ -154,8 +155,8 @@ impl SingleByteDecoder {
'outermost: loop {
match unsafe {
ascii_to_basic_latin(
src.as_ptr().offset(converted as isize),
dst.as_mut_ptr().offset(converted as isize),
src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted),
length - converted,
)
} {
@ -207,7 +208,7 @@ impl SingleByteDecoder {
// byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride.
unsafe {
*(dst.get_unchecked_mut(converted)) = b as u16;
*(dst.get_unchecked_mut(converted)) = u16::from(b);
}
converted += 1;
if b < 60 {
@ -230,13 +231,27 @@ impl SingleByteDecoder {
/// Encoder for a single-byte encoding, driven by a 128-entry table plus a
/// description of one run of consecutive code units that `encode_u16`
/// maps by offset before it falls back to searching the table.
pub struct SingleByteEncoder {
// Code unit for each non-ASCII byte; index `i` corresponds to byte `i + 128`.
table: &'static [u16; 128],
// First code unit of the run of consecutive code units.
run_bmp_offset: usize,
// Index into `table` (i.e. byte value minus 128) at which the run starts.
run_byte_offset: usize,
// Number of consecutive code units in the run.
run_length: usize,
}
impl SingleByteEncoder {
pub fn new(encoding: &'static Encoding, data: &'static [u16; 128]) -> Encoder {
pub fn new(
encoding: &'static Encoding,
data: &'static [u16; 128],
run_bmp_offset: u16,
run_byte_offset: u8,
run_length: u8,
) -> Encoder {
Encoder::new(
encoding,
VariantEncoder::SingleByte(SingleByteEncoder { table: data }),
VariantEncoder::SingleByte(SingleByteEncoder {
table: data,
run_bmp_offset: run_bmp_offset as usize,
run_byte_offset: run_byte_offset as usize,
run_length: run_length as usize,
}),
)
}
@ -254,54 +269,64 @@ impl SingleByteEncoder {
Some(byte_length)
}
#[inline(always)]
fn encode_u16(&self, code_unit: u16) -> Option<u8> {
// We search the quadrants in reverse order, but we search forward
// within each quadrant. For Windows and ISO encodings, this is
// generally faster than just searching the whole table backwards.
// (Exceptions: English, German, Czech.) This order is also OK for
// KOI encodings. For IBM and Mac encodings, this order is bad,
// but we don't really need to optimize for those encodings anyway.
// First, we see if the code unit falls into a run of consecutive
// code units that can be mapped by offset. This is very efficient
// for most non-Latin encodings as well as Latin1-ish encodings.
//
// For encodings that don't fit this pattern, the run (which may
// have the length of just one) just establishes the starting point
// for the next rule.
//
// Next, we do a forward linear search in the part of the index
// after the run. Even in non-Latin1-ish Latin encodings (except
// macintosh), the lower case letters are here.
//
// Next, we search the third quadrant up to the start of the run
// (upper case letters in Latin encodings except macintosh, in
// Greek and in KOI encodings) and then the second quadrant,
// except if the run started before the third quadrant, we search
// the second quadrant up to the run.
//
// Last, we search the first quadrant, which has unused controls
// or punctuation in most encodings. This is bad for macintosh
// and IBM866, but those are rare.
// In Windows and ISO encodings, the fourth quadrant holds most of the
// lower-case letters for bicameral scripts as well as the Hebrew
// letters. There are some Thai letters and combining marks as well as
// Thai numerals here. (In KOI8-R, the upper-case letters are here.)
for i in 96..128 {
if self.table[i] == code_unit {
return Some((i + 128) as u8);
}
// Run of consecutive units
let unit_as_usize = code_unit as usize;
let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
if offset < self.run_length {
return Some((128 + self.run_byte_offset + offset) as u8);
}
// In Windows and ISO encodings, the third quadrant holds most of the
// upper-case letters for bicameral scripts as well as most of the
// Arabic letters. Searching this quadrant first would be better for
// Arabic. There are a number of Thai letters and combining marks here.
// (In KOI8-R, the lower-case letters are here.)
for i in 64..96 {
if self.table[i] == code_unit {
return Some((i + 128) as u8);
}
// Search after the run
let tail_start = self.run_byte_offset + self.run_length;
if let Some(pos) = position(&self.table[tail_start..], code_unit) {
return Some((128 + tail_start + pos) as u8);
}
// In Windows and ISO encodings, the second quadrant hold most of the
// Thai letters. In other scripts, there tends to be symbols here.
// Even though the two quadrants above are relevant for Thai, for Thai
// it would likely be optimal to search this quadrant first. :-(
for i in 32..64 {
if self.table[i] == code_unit {
return Some((i + 128) as u8);
if self.run_byte_offset >= 64 {
// Search third quadrant before the run
if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
return Some(((128 + 64) + pos) as u8);
}
// Search second quadrant
if let Some(pos) = position(&self.table[32..64], code_unit) {
return Some(((128 + 32) + pos) as u8);
}
} else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
// windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
// Search second quadrant before the run
return Some(((128 + 32) + pos) as u8);
}
// The first quadrant is useless in ISO encodings. In Windows encodings,
// there is useful punctuation here that might warrant searching
// before the symbols in the second quadrant, but the second quadrant
// is searched before this one for the benefit of Thai.
for i in 0..32 {
if self.table[i] == code_unit {
return Some((i + 128) as u8);
}
// Search first quadrant
if let Some(pos) = position(&self.table[..32], code_unit) {
return Some((128 + pos) as u8);
}
None
}
@ -345,8 +370,8 @@ impl SingleByteEncoder {
'outermost: loop {
match unsafe {
basic_latin_to_ascii(
src.as_ptr().offset(converted as isize),
dst.as_mut_ptr().offset(converted as isize),
src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted),
length - converted,
)
} {
@ -379,7 +404,7 @@ impl SingleByteEncoder {
);
}
let second =
unsafe { *src.get_unchecked(converted + 1) } as u32;
u32::from(unsafe { *src.get_unchecked(converted + 1) });
if second & 0xFC00u32 != 0xDC00u32 {
return (
EncoderResult::Unmappable('\u{FFFD}'),
@ -389,9 +414,9 @@ impl SingleByteEncoder {
}
// The next code unit is a low surrogate.
let astral: char = unsafe {
::std::mem::transmute(
((non_ascii as u32) << 10) + second
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32),
::std::char::from_u32_unchecked(
(u32::from(non_ascii) << 10) + second
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
)
};
return (
@ -408,10 +433,8 @@ impl SingleByteEncoder {
converted,
);
}
let thirty_two = non_ascii as u32;
let bmp: char = unsafe { ::std::mem::transmute(thirty_two) };
return (
EncoderResult::Unmappable(bmp),
EncoderResult::unmappable_from_bmp(non_ascii),
converted + 1, // +1 `for non_ascii`
converted,
);
@ -464,7 +487,6 @@ impl SingleByteEncoder {
#[cfg(test)]
mod tests {
use super::super::data::*;
use super::super::testing::*;
use super::super::*;
@ -603,64 +625,64 @@ mod tests {
#[test]
fn test_single_byte_decode() {
decode_single_byte(IBM866, IBM866_DATA);
decode_single_byte(ISO_8859_10, ISO_8859_10_DATA);
decode_single_byte(ISO_8859_13, ISO_8859_13_DATA);
decode_single_byte(ISO_8859_14, ISO_8859_14_DATA);
decode_single_byte(ISO_8859_15, ISO_8859_15_DATA);
decode_single_byte(ISO_8859_16, ISO_8859_16_DATA);
decode_single_byte(ISO_8859_2, ISO_8859_2_DATA);
decode_single_byte(ISO_8859_3, ISO_8859_3_DATA);
decode_single_byte(ISO_8859_4, ISO_8859_4_DATA);
decode_single_byte(ISO_8859_5, ISO_8859_5_DATA);
decode_single_byte(ISO_8859_6, ISO_8859_6_DATA);
decode_single_byte(ISO_8859_7, ISO_8859_7_DATA);
decode_single_byte(ISO_8859_8, ISO_8859_8_DATA);
decode_single_byte(KOI8_R, KOI8_R_DATA);
decode_single_byte(KOI8_U, KOI8_U_DATA);
decode_single_byte(MACINTOSH, MACINTOSH_DATA);
decode_single_byte(WINDOWS_1250, WINDOWS_1250_DATA);
decode_single_byte(WINDOWS_1251, WINDOWS_1251_DATA);
decode_single_byte(WINDOWS_1252, WINDOWS_1252_DATA);
decode_single_byte(WINDOWS_1253, WINDOWS_1253_DATA);
decode_single_byte(WINDOWS_1254, WINDOWS_1254_DATA);
decode_single_byte(WINDOWS_1255, WINDOWS_1255_DATA);
decode_single_byte(WINDOWS_1256, WINDOWS_1256_DATA);
decode_single_byte(WINDOWS_1257, WINDOWS_1257_DATA);
decode_single_byte(WINDOWS_1258, WINDOWS_1258_DATA);
decode_single_byte(WINDOWS_874, WINDOWS_874_DATA);
decode_single_byte(X_MAC_CYRILLIC, X_MAC_CYRILLIC_DATA);
decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
#[test]
fn test_single_byte_encode() {
encode_single_byte(IBM866, IBM866_DATA);
encode_single_byte(ISO_8859_10, ISO_8859_10_DATA);
encode_single_byte(ISO_8859_13, ISO_8859_13_DATA);
encode_single_byte(ISO_8859_14, ISO_8859_14_DATA);
encode_single_byte(ISO_8859_15, ISO_8859_15_DATA);
encode_single_byte(ISO_8859_16, ISO_8859_16_DATA);
encode_single_byte(ISO_8859_2, ISO_8859_2_DATA);
encode_single_byte(ISO_8859_3, ISO_8859_3_DATA);
encode_single_byte(ISO_8859_4, ISO_8859_4_DATA);
encode_single_byte(ISO_8859_5, ISO_8859_5_DATA);
encode_single_byte(ISO_8859_6, ISO_8859_6_DATA);
encode_single_byte(ISO_8859_7, ISO_8859_7_DATA);
encode_single_byte(ISO_8859_8, ISO_8859_8_DATA);
encode_single_byte(KOI8_R, KOI8_R_DATA);
encode_single_byte(KOI8_U, KOI8_U_DATA);
encode_single_byte(MACINTOSH, MACINTOSH_DATA);
encode_single_byte(WINDOWS_1250, WINDOWS_1250_DATA);
encode_single_byte(WINDOWS_1251, WINDOWS_1251_DATA);
encode_single_byte(WINDOWS_1252, WINDOWS_1252_DATA);
encode_single_byte(WINDOWS_1253, WINDOWS_1253_DATA);
encode_single_byte(WINDOWS_1254, WINDOWS_1254_DATA);
encode_single_byte(WINDOWS_1255, WINDOWS_1255_DATA);
encode_single_byte(WINDOWS_1256, WINDOWS_1256_DATA);
encode_single_byte(WINDOWS_1257, WINDOWS_1257_DATA);
encode_single_byte(WINDOWS_1258, WINDOWS_1258_DATA);
encode_single_byte(WINDOWS_874, WINDOWS_874_DATA);
encode_single_byte(X_MAC_CYRILLIC, X_MAC_CYRILLIC_DATA);
encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
// END GENERATED CODE

View File

@ -29,8 +29,7 @@ impl Utf16Decoder {
}
pub fn additional_from_state(&self) -> usize {
1
+ if self.lead_byte.is_some() { 1 } else { 0 }
1 + if self.lead_byte.is_some() { 1 } else { 0 }
+ if self.lead_surrogate == 0 { 0 } else { 2 }
}
@ -120,9 +119,9 @@ impl Utf16Decoder {
Some(lead) => {
self.lead_byte = None;
let code_unit = if self.be {
(lead as u16) << 8 | b as u16
u16::from(lead) << 8 | u16::from(b)
} else {
(b as u16) << 8 | (lead as u16)
u16::from(b) << 8 | u16::from(lead)
};
let high_bits = code_unit & 0xFC00u16;
if high_bits == 0xD800u16 {

View File

@ -7,12 +7,10 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#[cfg(feature = "parallel-utf8")]
extern crate rayon;
use super::*;
use ascii::ascii_to_basic_latin;
use ascii::basic_latin_to_ascii;
use ascii::validate_ascii;
use handles::*;
use variant::*;
@ -34,111 +32,211 @@ cfg_if!{
}
}
// Keep this cfg_if in sync with whether the utf_8_core module is defined in lib.rs.
cfg_if! {
// When running 32-bit ARM code on Raspberry Pi 3, which has a 64-bit CPU,
// this is a pessimization for non-Latin, non-CJK scripts. However, this
// optimization seems to work across scripts when running 32-bit ARM code
// on a 32-bit ARM CPU (particularly good on Exynos 5) and when running
// 64-bit ARM code on a 64-bit ARM CPU.
if #[cfg(any(all(feature = "simd-accel", target_feature = "sse2"), all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_arch = "arm")))] {
use utf_8_core::run_utf8_validation;
} else {
use ::std::str::Utf8Error;
#[inline(always)]
fn run_utf8_validation(v: &[u8]) -> Result<&str, Utf8Error> {
::std::str::from_utf8(v)
}
}
/// Lookup table used by UTF-8 validation, wrapped in a struct so that an
/// alignment can be requested for it.
///
/// The validation code indexes `table` directly with the second byte of a
/// sequence (indices 0..=255) and with `lead + 0x80` for a non-ASCII lead
/// byte (indices 256..=383), then ANDs the two entries together.
#[repr(align(64))] // Align to cache lines
pub struct Utf8Data {
pub table: [u8; 384],
}
pub const UTF8_NORMAL_TRAIL: u8 = 1 << 3;
pub const UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 4;
pub const UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 5;
pub const UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL: u8 = 1 << 6;
pub const UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL: u8 = 1 << 7;
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
/// Bit is 1 if the trail is invalid.
pub static UTF8_TRAIL_INVALID: [u8; 256] = [
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144,
144, 144, 144, 144, 144, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160,
160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, 248,
248, 248, 248, 248, 248, 248,
];
pub static UTF8_DATA: Utf8Data = Utf8Data {
table: [
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
],
};
// END GENERATED CODE
#[cfg(feature = "parallel-utf8")]
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
let mut len = bytes.len();
// The purpose of the outer loop is to avoid recursion when the attempt
// to find the split point discovers and over-long sequence.
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
// This algorithm differs from the UTF-8 validation algorithm, but making
// this one consistent with that one makes this slower for reasons I don't
// understand.
let mut read = 0;
'outer: loop {
// This magic number has been determined on i7-4770 with SSE2 enabled.
// It's very likely that the number should be different when different
// ISA is used for ASCII acceleration. The number has been chosen
// to optimize the all-ASCII case. With mostly non-ASCII, the number
// should be much smaller, but that would pessimize the all-ASCII case,
// which we are trying to optimize here.
if len < 290000 {
return match run_utf8_validation(&bytes[..len]) {
Ok(_) => bytes.len(),
Err(e) => e.valid_up_to(),
};
}
let mid = len >> 1;
let mut adjusted = mid;
let mut i = 0;
'inner: loop {
// No need to check for `adjusted` reaching `len` because we
// already know that `len` is way larger than `(len / 2) + 4`.
if i == 3 {
// `mid` landed inside an overlong sequence.
len = mid;
continue 'outer;
let mut byte = {
let src_remaining = &src[read..];
match validate_ascii(src_remaining) {
None => {
return src.len();
}
Some((non_ascii, consumed)) => {
read += consumed;
non_ascii
}
}
if (bytes[adjusted] & 0xC0) != 0x80 {
};
// Check for the longest sequence to avoid checking twice for the
// multi-byte sequences. This can't overflow with 64-bit address space,
// because full 64 bits aren't in use. In the 32-bit PAE case, for this
// to overflow would mean that the source slice would be so large that
// the address space of the process would not have space for any code.
// Therefore, the slice cannot be so long that this would overflow.
if unsafe { likely(read + 4 <= src.len()) } {
'inner: loop {
// At this point, `byte` is not included in `read`, because we
// don't yet know that a) the UTF-8 sequence is valid and b) that there
// is output space if it is an astral sequence.
// We know, thanks to `ascii_to_basic_latin` that there is output
// space for at least one UTF-16 code unit, so no need to check
// for output space in the BMP cases.
// Inspecting the lead byte directly is faster than what the
// std lib does!
if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
// Two-byte
let second = unsafe { *(src.get_unchecked(read + 1)) };
if !in_inclusive_range8(second, 0x80, 0xBF) {
break 'outer;
}
read += 2;
// Next lead (manually inlined)
if unsafe { likely(read + 4 <= src.len()) } {
byte = unsafe { *(src.get_unchecked(read)) };
if byte < 0x80 {
read += 1;
continue 'outer;
}
continue 'inner;
}
break 'inner;
}
if unsafe { likely(byte < 0xF0) } {
'three: loop {
// Three-byte
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
break 'outer;
}
read += 3;
// Next lead (manually inlined)
if unsafe { likely(read + 4 <= src.len()) } {
byte = unsafe { *(src.get_unchecked(read)) };
if in_inclusive_range8(byte, 0xE0, 0xEF) {
continue 'three;
}
if unsafe { likely(byte < 0x80) } {
read += 1;
continue 'outer;
}
continue 'inner;
}
break 'inner;
}
}
// Four-byte
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
if (u16::from(
UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
) | u16::from(third >> 6)
| (u16::from(fourth & 0xC0) << 2))
!= 0x202
{
break 'outer;
}
read += 4;
// Next lead
if unsafe { likely(read + 4 <= src.len()) } {
byte = unsafe { *(src.get_unchecked(read)) };
if byte < 0x80 {
read += 1;
continue 'outer;
}
continue 'inner;
}
break 'inner;
}
adjusted += 1;
i += 1;
}
let (head, tail) = bytes[..len].split_at(adjusted);
let (head_valid_up_to, tail_valid_up_to) =
rayon::join(|| utf8_valid_up_to(head), || utf8_valid_up_to(tail));
if head_valid_up_to == adjusted {
return adjusted + tail_valid_up_to;
// We can't have a complete 4-byte sequence, but we could still have
// one to three shorter sequences.
'tail: loop {
// >= is better for bound check elision than ==
if read >= src.len() {
break 'outer;
}
byte = src[read];
// At this point, `byte` is not included in `read`, because we
// don't yet know that a) the UTF-8 sequence is valid and b) that there
// is output space if it is an astral sequence.
// Inspecting the lead byte directly is faster than what the
// std lib does!
if byte < 0x80 {
read += 1;
continue 'tail;
}
if in_inclusive_range8(byte, 0xC2, 0xDF) {
// Two-byte
let new_read = read + 2;
if new_read > src.len() {
break 'outer;
}
let second = src[read + 1];
if !in_inclusive_range8(second, 0x80, 0xBF) {
break 'outer;
}
read += 2;
continue 'tail;
}
// We need to exclude valid four byte lead bytes, because
// `UTF8_DATA.second_mask` covers
if byte < 0xF0 {
// Three-byte
let new_read = read + 3;
if new_read > src.len() {
break 'outer;
}
let second = src[read + 1];
let third = src[read + 2];
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
break 'outer;
}
read += 3;
// `'tail` handles sequences shorter than 4, so
// there can't be another sequence after this one.
break 'outer;
}
break 'outer;
}
return head_valid_up_to;
}
read
}
#[cfg(not(feature = "parallel-utf8"))]
pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
match run_utf8_validation(bytes) {
Ok(_) => bytes.len(),
Err(e) => e.valid_up_to(),
}
}
#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
#[cfg_attr(
feature = "cargo-clippy",
allow(never_loop, cyclomatic_complexity)
)]
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
// This algorithm differs from the UTF-8 validation algorithm, but making
// this one consistent with that one makes this slower for reasons I don't
@ -166,8 +264,12 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz
}
};
// Check for the longest sequence to avoid checking twice for the
// multi-byte sequences.
if read + 4 <= src.len() {
// multi-byte sequences. This can't overflow with 64-bit address space,
// because full 64 bits aren't in use. In the 32-bit PAE case, for this
// to overflow would mean that the source slice would be so large that
// the address space of the process would not have space for any code.
// Therefore, the slice cannot be so long that this would overflow.
if unsafe { likely(read + 4 <= src.len()) } {
'inner: loop {
// At this point, `byte` is not included in `read`, because we
// don't yet know that a) the UTF-8 sequence is valid and b) that there
@ -175,278 +277,183 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz
// We know, thanks to `ascii_to_basic_latin` that there is output
// space for at least one UTF-16 code unit, so no need to check
// for output space in the BMP cases.
// Matching directly on the lead byte is faster than what the
// Inspecting the lead byte directly is faster than what the
// std lib does!
match byte {
0...0x7F => {
// ASCII: write and go back to SIMD.
dst[written] = byte as u16;
if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
// Two-byte
let second = unsafe { *(src.get_unchecked(read + 1)) };
if !in_inclusive_range8(second, 0x80, 0xBF) {
break 'outer;
}
unsafe {
*(dst.get_unchecked_mut(written)) =
((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
};
read += 2;
written += 1;
// Next lead (manually inlined)
if written == dst.len() {
break 'outer;
}
if unsafe { likely(read + 4 <= src.len()) } {
byte = unsafe { *(src.get_unchecked(read)) };
if byte < 0x80 {
unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
read += 1;
written += 1;
continue 'outer;
}
continue 'inner;
}
break 'inner;
}
if unsafe { likely(byte < 0xF0) } {
'three: loop {
// Three-byte
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
break 'outer;
}
let point = ((u16::from(byte) & 0xF) << 12)
| ((u16::from(second) & 0x3F) << 6)
| (u16::from(third) & 0x3F);
unsafe { *(dst.get_unchecked_mut(written)) = point };
read += 3;
written += 1;
// Next lead (manually inlined)
if written == dst.len() {
break 'outer;
}
if unsafe { likely(read + 4 <= src.len()) } {
byte = unsafe { *(src.get_unchecked(read)) };
if in_inclusive_range8(byte, 0xE0, 0xEF) {
continue 'three;
}
if unsafe { likely(byte < 0x80) } {
unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
read += 1;
written += 1;
continue 'outer;
}
continue 'inner;
}
break 'inner;
}
}
// Four-byte
if written + 1 == dst.len() {
break 'outer;
}
let second = unsafe { *(src.get_unchecked(read + 1)) };
let third = unsafe { *(src.get_unchecked(read + 2)) };
let fourth = unsafe { *(src.get_unchecked(read + 3)) };
if (u16::from(
UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
) | u16::from(third >> 6)
| (u16::from(fourth & 0xC0) << 2))
!= 0x202
{
break 'outer;
}
let point = ((u32::from(byte) & 0x7) << 18)
| ((u32::from(second) & 0x3F) << 12)
| ((u32::from(third) & 0x3F) << 6)
| (u32::from(fourth) & 0x3F);
unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
unsafe {
*(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
};
read += 4;
written += 2;
// Next lead
if written == dst.len() {
break 'outer;
}
if unsafe { likely(read + 4 <= src.len()) } {
byte = unsafe { *(src.get_unchecked(read)) };
if byte < 0x80 {
unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
read += 1;
written += 1;
continue 'outer;
}
0xC2...0xDF => {
// Two-byte
let second = src[read + 1];
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
break 'outer;
}
let point = (((byte as u32) & 0x1Fu32) << 6) | (second as u32 & 0x3Fu32);
dst[written] = point as u16;
read += 2;
written += 1;
}
0xE1...0xEC | 0xEE...0xEF => {
// Three-byte normal
let second = src[read + 1];
let third = src[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0xFu32) << 12)
| ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read += 3;
written += 1;
}
0xE0 => {
// Three-byte special lower bound
let second = src[read + 1];
let third = src[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0xFu32) << 12)
| ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read += 3;
written += 1;
}
0xED => {
// Three-byte special upper bound
let second = src[read + 1];
let third = src[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0xFu32) << 12)
| ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read += 3;
written += 1;
}
0xF1...0xF3 => {
// Four-byte normal
if written + 1 == dst.len() {
break 'outer;
}
let second = src[read + 1];
let third = src[read + 2];
let fourth = src[read + 3];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0x7u32) << 18)
| ((second as u32 & 0x3Fu32) << 12)
| ((third as u32 & 0x3Fu32) << 6)
| (fourth as u32 & 0x3Fu32);
dst[written] = (0xD7C0 + (point >> 10)) as u16;
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
read += 4;
written += 2;
}
0xF0 => {
// Four-byte special lower bound
if written + 1 == dst.len() {
break 'outer;
}
let second = src[read + 1];
let third = src[read + 2];
let fourth = src[read + 3];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_FOUR_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0x7u32) << 18)
| ((second as u32 & 0x3Fu32) << 12)
| ((third as u32 & 0x3Fu32) << 6)
| (fourth as u32 & 0x3Fu32);
dst[written] = (0xD7C0 + (point >> 10)) as u16;
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
read += 4;
written += 2;
}
0xF4 => {
// Four-byte special upper bound
if written + 1 == dst.len() {
break 'outer;
}
let second = src[read + 1];
let third = src[read + 2];
let fourth = src[read + 3];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_FOUR_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[fourth as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0x7u32) << 18)
| ((second as u32 & 0x3Fu32) << 12)
| ((third as u32 & 0x3Fu32) << 6)
| (fourth as u32 & 0x3Fu32);
dst[written] = (0xD7C0 + (point >> 10)) as u16;
dst[written + 1] = (0xDC00 + (point & 0x3FF)) as u16;
read += 4;
written += 2;
}
_ => {
// Invalid lead
break 'outer;
}
continue 'inner;
}
if written == dst.len() {
break 'outer;
}
if read + 4 > src.len() {
if read == src.len() {
break 'outer;
}
byte = src[read];
break 'inner;
}
byte = src[read];
continue 'inner;
break 'inner;
}
}
// We can't have a complete 4-byte sequence, but we could still have
// a complete shorter sequence.
// At this point, `byte` is not included in `read`, because we
// don't yet know that a) the UTF-8 sequence is valid and b) that there
// is output space if it is an astral sequence.
// We know, thanks to `ascii_to_basic_latin` that there is output
// space for at least one UTF-16 code unit, so no need to check
// for output space in the BMP cases.
// Matching directly on the lead byte is faster than what the
// std lib does!
match byte {
0...0x7F => {
// ASCII: write and go back to SIMD.
dst[written] = byte as u16;
// one to three shorter sequences.
'tail: loop {
// >= is better for bound check elision than ==
// Stop once the input is exhausted or the output buffer is full. `written`
// indexes `dst`, so the output limit has to be `dst.len()`; comparing against
// `src.len()` lets `dst[written]` below panic when `dst` is shorter than `src`.
if read >= src.len() || written >= dst.len() {
    break 'outer;
}
byte = src[read];
// At this point, `byte` is not included in `read`, because we
// don't yet know that a) the UTF-8 sequence is valid and b) that there
// is output space if it is an astral sequence.
// Inspecting the lead byte directly is faster than what the
// std lib does!
if byte < 0x80 {
dst[written] = u16::from(byte);
read += 1;
written += 1;
continue 'outer;
continue 'tail;
}
0xC2...0xDF => {
if in_inclusive_range8(byte, 0xC2, 0xDF) {
// Two-byte
let new_read = read + 2;
if new_read > src.len() {
break 'outer;
}
let second = src[read + 1];
if (UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL) != 0 {
if !in_inclusive_range8(second, 0x80, 0xBF) {
break 'outer;
}
let point = (((byte as u32) & 0x1Fu32) << 6) | (second as u32 & 0x3Fu32);
dst[written] = point as u16;
read = new_read;
dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
read += 2;
written += 1;
continue 'tail;
}
0xE1...0xEC | 0xEE...0xEF => {
// Three-byte normal
// We need to exclude valid four byte lead bytes, because
// `UTF8_DATA.second_mask` covers
if byte < 0xF0 {
// Three-byte
let new_read = read + 3;
if new_read > src.len() {
break 'outer;
}
let second = src[read + 1];
let third = src[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize] & UTF8_NORMAL_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
if ((UTF8_DATA.table[usize::from(second)]
& unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
| (third >> 6))
!= 2
{
break 'outer;
}
let point = (((byte as u32) & 0xFu32) << 12)
| ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read = new_read;
let point = ((u16::from(byte) & 0xF) << 12)
| ((u16::from(second) & 0x3F) << 6)
| (u16::from(third) & 0x3F);
dst[written] = point;
read += 3;
written += 1;
}
0xE0 => {
// Three-byte special lower bound
let new_read = read + 3;
if new_read > src.len() {
break 'outer;
}
let second = src[read + 1];
let third = src[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_LOWER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0xFu32) << 12)
| ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read = new_read;
written += 1;
}
0xED => {
// Three-byte special upper bound
let new_read = read + 3;
if new_read > src.len() {
break 'outer;
}
let second = src[read + 1];
let third = src[read + 2];
if ((UTF8_TRAIL_INVALID[second as usize]
& UTF8_THREE_BYTE_SPECIAL_UPPER_BOUND_TRAIL)
| (UTF8_TRAIL_INVALID[third as usize] & UTF8_NORMAL_TRAIL))
!= 0
{
break 'outer;
}
let point = (((byte as u32) & 0xFu32) << 12)
| ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read = new_read;
written += 1;
}
_ => {
// Invalid lead or 4-byte lead
// `'tail` handles sequences shorter than 4, so
// there can't be another sequence after this one.
break 'outer;
}
break 'outer;
}
break 'outer;
}
(read, written)
}
@ -534,7 +541,7 @@ impl Utf8Decoder {
}
if b < 0xE0u8 {
self.bytes_needed = 1;
self.code_point = b as u32 & 0x1F;
self.code_point = u32::from(b) & 0x1F;
continue;
}
if b < 0xF0u8 {
@ -544,7 +551,7 @@ impl Utf8Decoder {
self.upper_boundary = 0x9Fu8;
}
self.bytes_needed = 2;
self.code_point = b as u32 & 0xF;
self.code_point = u32::from(b) & 0xF;
continue;
}
if b < 0xF5u8 {
@ -554,7 +561,7 @@ impl Utf8Decoder {
self.upper_boundary = 0x8Fu8;
}
self.bytes_needed = 3;
self.code_point = b as u32 & 0x7;
self.code_point = u32::from(b) & 0x7;
continue;
}
return (
@ -579,7 +586,7 @@ impl Utf8Decoder {
}
self.lower_boundary = 0x80u8;
self.upper_boundary = 0xBFu8;
self.code_point = (self.code_point << 6) | (b as u32 & 0x3F);
self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
self.bytes_seen += 1;
if self.bytes_seen != self.bytes_needed {
continue;
@ -683,7 +690,8 @@ impl Utf8Encoder {
unsafe {
*(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
written += 1;
*(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
*(dst.get_unchecked_mut(written)) =
((unit & 0xFC0) >> 6) as u8 | 0x80u8;
written += 1;
*(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
written += 1;
@ -709,19 +717,22 @@ impl Utf8Encoder {
}
let second = src[read];
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) } {
if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) }
{
// The next code unit is a low surrogate. Advance position.
read += 1;
let astral = ((unit as u32) << 10) + second as u32
let astral = (u32::from(unit) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
unsafe {
*(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
written += 1;
*(dst.get_unchecked_mut(written)) = ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
*(dst.get_unchecked_mut(written)) =
((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
written += 1;
*(dst.get_unchecked_mut(written)) = ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
*(dst.get_unchecked_mut(written)) =
((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
written += 1;
*(dst.get_unchecked_mut(written)) = (astral & 0x3Fu32) as u8 | 0x80u8;
*(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
written += 1;
}
break;
@ -774,22 +785,18 @@ impl Utf8Encoder {
dst: &mut [u8],
_last: bool,
) -> (EncoderResult, usize, usize) {
let mut to_write = src.len();
let bytes = src.as_bytes();
let mut to_write = bytes.len();
if to_write <= dst.len() {
unsafe {
::std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr(), to_write);
}
(&mut dst[..to_write]).copy_from_slice(bytes);
return (EncoderResult::InputEmpty, to_write, to_write);
}
to_write = dst.len();
// Move back until we find a UTF-8 sequence boundary.
let bytes = src.as_bytes();
while (bytes[to_write] & 0xC0) == 0x80 {
to_write -= 1;
}
unsafe {
::std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr(), to_write);
}
(&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
(EncoderResult::OutputFull, to_write, to_write)
}
}

View File

@ -1,430 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// The initial revision of this file was extracted from the "UTF-8 validation"
// section of the file src/libcore/str/mod.rs from Rust project at revision
// 7ad7232422f7e5bbfa0e52dabe36c12677df19e2. The Utf8Error struct also comes
// from that file. Subsequently, changes from the mentioned file at revision
// 85eadf84f3945dc431643ea43d34f15193fdafb4 were merged into this file.
use ascii::validate_ascii;
/// Errors which can occur when attempting to interpret a sequence of `u8`
/// as a string.
///
/// As such, the `from_utf8` family of functions and methods for both `String`s
/// and `&str`s make use of this error, for example.
///
/// NOTE(review): the wording above is inherited from the standard library
/// source this file was extracted from; within this file the error is
/// constructed by `run_utf8_validation`.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
pub struct Utf8Error {
// Length of the prefix that was verified as valid UTF-8; exposed through
// `valid_up_to()`.
valid_up_to: usize,
}
impl Utf8Error {
/// Returns the index in the given string up to which valid UTF-8 was
/// verified.
///
/// It is the maximum index such that `from_utf8(&input[..index])`
/// would return `Ok(_)`.
///
/// # Examples
///
/// Basic usage (the example exercises the standard library's own
/// `Utf8Error`, whose `valid_up_to` has the same contract):
///
/// ```
/// use std::str;
///
/// // some invalid bytes, in a vector
/// let sparkle_heart = vec![0, 159, 146, 150];
///
/// // std::str::from_utf8 returns a Utf8Error
/// let error = str::from_utf8(&sparkle_heart).unwrap_err();
///
/// // the second byte is invalid here
/// assert_eq!(1, error.valid_up_to());
/// ```
pub fn valid_up_to(&self) -> usize {
// Plain accessor: the field is private to this module.
self.valid_up_to
}
}
/// Checks whether `v` is well-formed UTF-8.
///
/// `validate_ascii` is used to skip ahead to the next non-ASCII byte (it is
/// expected to return that byte together with the number of bytes preceding
/// it, or `None` when everything that remains is ASCII). Each non-ASCII
/// sequence is then checked byte by byte against the RFC 3629 grammar.
///
/// # Errors
///
/// Returns a `Utf8Error` whose `valid_up_to` is the index of the first byte
/// of the first malformed or truncated sequence, i.e. the length of the
/// longest prefix of `v` that is valid UTF-8.
#[cfg_attr(feature = "cargo-clippy", allow(eval_order_dependence))]
#[inline(always)]
pub fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
    let mut index = 0;
    let len = v.len();

    'outer: loop {
        let mut first = {
            let remaining = &v[index..];
            match validate_ascii(remaining) {
                None => {
                    // Nothing but ASCII was left: the whole input is valid.
                    break 'outer;
                }
                Some((non_ascii, consumed)) => {
                    index += consumed;
                    non_ascii
                }
            }
        };

        // Index of the first byte of the sequence currently being validated.
        // It has to be refreshed for every sequence handled by `'inner`;
        // otherwise an error in a later sequence of the same non-ASCII run
        // would be reported at the start of the run.
        let mut old_offset = index;

        macro_rules! err {
            () => {
                return Err(Utf8Error {
                    valid_up_to: old_offset,
                })
            };
        }

        macro_rules! next {
            () => {{
                index += 1;
                // we needed data, but there was none: error!
                if index >= len {
                    err!()
                }
                v[index]
            }};
        }

        'inner: loop {
            let w = UTF8_CHAR_WIDTH[usize::from(first)];
            // 2-byte encoding is for code points U+0080 to U+07FF
            //        first  C2 80        last DF BF
            // 3-byte encoding is for code points U+0800 to U+FFFF
            //        first  E0 A0 80     last EF BF BF
            //   excluding surrogate code points U+D800 to U+DFFF
            //               ED A0 80 to       ED BF BF
            // 4-byte encoding is for code points U+10000 to U+10FFFF
            //        first  F0 90 80 80  last F4 8F BF BF
            //
            // Use the UTF-8 syntax from the RFC
            //
            // https://tools.ietf.org/html/rfc3629
            // UTF8-1      = %x00-7F
            // UTF8-2      = %xC2-DF UTF8-tail
            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
            //               %xF4 %x80-8F 2( UTF8-tail )
            match w {
                2 => {
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!()
                    }
                }
                3 => {
                    // The second byte's allowed range depends on the lead
                    // (excludes overlong forms and surrogates).
                    match (first, next!()) {
                        (0xE0, 0xA0..=0xBF)
                        | (0xE1..=0xEC, 0x80..=0xBF)
                        | (0xED, 0x80..=0x9F)
                        | (0xEE..=0xEF, 0x80..=0xBF) => {}
                        _ => err!(),
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!()
                    }
                }
                4 => {
                    // The second byte's allowed range depends on the lead
                    // (excludes overlong forms and values above U+10FFFF).
                    match (first, next!()) {
                        (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
                        _ => err!(),
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!()
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!()
                    }
                }
                // Width 0 (not a valid lead) or 1 (cannot happen for a
                // non-ASCII byte): invalid.
                _ => err!(),
            }
            // `index` was on the last byte of the sequence; move to the next lead.
            index += 1;
            if index == len {
                break 'outer;
            }
            first = v[index];
            // This check is separate from the above `match`, because merging
            // this check into it causes a notable performance drop.
            if first < 0x80 {
                index += 1;
                continue 'outer;
            }
            // Another multi-byte sequence follows immediately: everything
            // before it has been validated.
            old_offset = index;
            continue 'inner;
        }
    }

    Ok(())
}
// Sequence length selected by each possible first byte of a UTF-8 sequence.
// A width of 0 marks bytes that cannot start a sequence: continuation bytes
// (0x80-0xBF), 0xC0-0xC1 and 0xF5-0xFF.
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10-0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20-0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30-0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40-0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50-0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60-0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70-0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80-0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90-0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0-0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0-0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0-0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0-0xFF
];
/// Selects the six payload bits of a continuation byte (the `xxxxxx` in
/// `10xxxxxx`).
const CONT_MASK: u8 = 0x3F;
/// What the two tag bits of a continuation byte (those picked out by
/// `!CONT_MASK`) must equal: `10` in the top two positions.
const TAG_CONT_U8: u8 = 0x80;

View File

@ -289,7 +289,7 @@ impl VariantEncoder {
}
pub enum VariantEncoding {
SingleByte(&'static [u16; 128]),
SingleByte(&'static [u16; 128], u16, u8, u8),
Utf8,
Gbk,
Gb18030,
@ -307,7 +307,7 @@ pub enum VariantEncoding {
impl VariantEncoding {
pub fn new_variant_decoder(&self) -> VariantDecoder {
match *self {
VariantEncoding::SingleByte(table) => SingleByteDecoder::new(table),
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
VariantEncoding::Utf8 => Utf8Decoder::new(),
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
VariantEncoding::Big5 => Big5Decoder::new(),
@ -324,7 +324,9 @@ impl VariantEncoding {
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
match *self {
VariantEncoding::SingleByte(table) => SingleByteEncoder::new(encoding, table),
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => {
SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length)
}
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
@ -339,4 +341,11 @@ impl VariantEncoding {
}
}
}
/// Returns `true` for the `SingleByte` and `UserDefined` variants and
/// `false` for every other variant.
pub fn is_single_byte(&self) -> bool {
    if let VariantEncoding::SingleByte(..) = *self {
        return true;
    }
    if let VariantEncoding::UserDefined = *self {
        return true;
    }
    false
}
}

View File

@ -56,7 +56,7 @@ impl UserDefinedDecoder {
destination_handle.write_ascii(b);
continue;
}
destination_handle.write_upper_bmp((b as usize + 0xF700usize) as u16);
destination_handle.write_upper_bmp(u16::from(b) + 0xF700);
continue;
},
self,
@ -93,9 +93,9 @@ impl UserDefinedDecoder {
*to = {
let unit = *from;
if unit < 0x80 {
unit as u16
u16::from(unit)
} else {
(unit as u16) + 0xF700
u16::from(unit) + 0xF700
}
}
});
@ -120,11 +120,11 @@ impl UserDefinedDecoder {
let src_ptr = src.as_ptr();
let dst_ptr = dst.as_mut_ptr();
for i in 0..simd_iterations {
let input = unsafe { load16_unaligned(src_ptr.offset((i * 16) as isize)) };
let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) };
let (first, second) = simd_unpack(input);
unsafe {
store8_unaligned(dst_ptr.offset((i * 16) as isize), shift_upper(first));
store8_unaligned(dst_ptr.offset(((i * 16) + 8) as isize), shift_upper(second));
store8_unaligned(dst_ptr.add(i * 16), shift_upper(first));
store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second));
}
}
let src_tail = &src[tail_start..length];
@ -136,9 +136,9 @@ impl UserDefinedDecoder {
*to = {
let unit = *from;
if unit < 0x80 {
unit as u16
u16::from(unit)
} else {
(unit as u16) + 0xF700
u16::from(unit) + 0xF700
}
}
});
@ -182,7 +182,7 @@ impl UserDefinedEncoder {
destination_handle.written(),
);
}
destination_handle.write_one((c as usize - 0xF700usize) as u8);
destination_handle.write_one((u32::from(c) - 0xF700) as u8);
continue;
},
self,