diff --git a/doc/Big5.txt b/doc/Big5.txt new file mode 100644 index 0000000..61e8fd5 --- /dev/null +++ b/doc/Big5.txt @@ -0,0 +1,16 @@ +/// This is Big5 with HKSCS with mappings to more recent Unicode assignments +/// instead of the Private Use Area code points that have been used historically. +/// It is believed to be able to decode existing Web content in a way that makes +/// sense. +/// +/// To avoid form submissions generating data that Web servers don't understand, +/// the encoder doesn't use the HKSCS byte sequences that precede the unextended +/// Big5 in the lexical order. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/big5.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html) +/// +/// This encoding is designed to be suited for decoding the Windows code page 950 +/// and its HKSCS patched "951" variant such that the text makes sense, given +/// assignments that Unicode has made after those encodings used Private Use +/// Area characters. diff --git a/doc/EUC-JP.txt b/doc/EUC-JP.txt new file mode 100644 index 0000000..f90a735 --- /dev/null +++ b/doc/EUC-JP.txt @@ -0,0 +1,12 @@ +/// This is the legacy Unix encoding for Japanese. +/// +/// For compatibility with Web servers that don't expect three-byte sequences +/// in form submissions, the encoder doesn't generate three-byte sequences. +/// That is, the JIS X 0212 support is decode-only. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html) +/// +/// This encoding roughly matches the Windows code page 20932. There are error +/// handling differences and a handful of 2-byte sequences that decode differently. +/// Additionally, Windows doesn't support 3-byte sequences. 
diff --git a/doc/EUC-KR.txt b/doc/EUC-KR.txt new file mode 100644 index 0000000..ef24c98 --- /dev/null +++ b/doc/EUC-KR.txt @@ -0,0 +1,10 @@ +/// This is the Korean encoding for Windows. It extends the Unix legacy encoding +/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS +/// Classic), with all the characters from the Hangul Syllables block of Unicode. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html) +/// +/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80 +/// to U+0080 and some byte sequences that are error per the Encoding Standard to +/// the question mark or the Private Use Area. diff --git a/doc/GBK.txt b/doc/GBK.txt new file mode 100644 index 0000000..2faefff --- /dev/null +++ b/doc/GBK.txt @@ -0,0 +1,16 @@ +/// The decoder for this encoding is the same as the decoder for gb18030. +/// The encoder side of this encoding is GBK with Windows code page 936 euro +/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs +/// Unicode block as well as a handful of ideographs from the CJK Unified +/// Ideographs Extension A and CJK Compatibility Ideographs blocks. +/// +/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't +/// unified with the gb18030 encoder in the Encoding Standard out of concern +/// that servers that expect GBK form submissions might not be able to handle +/// the four-byte sequences. +/// +/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), +/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html) +/// +/// The encoder of this encoding roughly matches the Windows code page 936. +/// The decoder side is a superset. 
diff --git a/doc/IBM866.txt b/doc/IBM866.txt new file mode 100644 index 0000000..871ff42 --- /dev/null +++ b/doc/IBM866.txt @@ -0,0 +1,8 @@ +/// This is the most notable one of the DOS Cyrillic code pages. It has the same +/// box drawing characters as code page 437, so it can be used for decoding +/// DOS-era ASCII + box drawing data. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html) +/// +/// This encoding matches the Windows code page 866. diff --git a/doc/ISO-2022-JP.txt b/doc/ISO-2022-JP.txt new file mode 100644 index 0000000..65713a1 --- /dev/null +++ b/doc/ISO-2022-JP.txt @@ -0,0 +1,10 @@ +/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII +/// byte range to encode non-Basic Latin characters. It's the only encoding +/// supported by this crate whose encoder is stateful. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html) +/// +/// This encoding roughly matches the Windows code page 50220. Notably, Windows +/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in +/// error handling. diff --git a/doc/ISO-8859-10.txt b/doc/ISO-8859-10.txt new file mode 100644 index 0000000..8aca388 --- /dev/null +++ b/doc/ISO-8859-10.txt @@ -0,0 +1,8 @@ +/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding +/// is also known as Latin 6. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html) +/// +/// The Windows code page number for this encoding is 28600, but kernel32.dll +/// does not support this encoding. 
diff --git a/doc/ISO-8859-13.txt b/doc/ISO-8859-13.txt new file mode 100644 index 0000000..20cd549 --- /dev/null +++ b/doc/ISO-8859-13.txt @@ -0,0 +1,8 @@ +/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding +/// is also known as Latin 7. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html) +/// +/// This encoding matches the Windows code page 28603, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. diff --git a/doc/ISO-8859-14.txt b/doc/ISO-8859-14.txt new file mode 100644 index 0000000..3e4833b --- /dev/null +++ b/doc/ISO-8859-14.txt @@ -0,0 +1,8 @@ +/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding +/// is also known as Latin 8. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html) +/// +/// The Windows code page number for this encoding is 28604, but kernel32.dll +/// does not support this encoding. diff --git a/doc/ISO-8859-15.txt b/doc/ISO-8859-15.txt new file mode 100644 index 0000000..922896a --- /dev/null +++ b/doc/ISO-8859-15.txt @@ -0,0 +1,7 @@ +/// This is the revised Western European part of the ISO/IEC 8859 encoding +/// family. This encoding is also known as Latin 9. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html) +/// +/// This encoding matches the Windows code page 28605. diff --git a/doc/ISO-8859-16.txt b/doc/ISO-8859-16.txt new file mode 100644 index 0000000..d1ae50b --- /dev/null +++ b/doc/ISO-8859-16.txt @@ -0,0 +1,8 @@ +/// This is the South-Eastern European part of the ISO/IEC 8859 encoding +/// family. This encoding is also known as Latin 10. 
+/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html) +/// +/// The Windows code page number for this encoding is 28606, but kernel32.dll +/// does not support this encoding. diff --git a/doc/ISO-8859-2.txt b/doc/ISO-8859-2.txt new file mode 100644 index 0000000..298df09 --- /dev/null +++ b/doc/ISO-8859-2.txt @@ -0,0 +1,6 @@ +/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html) +/// +/// This encoding matches the Windows code page 28592. diff --git a/doc/ISO-8859-3.txt b/doc/ISO-8859-3.txt new file mode 100644 index 0000000..c462ce8 --- /dev/null +++ b/doc/ISO-8859-3.txt @@ -0,0 +1,6 @@ +/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html) +/// +/// This encoding matches the Windows code page 28593. diff --git a/doc/ISO-8859-4.txt b/doc/ISO-8859-4.txt new file mode 100644 index 0000000..40449c4 --- /dev/null +++ b/doc/ISO-8859-4.txt @@ -0,0 +1,6 @@ +/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html) +/// +/// This encoding matches the Windows code page 28594. 
diff --git a/doc/ISO-8859-5.txt b/doc/ISO-8859-5.txt new file mode 100644 index 0000000..41774ec --- /dev/null +++ b/doc/ISO-8859-5.txt @@ -0,0 +1,6 @@ +/// This is the Cyrillic part of the ISO/IEC 8859 encoding family. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html) +/// +/// This encoding matches the Windows code page 28595. diff --git a/doc/ISO-8859-6.txt b/doc/ISO-8859-6.txt new file mode 100644 index 0000000..4c70c22 --- /dev/null +++ b/doc/ISO-8859-6.txt @@ -0,0 +1,7 @@ +/// This is the Arabic part of the ISO/IEC 8859 encoding family. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html) +/// +/// This encoding matches the Windows code page 28596, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. diff --git a/doc/ISO-8859-7.txt b/doc/ISO-8859-7.txt new file mode 100644 index 0000000..b78ed38 --- /dev/null +++ b/doc/ISO-8859-7.txt @@ -0,0 +1,11 @@ +/// This is the Greek part of the ISO/IEC 8859 encoding family. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html) +/// +/// This encoding roughly matches the Windows code page 28597. Windows decodes +/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as +/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area +/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA +/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER +/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK. 
diff --git a/doc/ISO-8859-8-I.txt b/doc/ISO-8859-8-I.txt new file mode 100644 index 0000000..b73e572 --- /dev/null +++ b/doc/ISO-8859-8-I.txt @@ -0,0 +1,9 @@ +/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html) +/// +/// This encoding roughly matches the Windows code page 38598. Windows decodes +/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use +/// Area instead of LRM and RLM. Windows decodes unassigned code points to +/// the private use area. diff --git a/doc/ISO-8859-8.txt b/doc/ISO-8859-8.txt new file mode 100644 index 0000000..c5600e3 --- /dev/null +++ b/doc/ISO-8859-8.txt @@ -0,0 +1,9 @@ +/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html) +/// +/// This encoding roughly matches the Windows code page 28598. Windows decodes +/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use +/// Area instead of LRM and RLM. Windows decodes unassigned code points to +/// the private use area. diff --git a/doc/KOI8-R.txt b/doc/KOI8-R.txt new file mode 100644 index 0000000..46dcfe7 --- /dev/null +++ b/doc/KOI8-R.txt @@ -0,0 +1,6 @@ +/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489). +/// +/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html) +/// +/// This encoding matches the Windows code page 20866. 
diff --git a/doc/KOI8-U.txt b/doc/KOI8-U.txt new file mode 100644 index 0000000..a263745 --- /dev/null +++ b/doc/KOI8-U.txt @@ -0,0 +1,6 @@ +/// This is an encoding for Ukrainian adapted from KOI8-R. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html) +/// +/// This encoding matches the Windows code page 21866. diff --git a/doc/Shift_JIS.txt b/doc/Shift_JIS.txt new file mode 100644 index 0000000..b982ab5 --- /dev/null +++ b/doc/Shift_JIS.txt @@ -0,0 +1,8 @@ +/// This is the Japanese encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html) +/// +/// This encoding matches the Windows code page 932, except Windows decodes some byte +/// sequences that are error per the Encoding Standard to the question mark or the +/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER. diff --git a/doc/UTF-16BE.txt b/doc/UTF-16BE.txt new file mode 100644 index 0000000..0a7df99 --- /dev/null +++ b/doc/UTF-16BE.txt @@ -0,0 +1,8 @@ +/// This decode-only encoding uses 16-bit code units due to Unicode originally +/// having been designed as a 16-bit repertoire. In the absence of a byte order +/// mark the big endian byte order is assumed. +/// +/// There is no corresponding encoder in this crate or in the Encoding +/// Standard. The output encoding of this encoding is UTF-8. +/// +/// This encoding matches the Windows code page 1201. diff --git a/doc/UTF-16LE.txt b/doc/UTF-16LE.txt new file mode 100644 index 0000000..3a98e8b --- /dev/null +++ b/doc/UTF-16LE.txt @@ -0,0 +1,8 @@ +/// This decode-only encoding uses 16-bit code units due to Unicode originally +/// having been designed as a 16-bit repertoire. In the absence of a byte order +/// mark the little endian byte order is assumed. 
+/// +/// There is no corresponding encoder in this crate or in the Encoding +/// Standard. The output encoding of this encoding is UTF-8. +/// +/// This encoding matches the Windows code page 1200. diff --git a/doc/UTF-8.txt b/doc/UTF-8.txt new file mode 100644 index 0000000..3a93e67 --- /dev/null +++ b/doc/UTF-8.txt @@ -0,0 +1,5 @@ +/// This is the encoding that should be used for all new development. It can +/// represent all of Unicode. +/// +/// This encoding matches the Windows code page 65001, except Windows differs +/// in the number of errors generated for some erroneous byte sequences. diff --git a/doc/gb18030.txt b/doc/gb18030.txt new file mode 100644 index 0000000..572a593 --- /dev/null +++ b/doc/gb18030.txt @@ -0,0 +1,9 @@ +/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0 +/// maps to U+3000 for compatibility with existing Web content. As a result, +/// this encoding can represent all of Unicode except for the private-use +/// character U+E5E5. +/// +/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), +/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html) +/// +/// This encoding matches the Windows code page 54936. diff --git a/doc/macintosh.txt b/doc/macintosh.txt new file mode 100644 index 0000000..d00fece --- /dev/null +++ b/doc/macintosh.txt @@ -0,0 +1,7 @@ +/// This is the MacRoman encoding from Mac OS Classic. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html) +/// +/// This encoding matches the Windows code page 10000, except Windows decodes +/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA. 
diff --git a/doc/replacement.txt b/doc/replacement.txt new file mode 100644 index 0000000..2398df0 --- /dev/null +++ b/doc/replacement.txt @@ -0,0 +1,10 @@ +/// This decode-only encoding decodes all non-zero-length streams to a single +/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an +/// ASCII-compatible fallback encoding (typically windows-1252) for some +/// encodings that are no longer supported by the Web Platform and that +/// would be dangerous to treat as ASCII-compatible. +/// +/// There is no corresponding encoder. The output encoding of this encoding +/// is UTF-8. +/// +/// This encoding does not have a Windows code page number. diff --git a/doc/windows-1250.txt b/doc/windows-1250.txt new file mode 100644 index 0000000..96e38ef --- /dev/null +++ b/doc/windows-1250.txt @@ -0,0 +1,6 @@ +/// This is the Central European encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html) +/// +/// This encoding matches the Windows code page 1250. diff --git a/doc/windows-1251.txt b/doc/windows-1251.txt new file mode 100644 index 0000000..9645611 --- /dev/null +++ b/doc/windows-1251.txt @@ -0,0 +1,6 @@ +/// This is the Cyrillic encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html) +/// +/// This encoding matches the Windows code page 1251. diff --git a/doc/windows-1252.txt b/doc/windows-1252.txt new file mode 100644 index 0000000..d613fbe --- /dev/null +++ b/doc/windows-1252.txt @@ -0,0 +1,7 @@ +/// This is the Western encoding for Windows. It is an extension of ISO-8859-1, +/// which is known as Latin 1. 
+/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html) +/// +/// This encoding matches the Windows code page 1252. diff --git a/doc/windows-1253.txt b/doc/windows-1253.txt new file mode 100644 index 0000000..edcacd9 --- /dev/null +++ b/doc/windows-1253.txt @@ -0,0 +1,8 @@ +/// This is the Greek encoding for Windows. It is mostly an extension of +/// ISO-8859-7, but U+0386 is mapped to a different byte. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html) +/// +/// This encoding matches the Windows code page 1253, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. diff --git a/doc/windows-1254.txt b/doc/windows-1254.txt new file mode 100644 index 0000000..26491a9 --- /dev/null +++ b/doc/windows-1254.txt @@ -0,0 +1,7 @@ +/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9, +/// which is known as Latin 5. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html) +/// +/// This encoding matches the Windows code page 1254. diff --git a/doc/windows-1255.txt b/doc/windows-1255.txt new file mode 100644 index 0000000..cbcf86d --- /dev/null +++ b/doc/windows-1255.txt @@ -0,0 +1,8 @@ +/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I, +/// except for a currency sign swap. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html) +/// +/// This encoding matches the Windows code page 1255, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. 
diff --git a/doc/windows-1256.txt b/doc/windows-1256.txt new file mode 100644 index 0000000..38bf2ef --- /dev/null +++ b/doc/windows-1256.txt @@ -0,0 +1,6 @@ +/// This is the Arabic encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html) +/// +/// This encoding matches the Windows code page 1256. diff --git a/doc/windows-1257.txt b/doc/windows-1257.txt new file mode 100644 index 0000000..fc3fad2 --- /dev/null +++ b/doc/windows-1257.txt @@ -0,0 +1,7 @@ +/// This is the Baltic encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html) +/// +/// This encoding matches the Windows code page 1257, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. diff --git a/doc/windows-1258.txt b/doc/windows-1258.txt new file mode 100644 index 0000000..1ae5bbb --- /dev/null +++ b/doc/windows-1258.txt @@ -0,0 +1,11 @@ +/// This is the Vietnamese encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html) +/// +/// This encoding matches the Windows code page 1258 when used in the +/// non-normalizing mode. Unlike with the other single-byte encodings, the +/// result of decoding is not necessarily in Normalization Form C. On the +/// other hand, input in the Normalization Form C is not encoded without +/// replacement. In general, it's a bad idea to encode to encodings other +/// than UTF-8, but this encoding is especially hazardous to encode to. 
diff --git a/doc/windows-874.txt b/doc/windows-874.txt new file mode 100644 index 0000000..ddbc711 --- /dev/null +++ b/doc/windows-874.txt @@ -0,0 +1,7 @@ +/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html) +/// +/// This encoding matches the Windows code page 874, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. diff --git a/doc/x-mac-cyrillic.txt b/doc/x-mac-cyrillic.txt new file mode 100644 index 0000000..b5519a1 --- /dev/null +++ b/doc/x-mac-cyrillic.txt @@ -0,0 +1,6 @@ +/// This is the MacUkrainian encoding from Mac OS Classic. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html) +/// +/// This encoding matches the Windows code page 10017. diff --git a/doc/x-user-defined.txt b/doc/x-user-defined.txt new file mode 100644 index 0000000..e00ddc6 --- /dev/null +++ b/doc/x-user-defined.txt @@ -0,0 +1,6 @@ +/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding +/// them to the Private Use Area of Unicode. It was used for loading binary +/// data into a JavaScript string using `XMLHttpRequest` before XHR supported +/// the `"arraybuffer"` response type. +/// +/// This encoding does not have a Windows code page number. 
diff --git a/generate-encoding-data.py b/generate-encoding-data.py index 0b38e90..7c17b18 100644 --- a/generate-encoding-data.py +++ b/generate-encoding-data.py @@ -33,6 +33,13 @@ class Label: def __cmp__(self, other): return cmp_from_end(self.label, other.label) +class CodePage: + def __init__(self, code_page, preferred): + self.code_page = code_page + self.preferred = preferred + def __cmp__(self, other): + return self.code_page, other.code_page + def static_u16_table(name, data): data_file.write('''pub static %s: [u16; %d] = [ ''' % (name, len(data))) @@ -82,6 +89,8 @@ single_byte = [] multi_byte = [] +code_pages = [] + def to_camel_name(name): if name == u"iso-8859-8-i": return u"Iso8I" @@ -98,6 +107,66 @@ def to_snake_name(name): def to_dom_name(name): return name +encodings_by_code_page = { + 932: "Shift_JIS", + 936: "GBK", + 949: "EUC-KR", + 950: "Big5", + 866: "IBM866", + 874: "windows-874", + 1200: "UTF-16LE", + 1201: "UTF-16BE", + 1250: "windows-1250", + 1251: "windows-1251", + 1252: "windows-1252", + 1253: "windows-1253", + 1254: "windows-1254", + 1255: "windows-1255", + 1256: "windows-1256", + 1257: "windows-1257", + 1258: "windows-1258", + 10000: "macintosh", + 10017: "x-mac-cyrillic", + 20866: "KOI8-R", + 20932: "EUC-JP", + 21866: "KOI8-U", + 28592: "ISO-8859-2", + 28593: "ISO-8859-3", + 28594: "ISO-8859-4", + 28595: "ISO-8859-5", + 28596: "ISO-8859-6", + 28597: "ISO-8859-7", + 28598: "ISO-8859-8", + 28600: "ISO-8859-10", + 28603: "ISO-8859-13", + 28604: "ISO-8859-14", + 28605: "ISO-8859-15", + 28606: "ISO-8859-16", + 38598: "ISO-8859-8-I", + 50221: "ISO-2022-JP", + 54936: "gb18030", + 65001: "UTF-8", +} + +code_pages_by_encoding = {} + +for code_page, encoding in encodings_by_code_page.iteritems(): + code_pages_by_encoding[encoding] = code_page + +encoding_by_alias_code_page = { + 951: "Big5", + 20936: "GBK", + 20949: "EUC-KR", + 28591: "windows-1252", + 28599: "windows-1254", + 28601: "windows-847", + 50220: "ISO-2022-JP", + 50222: "ISO-2022-JP", + 
51949: "EUC-JP", + 51936: "GBK", + 51949: "EUC-KR", +} + # for group in data: @@ -177,7 +246,11 @@ for name in preferred: else: variant = to_camel_name(name) - label_file.write('''/// The initializer for the %s encoding. + docfile = open("doc/%s.txt" % name, "r") + doctext = docfile.read() + docfile.close() + + label_file.write('''/// The initializer for the [%s](static.%s.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -196,13 +269,14 @@ pub static %s_INIT: Encoding = Encoding { /// The %s encoding. /// +%s/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static %s: &'static Encoding = &%s_INIT; -''' % (to_dom_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), to_constant_name(name), to_constant_name(name))) +''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name))) label_file.write("""static LABELS_SORTED: [&'static str; %d] = [ """ % len(labels)) diff --git a/src/euc_jp.rs b/src/euc_jp.rs index 9857989..ea9d515 100644 --- a/src/euc_jp.rs +++ b/src/euc_jp.rs @@ -286,7 +286,8 @@ impl EucJpEncoder { let lead = (pointer / 94) + 0xA1; let trail = (pointer % 94) + 0xA1; handle.write_two(lead as u8, trail as u8) - } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 + } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) + || bmp == 0xF929 || bmp == 0xF9DC { // Guaranteed to be found in IBM_KANJI diff --git a/src/euc_kr.rs b/src/euc_kr.rs index d27a1ef..51939d1 100644 --- a/src/euc_kr.rs +++ b/src/euc_kr.rs @@ -205,7 +205,8 @@ fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> { return Some((0x81 + 0x25, 0xA1 + pos)); } } - if in_inclusive_range16(bmp, 
0x2015, 0x266D) || in_inclusive_range16(bmp, 0x321C, 0x33D8) + if in_inclusive_range16(bmp, 0x2015, 0x266D) + || in_inclusive_range16(bmp, 0x321C, 0x33D8) || in_inclusive_range16(bmp, 0xFF3C, 0xFFE5) || in_inclusive_range16(bmp, 0x00A1, 0x00F7) || in_inclusive_range16(bmp, 0x02C7, 0x02DD) diff --git a/src/handles.rs b/src/handles.rs index be481c5..5b46d14 100644 --- a/src/handles.rs +++ b/src/handles.rs @@ -1477,12 +1477,14 @@ impl<'a> Utf8Source<'a> { return unsafe { ::std::mem::transmute(point) }; } if unit < 0xF0u32 { - let point = ((unit & 0xFu32) << 12) | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6) + let point = ((unit & 0xFu32) << 12) + | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6) | (self.slice[self.pos + 2] as u32 & 0x3Fu32); self.pos += 3; return unsafe { ::std::mem::transmute(point) }; } - let point = ((unit & 0x7u32) << 18) | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12) + let point = ((unit & 0x7u32) << 18) + | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12) | ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6) | (self.slice[self.pos + 3] as u32 & 0x3Fu32); self.pos += 4; diff --git a/src/iso_2022_jp.rs b/src/iso_2022_jp.rs index 32a088a..23c53ff 100644 --- a/src/iso_2022_jp.rs +++ b/src/iso_2022_jp.rs @@ -667,7 +667,8 @@ impl Iso2022JpEncoder { let trail = (pointer % 94) + 0x21; handle.write_two(lead as u8, trail as u8); continue; - } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 + } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) + || bmp == 0xF929 || bmp == 0xF9DC { // Guaranteed to be found in IBM_KANJI diff --git a/src/lib.rs b/src/lib.rs index dd608c7..e2e9af2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -561,8 +561,14 @@ //! EncodingCode PagePUARemarks //! //! +//! Shift_JIS932 +//! GBK936 +//! EUC-KR949 +//! Big5950 //! IBM866866 //! windows-874874• +//! UTF-16LE1200 +//! UTF-16BE1201 //! windows-12501250 //! windows-12511251 //! windows-12521252 @@ -575,6 +581,7 @@ //! macintosh100001 //! 
x-mac-cyrillic100172 //! KOI8-R20866 +//! EUC-JP20932 //! KOI8-U21866 //! ISO-8859-228592 //! ISO-8859-328593 @@ -586,6 +593,9 @@ //! ISO-8859-1328603• //! ISO-8859-1528605 //! ISO-8859-8-I385985 +//! ISO-2022-JP50220 +//! gb1803054936 +//! UTF-865001 //! //! //! @@ -739,7 +749,7 @@ const NCR_EXTRA: usize = 10; // 􏿿 const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese -/// The initializer for the Big5 encoding. +/// The initializer for the [Big5](static.BIG5.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -758,13 +768,30 @@ pub static BIG5_INIT: Encoding = Encoding { /// The Big5 encoding. /// +/// This is Big5 with HKSCS with mappings to more recent Unicode assignments +/// instead of the Private Use Area code points that have been used historically. +/// It is believed to be able to decode existing Web content in a way that makes +/// sense. +/// +/// To avoid form submissions generating data that Web servers don't understand, +/// the encoder doesn't use the HKSCS byte sequences that precede the unextended +/// Big5 in the lexical order. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/big5.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html) +/// +/// This encoding is designed to be suited for decoding the Windows code page 950 +/// and its HKSCS patched "951" variant such that the text makes sense, given +/// assignments that Unicode has made after those encodings used Private Use +/// Area characters. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static BIG5: &'static Encoding = &BIG5_INIT; -/// The initializer for the EUC-JP encoding. +/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding. 
/// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -783,13 +810,26 @@ pub static EUC_JP_INIT: Encoding = Encoding { /// The EUC-JP encoding. /// +/// This is the legacy Unix encoding for Japanese. +/// +/// For compatibility with Web servers that don't expect three-byte sequences +/// in form submissions, the encoder doesn't generate three-byte sequences. +/// That is, the JIS X 0212 support is decode-only. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html) +/// +/// This encoding roughly matches the Windows code page 20932. There are error +/// handling differences and a handful of 2-byte sequences that decode differently. +/// Additionally, Windows doesn't support 3-byte sequences. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static EUC_JP: &'static Encoding = &EUC_JP_INIT; -/// The initializer for the EUC-KR encoding. +/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -808,13 +848,24 @@ pub static EUC_KR_INIT: Encoding = Encoding { /// The EUC-KR encoding. /// +/// This is the Korean encoding for Windows. It extends the Unix legacy encoding +/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS +/// Classic), with all the characters from the Hangul Syllables block of Unicode.
+/// +/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html) +/// +/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80 +/// to U+0080 and some byte sequences that are errors per the Encoding Standard to +/// the question mark or the Private Use Area. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static EUC_KR: &'static Encoding = &EUC_KR_INIT; -/// The initializer for the GBK encoding. +/// The initializer for the [GBK](static.GBK.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -833,13 +884,30 @@ pub static GBK_INIT: Encoding = Encoding { /// The GBK encoding. /// +/// The decoder for this encoding is the same as the decoder for gb18030. +/// The encoder side of this encoding is GBK with Windows code page 936 euro +/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs +/// Unicode block as well as a handful of ideographs from the CJK Unified +/// Ideographs Extension A and CJK Compatibility Ideographs blocks. +/// +/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't +/// unified with the gb18030 encoder in the Encoding Standard out of concern +/// that servers that expect GBK form submissions might not be able to handle +/// the four-byte sequences. +/// +/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), +/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html) +/// +/// The encoder of this encoding roughly matches the Windows code page 936. +/// The decoder side is a superset.
+/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static GBK: &'static Encoding = &GBK_INIT; -/// The initializer for the IBM866 encoding. +/// The initializer for the [IBM866](static.IBM866.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -858,13 +926,22 @@ pub static IBM866_INIT: Encoding = Encoding { /// The IBM866 encoding. /// +/// This is the most notable one of the DOS Cyrillic code pages. It has the same +/// box drawing characters as code page 437, so it can be used for decoding +/// DOS-era ASCII + box drawing data. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html) +/// +/// This encoding matches the Windows code page 866. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static IBM866: &'static Encoding = &IBM866_INIT; -/// The initializer for the ISO-2022-JP encoding. +/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -883,13 +960,24 @@ pub static ISO_2022_JP_INIT: Encoding = Encoding { /// The ISO-2022-JP encoding. /// +/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII +/// byte range to encode non-Basic Latin characters. It's the only encoding +/// supported by this crate whose encoder is stateful.
+/// +/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html) +/// +/// This encoding roughly matches the Windows code page 50220. Notably, Windows +/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in +/// error handling. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT; -/// The initializer for the ISO-8859-10 encoding. +/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -908,13 +996,22 @@ pub static ISO_8859_10_INIT: Encoding = Encoding { /// The ISO-8859-10 encoding. /// +/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding +/// is also known as Latin 6. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html) +/// +/// The Windows code page number for this encoding is 28600, but kernel32.dll +/// does not support this encoding. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT; -/// The initializer for the ISO-8859-13 encoding. +/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding. 
/// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -933,13 +1030,22 @@ pub static ISO_8859_13_INIT: Encoding = Encoding { /// The ISO-8859-13 encoding. /// +/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding +/// is also known as Latin 7. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html) +/// +/// This encoding matches the Windows code page 28603, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT; -/// The initializer for the ISO-8859-14 encoding. +/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -958,13 +1064,22 @@ pub static ISO_8859_14_INIT: Encoding = Encoding { /// The ISO-8859-14 encoding. /// +/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding +/// is also known as Latin 8. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html) +/// +/// The Windows code page number for this encoding is 28604, but kernel32.dll +/// does not support this encoding. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. 
pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT; -/// The initializer for the ISO-8859-15 encoding. +/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -983,13 +1098,21 @@ pub static ISO_8859_15_INIT: Encoding = Encoding { /// The ISO-8859-15 encoding. /// +/// This is the revised Western European part of the ISO/IEC 8859 encoding +/// family. This encoding is also known as Latin 9. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html) +/// +/// This encoding matches the Windows code page 28605. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT; -/// The initializer for the ISO-8859-16 encoding. +/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1008,13 +1131,22 @@ pub static ISO_8859_16_INIT: Encoding = Encoding { /// The ISO-8859-16 encoding. /// +/// This is the South-Eastern European part of the ISO/IEC 8859 encoding +/// family. This encoding is also known as Latin 10. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html) +/// +/// The Windows code page number for this encoding is 28606, but kernel32.dll +/// does not support this encoding. 
+/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT; -/// The initializer for the ISO-8859-2 encoding. +/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1033,13 +1165,20 @@ pub static ISO_8859_2_INIT: Encoding = Encoding { /// The ISO-8859-2 encoding. /// +/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html) +/// +/// This encoding matches the Windows code page 28592. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT; -/// The initializer for the ISO-8859-3 encoding. +/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1058,13 +1197,20 @@ pub static ISO_8859_3_INIT: Encoding = Encoding { /// The ISO-8859-3 encoding. /// +/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html) +/// +/// This encoding matches the Windows code page 28593. 
+/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT; -/// The initializer for the ISO-8859-4 encoding. +/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1083,13 +1229,20 @@ pub static ISO_8859_4_INIT: Encoding = Encoding { /// The ISO-8859-4 encoding. /// +/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html) +/// +/// This encoding matches the Windows code page 28594. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT; -/// The initializer for the ISO-8859-5 encoding. +/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1108,13 +1261,20 @@ pub static ISO_8859_5_INIT: Encoding = Encoding { /// The ISO-8859-5 encoding. /// +/// This is the Cyrillic part of the ISO/IEC 8859 encoding family. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html) +/// +/// This encoding matches the Windows code page 28595. 
+/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT; -/// The initializer for the ISO-8859-6 encoding. +/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1133,13 +1293,21 @@ pub static ISO_8859_6_INIT: Encoding = Encoding { /// The ISO-8859-6 encoding. /// +/// This is the Arabic part of the ISO/IEC 8859 encoding family. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html) +/// +/// This encoding matches the Windows code page 28596, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT; -/// The initializer for the ISO-8859-7 encoding. +/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1158,13 +1326,25 @@ pub static ISO_8859_7_INIT: Encoding = Encoding { /// The ISO-8859-7 encoding. /// +/// This is the Greek part of the ISO/IEC 8859 encoding family. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html) +/// +/// This encoding roughly matches the Windows code page 28597. 
Windows decodes +/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as +/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area +/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA +/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER +/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT; -/// The initializer for the ISO-8859-8 encoding. +/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1183,13 +1363,23 @@ pub static ISO_8859_8_INIT: Encoding = Encoding { /// The ISO-8859-8 encoding. /// +/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html) +/// +/// This encoding roughly matches the Windows code page 28598. Windows decodes +/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use +/// Area instead of LRM and RLM. Windows decodes unassigned code points to +/// the private use area. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT; -/// The initializer for the ISO-8859-8-I encoding. +/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding. 
/// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1208,13 +1398,23 @@ pub static ISO_8859_8_I_INIT: Encoding = Encoding { /// The ISO-8859-8-I encoding. /// +/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html) +/// +/// This encoding roughly matches the Windows code page 38598. Windows decodes +/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use +/// Area instead of LRM and RLM. Windows decodes unassigned code points to +/// the private use area. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT; -/// The initializer for the KOI8-R encoding. +/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1233,13 +1433,20 @@ pub static KOI8_R_INIT: Encoding = Encoding { /// The KOI8-R encoding. /// +/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489). +/// +/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html) +/// +/// This encoding matches the Windows code page 20866. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static KOI8_R: &'static Encoding = &KOI8_R_INIT; -/// The initializer for the KOI8-U encoding. 
+/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1258,13 +1465,20 @@ pub static KOI8_U_INIT: Encoding = Encoding { /// The KOI8-U encoding. /// +/// This is an encoding for Ukrainian adapted from KOI8-R. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html) +/// +/// This encoding matches the Windows code page 21866. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static KOI8_U: &'static Encoding = &KOI8_U_INIT; -/// The initializer for the Shift_JIS encoding. +/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1283,13 +1497,22 @@ pub static SHIFT_JIS_INIT: Encoding = Encoding { /// The Shift_JIS encoding. /// +/// This is the Japanese encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html) +/// +/// This encoding matches the Windows code page 932, except Windows decodes some byte +/// sequences that are errors per the Encoding Standard to the question mark or the +/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT; -/// The initializer for the UTF-16BE encoding.
+/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1308,13 +1531,22 @@ pub static UTF_16BE_INIT: Encoding = Encoding { /// The UTF-16BE encoding. /// +/// This decode-only encoding uses 16-bit code units due to Unicode originally +/// having been designed as a 16-bit repertoire. In the absence of a byte order +/// mark the big endian byte order is assumed. +/// +/// There is no corresponding encoder in this crate or in the Encoding +/// Standard. The output encoding of this encoding is UTF-8. +/// +/// This encoding matches the Windows code page 1201. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT; -/// The initializer for the UTF-16LE encoding. +/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1333,13 +1565,22 @@ pub static UTF_16LE_INIT: Encoding = Encoding { /// The UTF-16LE encoding. /// +/// This decode-only encoding uses 16-bit code units due to Unicode originally +/// having been designed as a 16-bit repertoire. In the absence of a byte order +/// mark the little endian byte order is assumed. +/// +/// There is no corresponding encoder in this crate or in the Encoding +/// Standard. The output encoding of this encoding is UTF-8. +/// +/// This encoding matches the Windows code page 1200. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`.
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT; -/// The initializer for the UTF-8 encoding. +/// The initializer for the [UTF-8](static.UTF_8.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1358,13 +1599,19 @@ pub static UTF_8_INIT: Encoding = Encoding { /// The UTF-8 encoding. /// +/// This is the encoding that should be used for all new development. It can +/// represent all of Unicode. +/// +/// This encoding matches the Windows code page 65001, except Windows differs +/// in the number of errors generated for some erroneous byte sequences. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static UTF_8: &'static Encoding = &UTF_8_INIT; -/// The initializer for the gb18030 encoding. +/// The initializer for the [gb18030](static.GB18030.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1383,13 +1630,23 @@ pub static GB18030_INIT: Encoding = Encoding { /// The gb18030 encoding. /// +/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0 +/// maps to U+3000 for compatibility with existing Web content. As a result, +/// this encoding can represent all of Unicode except for the private-use +/// character U+E5E5. +/// +/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), +/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html) +/// +/// This encoding matches the Windows code page 54936. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`.
pub static GB18030: &'static Encoding = &GB18030_INIT; -/// The initializer for the macintosh encoding. +/// The initializer for the [macintosh](static.MACINTOSH.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1408,13 +1665,21 @@ pub static MACINTOSH_INIT: Encoding = Encoding { /// The macintosh encoding. /// +/// This is the MacRoman encoding from Mac OS Classic. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html) +/// +/// This encoding matches the Windows code page 10000, except Windows decodes +/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT; -/// The initializer for the replacement encoding. +/// The initializer for the [replacement](static.REPLACEMENT.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1433,13 +1698,24 @@ pub static REPLACEMENT_INIT: Encoding = Encoding { /// The replacement encoding. /// +/// This decode-only encoding decodes all non-zero-length streams to a single +/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an +/// ASCII-compatible fallback encoding (typically windows-1252) for some +/// encodings that are no longer supported by the Web Platform and that +/// would be dangerous to treat as ASCII-compatible. +/// +/// There is no corresponding encoder. The output encoding of this encoding +/// is UTF-8. +/// +/// This encoding does not have a Windows code page number. 
+/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT; -/// The initializer for the windows-1250 encoding. +/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1458,13 +1734,20 @@ pub static WINDOWS_1250_INIT: Encoding = Encoding { /// The windows-1250 encoding. /// +/// This is the Central European encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html) +/// +/// This encoding matches the Windows code page 1250. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT; -/// The initializer for the windows-1251 encoding. +/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1483,13 +1766,20 @@ pub static WINDOWS_1251_INIT: Encoding = Encoding { /// The windows-1251 encoding. /// +/// This is the Cyrillic encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html) +/// +/// This encoding matches the Windows code page 1251. 
+/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT; -/// The initializer for the windows-1252 encoding. +/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1508,13 +1798,21 @@ pub static WINDOWS_1252_INIT: Encoding = Encoding { /// The windows-1252 encoding. /// +/// This is the Western encoding for Windows. It is an extension of ISO-8859-1, +/// which is known as Latin 1. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html) +/// +/// This encoding matches the Windows code page 1252. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT; -/// The initializer for the windows-1253 encoding. +/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1533,13 +1831,22 @@ pub static WINDOWS_1253_INIT: Encoding = Encoding { /// The windows-1253 encoding. /// +/// This is the Greek encoding for Windows. It is mostly an extension of +/// ISO-8859-7, but U+0386 is mapped to a different byte. 
+/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html) +/// +/// This encoding matches the Windows code page 1253, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT; -/// The initializer for the windows-1254 encoding. +/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1558,13 +1865,21 @@ pub static WINDOWS_1254_INIT: Encoding = Encoding { /// The windows-1254 encoding. /// +/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9, +/// which is known as Latin 5. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html) +/// +/// This encoding matches the Windows code page 1254. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT; -/// The initializer for the windows-1255 encoding. +/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1583,13 +1898,22 @@ pub static WINDOWS_1255_INIT: Encoding = Encoding { /// The windows-1255 encoding. /// +/// This is the Hebrew encoding for Windows. 
It is an extension of ISO-8859-8-I, +/// except for a currency sign swap. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html) +/// +/// This encoding matches the Windows code page 1255, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT; -/// The initializer for the windows-1256 encoding. +/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1608,13 +1932,20 @@ pub static WINDOWS_1256_INIT: Encoding = Encoding { /// The windows-1256 encoding. /// +/// This is the Arabic encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html) +/// +/// This encoding matches the Windows code page 1256. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT; -/// The initializer for the windows-1257 encoding. +/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1633,13 +1964,21 @@ pub static WINDOWS_1257_INIT: Encoding = Encoding { /// The windows-1257 encoding. /// +/// This is the Baltic encoding for Windows. 
+/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html) +/// +/// This encoding matches the Windows code page 1257, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT; -/// The initializer for the windows-1258 encoding. +/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1658,13 +1997,25 @@ pub static WINDOWS_1258_INIT: Encoding = Encoding { /// The windows-1258 encoding. /// +/// This is the Vietnamese encoding for Windows. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html) +/// +/// This encoding matches the Windows code page 1258 when used in the +/// non-normalizing mode. Unlike with the other single-byte encodings, the +/// result of decoding is not necessarily in Normalization Form C. On the +/// other hand, input in the Normalization Form C is not necessarily encoded without +/// replacement. In general, it's a bad idea to encode to encodings other +/// than UTF-8, but this encoding is especially hazardous to encode to. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT; -/// The initializer for the windows-874 encoding.
+/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1683,13 +2034,21 @@ pub static WINDOWS_874_INIT: Encoding = Encoding { /// The windows-874 encoding. /// +/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html) +/// +/// This encoding matches the Windows code page 874, except Windows decodes +/// unassigned code points to the Private Use Area of Unicode. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT; -/// The initializer for the x-mac-cyrillic encoding. +/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1708,13 +2067,20 @@ pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding { /// The x-mac-cyrillic encoding. /// +/// This is the MacUkrainian encoding from Mac OS Classic. +/// +/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html), +/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html) +/// +/// This encoding matches the Windows code page 10017. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this /// `static`. pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT; -/// The initializer for the x-user-defined encoding. 
+/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding. /// /// For use only for taking the address of this form when /// Rust prohibits the use of the non-`_INIT` form directly, @@ -1733,6 +2099,13 @@ pub static X_USER_DEFINED_INIT: Encoding = Encoding { /// The x-user-defined encoding. /// +/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding +/// them to the Private Use Area of Unicode. It was used for loading binary +/// data into a JavaScript string using `XMLHttpRequest` before XHR supported +/// the `"arraybuffer"` response type. +/// +/// This encoding does not have a Windows code page number. +/// /// This will change from `static` to `const` if Rust changes /// to make the referent of `pub const FOO: &'static Encoding` /// unique cross-crate, so don't take the address of this @@ -3347,7 +3720,8 @@ impl Decoder { | DecoderLifeCycle::AtUtf8Start | DecoderLifeCycle::AtUtf16LeStart | DecoderLifeCycle::AtUtf16BeStart => { - return self.variant + return self + .variant .max_utf8_buffer_length_without_replacement(byte_length) } DecoderLifeCycle::AtStart => { @@ -3362,7 +3736,8 @@ impl Decoder { // No need to consider the internal state of the underlying decoder, // because it is at start, because no data has reached it yet. return Some(utf_bom); - } else if let Some(non_bom) = self.variant + } else if let Some(non_bom) = self + .variant .max_utf8_buffer_length_without_replacement(byte_length) { return Some(std::cmp::max(utf_bom, non_bom)); diff --git a/src/mem.rs b/src/mem.rs index 7e84ecb..81c5b6e 100644 --- a/src/mem.rs +++ b/src/mem.rs @@ -195,9 +195,8 @@ macro_rules! 
by_unit_check_simd { } let mut simd_accu = $splat; while offset <= len_minus_stride { - simd_accu = simd_accu | unsafe { - *(src.offset(offset as isize) as *const $simd_ty) - }; + simd_accu = simd_accu + | unsafe { *(src.offset(offset as isize) as *const $simd_ty) }; offset += SIMD_STRIDE_SIZE / unit_size; } if !$func(simd_accu) { @@ -1279,7 +1278,9 @@ pub fn is_char_bidi(c: char) -> bool { // Above Arabic Extended-A and below Arabic Presentation Forms if in_inclusive_range32(code_point, 0x200F, 0x2067) { // In the range that contains the RTL controls - return code_point == 0x200F || code_point == 0x202B || code_point == 0x202E + return code_point == 0x200F + || code_point == 0x202B + || code_point == 0x202E || code_point == 0x2067; } return false; @@ -1514,7 +1515,8 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize { // Three-byte let second = bytes[read + 1]; let third = bytes[read + 2]; - let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6) + let point = (((byte as u32) & 0xFu32) << 12) + | ((second as u32 & 0x3Fu32) << 6) | (third as u32 & 0x3Fu32); dst[written] = point as u16; read += 3; @@ -1524,7 +1526,8 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize { let second = bytes[read + 1]; let third = bytes[read + 2]; let fourth = bytes[read + 3]; - let point = (((byte as u32) & 0x7u32) << 18) | ((second as u32 & 0x3Fu32) << 12) + let point = (((byte as u32) & 0x7u32) << 18) + | ((second as u32 & 0x3Fu32) << 12) | ((third as u32 & 0x3Fu32) << 6) | (fourth as u32 & 0x3Fu32); dst[written] = (0xD7C0 + (point >> 10)) as u16; diff --git a/src/shift_jis.rs b/src/shift_jis.rs index e93ae2c..1aea7c3 100644 --- a/src/shift_jis.rs +++ b/src/shift_jis.rs @@ -248,7 +248,8 @@ impl ShiftJisEncoder { 10716 + bmp_minus_roman as usize } else if let Some(pointer) = jis0208_range_encode(bmp) { pointer - } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 + } else if in_inclusive_range16(bmp, 0xFA0E, 
0xFA2D) + || bmp == 0xF929 || bmp == 0xF9DC { // Guaranteed to be found in IBM_KANJI diff --git a/src/simd_funcs.rs b/src/simd_funcs.rs index 3c2ec34..e90343a 100644 --- a/src/simd_funcs.rs +++ b/src/simd_funcs.rs @@ -277,10 +277,15 @@ pub fn is_u16x8_bidi(s: u16x8) -> bool { // Quick refutation failed. Let's do the full check. - (in_range16x8!(s, 0x0590, 0x0900) | in_range16x8!(s, 0xFB50, 0xFE00) - | in_range16x8!(s, 0xFE70, 0xFF00) | in_range16x8!(s, 0xD802, 0xD804) - | in_range16x8!(s, 0xD83A, 0xD83C) | s.eq(u16x8::splat(0x200F)) - | s.eq(u16x8::splat(0x202B)) | s.eq(u16x8::splat(0x202E)) | s.eq(u16x8::splat(0x2067))) + (in_range16x8!(s, 0x0590, 0x0900) + | in_range16x8!(s, 0xFB50, 0xFE00) + | in_range16x8!(s, 0xFE70, 0xFF00) + | in_range16x8!(s, 0xD802, 0xD804) + | in_range16x8!(s, 0xD83A, 0xD83C) + | s.eq(u16x8::splat(0x200F)) + | s.eq(u16x8::splat(0x202B)) + | s.eq(u16x8::splat(0x202E)) + | s.eq(u16x8::splat(0x2067))) .any() } diff --git a/src/utf_16.rs b/src/utf_16.rs index f3ec16b..8f82010 100644 --- a/src/utf_16.rs +++ b/src/utf_16.rs @@ -29,11 +29,9 @@ impl Utf16Decoder { } pub fn additional_from_state(&self) -> usize { - 1 + if self.lead_byte.is_some() { 1 } else { 0 } + if self.lead_surrogate == 0 { - 0 - } else { - 2 - } + 1 + + if self.lead_byte.is_some() { 1 } else { 0 } + + if self.lead_surrogate == 0 { 0 } else { 2 } } pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option { diff --git a/src/utf_8.rs b/src/utf_8.rs index f9c02d3..db5c62a 100644 --- a/src/utf_8.rs +++ b/src/utf_8.rs @@ -372,7 +372,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz { break 'outer; } - let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6) + let point = (((byte as u32) & 0xFu32) << 12) + | ((second as u32 & 0x3Fu32) << 6) | (third as u32 & 0x3Fu32); dst[written] = point as u16; read = new_read; @@ -393,7 +394,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz { 
break 'outer; } - let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6) + let point = (((byte as u32) & 0xFu32) << 12) + | ((second as u32 & 0x3Fu32) << 6) | (third as u32 & 0x3Fu32); dst[written] = point as u16; read = new_read; @@ -414,7 +416,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz { break 'outer; } - let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6) + let point = (((byte as u32) & 0xFu32) << 12) + | ((second as u32 & 0x3Fu32) << 6) | (third as u32 & 0x3Fu32); dst[written] = point as u16; read = new_read;