diff --git a/doc/Big5.txt b/doc/Big5.txt
new file mode 100644
index 0000000..61e8fd5
--- /dev/null
+++ b/doc/Big5.txt
@@ -0,0 +1,16 @@
+/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
+/// instead of the Private Use Area code points that have been used historically.
+/// It is believed to be able to decode existing Web content in a way that makes
+/// sense.
+///
+/// To avoid form submissions generating data that Web servers don't understand,
+/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
+/// Big5 in the lexical order.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
+///
+/// This encoding is designed to be suited for decoding the Windows code page 950
+/// and its HKSCS patched "951" variant such that the text makes sense, given
+/// assignments that Unicode has made after those encodings used Private Use
+/// Area characters.
diff --git a/doc/EUC-JP.txt b/doc/EUC-JP.txt
new file mode 100644
index 0000000..f90a735
--- /dev/null
+++ b/doc/EUC-JP.txt
@@ -0,0 +1,12 @@
+/// This is the legacy Unix encoding for Japanese.
+///
+/// For compatibility with Web servers that don't expect three-byte sequences
+/// in form submissions, the encoder doesn't generate three-byte sequences.
+/// That is, the JIS X 0212 support is decode-only.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 20932. There are error
+/// handling differences and a handful of 2-byte sequences that decode differently.
+/// Additionally, Windows doesn't support 3-byte sequences.
diff --git a/doc/EUC-KR.txt b/doc/EUC-KR.txt
new file mode 100644
index 0000000..ef24c98
--- /dev/null
+++ b/doc/EUC-KR.txt
@@ -0,0 +1,10 @@
+/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
+/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
+/// Classic), with all the characters from the Hangul Syllables block of Unicode.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
+///
+/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
+/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
+/// the question mark or the Private Use Area.
diff --git a/doc/GBK.txt b/doc/GBK.txt
new file mode 100644
index 0000000..2faefff
--- /dev/null
+++ b/doc/GBK.txt
@@ -0,0 +1,16 @@
+/// The decoder for this encoding is the same as the decoder for gb18030.
+/// The encoder side of this encoding is GBK with Windows code page 936 euro
+/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
+/// Unicode block as well as a handful of ideographs from the CJK Unified
+/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
+///
+/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
+/// unified with the gb18030 encoder in the Encoding Standard out of concern
+/// that servers that expect GBK form submissions might not be able to handle
+/// the four-byte sequences.
+///
+/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
+/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
+///
+/// The encoder of this encoding roughly matches the Windows code page 936.
+/// The decoder side is a superset.
diff --git a/doc/IBM866.txt b/doc/IBM866.txt
new file mode 100644
index 0000000..871ff42
--- /dev/null
+++ b/doc/IBM866.txt
@@ -0,0 +1,8 @@
+/// This is the most notable one of the DOS Cyrillic code pages. It has the same
+/// box drawing characters as code page 437, so it can be used for decoding
+/// DOS-era ASCII + box drawing data.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
+///
+/// This encoding matches the Windows code page 866.
diff --git a/doc/ISO-2022-JP.txt b/doc/ISO-2022-JP.txt
new file mode 100644
index 0000000..65713a1
--- /dev/null
+++ b/doc/ISO-2022-JP.txt
@@ -0,0 +1,10 @@
+/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
+/// byte range to encode non-Basic Latin characters. It's the only encoding
+/// supported by this crate whose encoder is stateful.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 50220. Notably, Windows
+/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
+/// error handling.
diff --git a/doc/ISO-8859-10.txt b/doc/ISO-8859-10.txt
new file mode 100644
index 0000000..8aca388
--- /dev/null
+++ b/doc/ISO-8859-10.txt
@@ -0,0 +1,8 @@
+/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
+/// is also known as Latin 6.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
+///
+/// The Windows code page number for this encoding is 28600, but kernel32.dll
+/// does not support this encoding.
diff --git a/doc/ISO-8859-13.txt b/doc/ISO-8859-13.txt
new file mode 100644
index 0000000..20cd549
--- /dev/null
+++ b/doc/ISO-8859-13.txt
@@ -0,0 +1,8 @@
+/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
+/// is also known as Latin 7.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
+///
+/// This encoding matches the Windows code page 28603, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
diff --git a/doc/ISO-8859-14.txt b/doc/ISO-8859-14.txt
new file mode 100644
index 0000000..3e4833b
--- /dev/null
+++ b/doc/ISO-8859-14.txt
@@ -0,0 +1,8 @@
+/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
+/// is also known as Latin 8.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
+///
+/// The Windows code page number for this encoding is 28604, but kernel32.dll
+/// does not support this encoding.
diff --git a/doc/ISO-8859-15.txt b/doc/ISO-8859-15.txt
new file mode 100644
index 0000000..922896a
--- /dev/null
+++ b/doc/ISO-8859-15.txt
@@ -0,0 +1,7 @@
+/// This is the revised Western European part of the ISO/IEC 8859 encoding
+/// family. This encoding is also known as Latin 9.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
+///
+/// This encoding matches the Windows code page 28605.
diff --git a/doc/ISO-8859-16.txt b/doc/ISO-8859-16.txt
new file mode 100644
index 0000000..d1ae50b
--- /dev/null
+++ b/doc/ISO-8859-16.txt
@@ -0,0 +1,8 @@
+/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
+/// family. This encoding is also known as Latin 10.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
+///
+/// The Windows code page number for this encoding is 28606, but kernel32.dll
+/// does not support this encoding.
diff --git a/doc/ISO-8859-2.txt b/doc/ISO-8859-2.txt
new file mode 100644
index 0000000..298df09
--- /dev/null
+++ b/doc/ISO-8859-2.txt
@@ -0,0 +1,6 @@
+/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
+///
+/// This encoding matches the Windows code page 28592.
diff --git a/doc/ISO-8859-3.txt b/doc/ISO-8859-3.txt
new file mode 100644
index 0000000..c462ce8
--- /dev/null
+++ b/doc/ISO-8859-3.txt
@@ -0,0 +1,6 @@
+/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
+///
+/// This encoding matches the Windows code page 28593.
diff --git a/doc/ISO-8859-4.txt b/doc/ISO-8859-4.txt
new file mode 100644
index 0000000..40449c4
--- /dev/null
+++ b/doc/ISO-8859-4.txt
@@ -0,0 +1,6 @@
+/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
+///
+/// This encoding matches the Windows code page 28594.
diff --git a/doc/ISO-8859-5.txt b/doc/ISO-8859-5.txt
new file mode 100644
index 0000000..41774ec
--- /dev/null
+++ b/doc/ISO-8859-5.txt
@@ -0,0 +1,6 @@
+/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
+///
+/// This encoding matches the Windows code page 28595.
diff --git a/doc/ISO-8859-6.txt b/doc/ISO-8859-6.txt
new file mode 100644
index 0000000..4c70c22
--- /dev/null
+++ b/doc/ISO-8859-6.txt
@@ -0,0 +1,7 @@
+/// This is the Arabic part of the ISO/IEC 8859 encoding family.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
+///
+/// This encoding matches the Windows code page 28596, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
diff --git a/doc/ISO-8859-7.txt b/doc/ISO-8859-7.txt
new file mode 100644
index 0000000..b78ed38
--- /dev/null
+++ b/doc/ISO-8859-7.txt
@@ -0,0 +1,11 @@
+/// This is the Greek part of the ISO/IEC 8859 encoding family.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 28597. Windows decodes
+/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
+/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
+/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
+/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
+/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
diff --git a/doc/ISO-8859-8-I.txt b/doc/ISO-8859-8-I.txt
new file mode 100644
index 0000000..b73e572
--- /dev/null
+++ b/doc/ISO-8859-8-I.txt
@@ -0,0 +1,9 @@
+/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 38598. Windows decodes
+/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
+/// Area instead of LRM and RLM. Windows decodes unassigned code points to
+/// the Private Use Area.
diff --git a/doc/ISO-8859-8.txt b/doc/ISO-8859-8.txt
new file mode 100644
index 0000000..c5600e3
--- /dev/null
+++ b/doc/ISO-8859-8.txt
@@ -0,0 +1,9 @@
+/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 28598. Windows decodes
+/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
+/// Area instead of LRM and RLM. Windows decodes unassigned code points to
+/// the Private Use Area.
diff --git a/doc/KOI8-R.txt b/doc/KOI8-R.txt
new file mode 100644
index 0000000..46dcfe7
--- /dev/null
+++ b/doc/KOI8-R.txt
@@ -0,0 +1,6 @@
+/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
+///
+/// This encoding matches the Windows code page 20866.
diff --git a/doc/KOI8-U.txt b/doc/KOI8-U.txt
new file mode 100644
index 0000000..a263745
--- /dev/null
+++ b/doc/KOI8-U.txt
@@ -0,0 +1,6 @@
+/// This is an encoding for Ukrainian adapted from KOI8-R.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
+///
+/// This encoding matches the Windows code page 21866.
diff --git a/doc/Shift_JIS.txt b/doc/Shift_JIS.txt
new file mode 100644
index 0000000..b982ab5
--- /dev/null
+++ b/doc/Shift_JIS.txt
@@ -0,0 +1,8 @@
+/// This is the Japanese encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
+///
+/// This encoding matches the Windows code page 932, except Windows decodes some byte
+/// sequences that are errors per the Encoding Standard to the question mark or the
+/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
diff --git a/doc/UTF-16BE.txt b/doc/UTF-16BE.txt
new file mode 100644
index 0000000..0a7df99
--- /dev/null
+++ b/doc/UTF-16BE.txt
@@ -0,0 +1,8 @@
+/// This decode-only encoding uses 16-bit code units due to Unicode originally
+/// having been designed as a 16-bit repertoire. In the absence of a byte order
+/// mark the big endian byte order is assumed.
+///
+/// There is no corresponding encoder in this crate or in the Encoding
+/// Standard. The output encoding of this encoding is UTF-8.
+///
+/// This encoding matches the Windows code page 1201.
diff --git a/doc/UTF-16LE.txt b/doc/UTF-16LE.txt
new file mode 100644
index 0000000..3a98e8b
--- /dev/null
+++ b/doc/UTF-16LE.txt
@@ -0,0 +1,8 @@
+/// This decode-only encoding uses 16-bit code units due to Unicode originally
+/// having been designed as a 16-bit repertoire. In the absence of a byte order
+/// mark the little endian byte order is assumed.
+///
+/// There is no corresponding encoder in this crate or in the Encoding
+/// Standard. The output encoding of this encoding is UTF-8.
+///
+/// This encoding matches the Windows code page 1200.
diff --git a/doc/UTF-8.txt b/doc/UTF-8.txt
new file mode 100644
index 0000000..3a93e67
--- /dev/null
+++ b/doc/UTF-8.txt
@@ -0,0 +1,5 @@
+/// This is the encoding that should be used for all new development. It can
+/// represent all of Unicode.
+///
+/// This encoding matches the Windows code page 65001, except Windows differs
+/// in the number of errors generated for some erroneous byte sequences.
diff --git a/doc/gb18030.txt b/doc/gb18030.txt
new file mode 100644
index 0000000..572a593
--- /dev/null
+++ b/doc/gb18030.txt
@@ -0,0 +1,9 @@
+/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
+/// maps to U+3000 for compatibility with existing Web content. As a result,
+/// this encoding can represent all of Unicode except for the private-use
+/// character U+E5E5.
+///
+/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
+/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
+///
+/// This encoding matches the Windows code page 54936.
diff --git a/doc/macintosh.txt b/doc/macintosh.txt
new file mode 100644
index 0000000..d00fece
--- /dev/null
+++ b/doc/macintosh.txt
@@ -0,0 +1,7 @@
+/// This is the MacRoman encoding from Mac OS Classic.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
+///
+/// This encoding matches the Windows code page 10000, except Windows decodes
+/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
diff --git a/doc/replacement.txt b/doc/replacement.txt
new file mode 100644
index 0000000..2398df0
--- /dev/null
+++ b/doc/replacement.txt
@@ -0,0 +1,10 @@
+/// This decode-only encoding decodes all non-zero-length streams to a single
+/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
+/// ASCII-compatible fallback encoding (typically windows-1252) for some
+/// encodings that are no longer supported by the Web Platform and that
+/// would be dangerous to treat as ASCII-compatible.
+///
+/// There is no corresponding encoder. The output encoding of this encoding
+/// is UTF-8.
+///
+/// This encoding does not have a Windows code page number.
diff --git a/doc/windows-1250.txt b/doc/windows-1250.txt
new file mode 100644
index 0000000..96e38ef
--- /dev/null
+++ b/doc/windows-1250.txt
@@ -0,0 +1,6 @@
+/// This is the Central European encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
+///
+/// This encoding matches the Windows code page 1250.
diff --git a/doc/windows-1251.txt b/doc/windows-1251.txt
new file mode 100644
index 0000000..9645611
--- /dev/null
+++ b/doc/windows-1251.txt
@@ -0,0 +1,6 @@
+/// This is the Cyrillic encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
+///
+/// This encoding matches the Windows code page 1251.
diff --git a/doc/windows-1252.txt b/doc/windows-1252.txt
new file mode 100644
index 0000000..d613fbe
--- /dev/null
+++ b/doc/windows-1252.txt
@@ -0,0 +1,7 @@
+/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
+/// which is known as Latin 1.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
+///
+/// This encoding matches the Windows code page 1252.
diff --git a/doc/windows-1253.txt b/doc/windows-1253.txt
new file mode 100644
index 0000000..edcacd9
--- /dev/null
+++ b/doc/windows-1253.txt
@@ -0,0 +1,8 @@
+/// This is the Greek encoding for Windows. It is mostly an extension of
+/// ISO-8859-7, but U+0386 is mapped to a different byte.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
+///
+/// This encoding matches the Windows code page 1253, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
diff --git a/doc/windows-1254.txt b/doc/windows-1254.txt
new file mode 100644
index 0000000..26491a9
--- /dev/null
+++ b/doc/windows-1254.txt
@@ -0,0 +1,7 @@
+/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
+/// which is known as Latin 5.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
+///
+/// This encoding matches the Windows code page 1254.
diff --git a/doc/windows-1255.txt b/doc/windows-1255.txt
new file mode 100644
index 0000000..cbcf86d
--- /dev/null
+++ b/doc/windows-1255.txt
@@ -0,0 +1,8 @@
+/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
+/// except for a currency sign swap.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
+///
+/// This encoding matches the Windows code page 1255, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
diff --git a/doc/windows-1256.txt b/doc/windows-1256.txt
new file mode 100644
index 0000000..38bf2ef
--- /dev/null
+++ b/doc/windows-1256.txt
@@ -0,0 +1,6 @@
+/// This is the Arabic encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
+///
+/// This encoding matches the Windows code page 1256.
diff --git a/doc/windows-1257.txt b/doc/windows-1257.txt
new file mode 100644
index 0000000..fc3fad2
--- /dev/null
+++ b/doc/windows-1257.txt
@@ -0,0 +1,7 @@
+/// This is the Baltic encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
+///
+/// This encoding matches the Windows code page 1257, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
diff --git a/doc/windows-1258.txt b/doc/windows-1258.txt
new file mode 100644
index 0000000..1ae5bbb
--- /dev/null
+++ b/doc/windows-1258.txt
@@ -0,0 +1,11 @@
+/// This is the Vietnamese encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
+///
+/// This encoding matches the Windows code page 1258 when used in the
+/// non-normalizing mode. Unlike with the other single-byte encodings, the
+/// result of decoding is not necessarily in Normalization Form C. On the
+/// other hand, input in the Normalization Form C is not encoded without
+/// replacement. In general, it's a bad idea to encode to encodings other
+/// than UTF-8, but this encoding is especially hazardous to encode to.
diff --git a/doc/windows-874.txt b/doc/windows-874.txt
new file mode 100644
index 0000000..ddbc711
--- /dev/null
+++ b/doc/windows-874.txt
@@ -0,0 +1,7 @@
+/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
+///
+/// This encoding matches the Windows code page 874, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
diff --git a/doc/x-mac-cyrillic.txt b/doc/x-mac-cyrillic.txt
new file mode 100644
index 0000000..b5519a1
--- /dev/null
+++ b/doc/x-mac-cyrillic.txt
@@ -0,0 +1,6 @@
+/// This is the MacUkrainian encoding from Mac OS Classic.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
+///
+/// This encoding matches the Windows code page 10017.
diff --git a/doc/x-user-defined.txt b/doc/x-user-defined.txt
new file mode 100644
index 0000000..e00ddc6
--- /dev/null
+++ b/doc/x-user-defined.txt
@@ -0,0 +1,6 @@
+/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
+/// them to the Private Use Area of Unicode. It was used for loading binary
+/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
+/// the `"arraybuffer"` response type.
+///
+/// This encoding does not have a Windows code page number.
diff --git a/generate-encoding-data.py b/generate-encoding-data.py
index 0b38e90..7c17b18 100644
--- a/generate-encoding-data.py
+++ b/generate-encoding-data.py
@@ -33,6 +33,13 @@ class Label:
def __cmp__(self, other):
return cmp_from_end(self.label, other.label)
+class CodePage:
+ def __init__(self, code_page, preferred):
+ self.code_page = code_page
+ self.preferred = preferred
+ def __cmp__(self, other):
+ return self.code_page, other.code_page
+
def static_u16_table(name, data):
data_file.write('''pub static %s: [u16; %d] = [
''' % (name, len(data)))
@@ -82,6 +89,8 @@ single_byte = []
multi_byte = []
+code_pages = []
+
def to_camel_name(name):
if name == u"iso-8859-8-i":
return u"Iso8I"
@@ -98,6 +107,66 @@ def to_snake_name(name):
def to_dom_name(name):
return name
+encodings_by_code_page = {  # Windows code page number -> encoding name (one entry per encoding).
+ 932: "Shift_JIS",
+ 936: "GBK",
+ 949: "EUC-KR",
+ 950: "Big5",
+ 866: "IBM866",
+ 874: "windows-874",
+ 1200: "UTF-16LE",
+ 1201: "UTF-16BE",
+ 1250: "windows-1250",
+ 1251: "windows-1251",
+ 1252: "windows-1252",
+ 1253: "windows-1253",
+ 1254: "windows-1254",
+ 1255: "windows-1255",
+ 1256: "windows-1256",
+ 1257: "windows-1257",
+ 1258: "windows-1258",
+ 10000: "macintosh",
+ 10017: "x-mac-cyrillic",
+ 20866: "KOI8-R",
+ 20932: "EUC-JP",
+ 21866: "KOI8-U",
+ 28592: "ISO-8859-2",
+ 28593: "ISO-8859-3",
+ 28594: "ISO-8859-4",
+ 28595: "ISO-8859-5",
+ 28596: "ISO-8859-6",
+ 28597: "ISO-8859-7",
+ 28598: "ISO-8859-8",
+ 28600: "ISO-8859-10",
+ 28603: "ISO-8859-13",
+ 28604: "ISO-8859-14",
+ 28605: "ISO-8859-15",
+ 28606: "ISO-8859-16",
+ 38598: "ISO-8859-8-I",
+ 50221: "ISO-2022-JP",  # NOTE(review): doc/ISO-2022-JP.txt and the src/lib.rs table cite 50220 (an alias below) - confirm.
+ 54936: "gb18030",
+ 65001: "UTF-8",
+}
+
+code_pages_by_encoding = {}  # Inverse of encodings_by_code_page: encoding name -> code page number.
+
+for code_page, encoding in encodings_by_code_page.iteritems():
+ code_pages_by_encoding[encoding] = code_page
+
+encoding_by_alias_code_page = {
+ 951: "Big5",
+ 20936: "GBK",
+ 20949: "EUC-KR",
+ 28591: "windows-1252",
+ 28599: "windows-1254",
+ 28601: "windows-847",
+ 50220: "ISO-2022-JP",
+ 50222: "ISO-2022-JP",
+ 51949: "EUC-JP",
+ 51936: "GBK",
+ 51949: "EUC-KR",
+}
+
#
for group in data:
@@ -177,7 +246,11 @@ for name in preferred:
else:
variant = to_camel_name(name)
- label_file.write('''/// The initializer for the %s encoding.
+ docfile = open("doc/%s.txt" % name, "r")
+ doctext = docfile.read()
+ docfile.close()
+
+ label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -196,13 +269,14 @@ pub static %s_INIT: Encoding = Encoding {
/// The %s encoding.
///
+%s///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;
-''' % (to_dom_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), to_constant_name(name), to_constant_name(name)))
+''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name)))
label_file.write("""static LABELS_SORTED: [&'static str; %d] = [
""" % len(labels))
diff --git a/src/euc_jp.rs b/src/euc_jp.rs
index 9857989..ea9d515 100644
--- a/src/euc_jp.rs
+++ b/src/euc_jp.rs
@@ -286,7 +286,8 @@ impl EucJpEncoder {
let lead = (pointer / 94) + 0xA1;
let trail = (pointer % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
- } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929
+ } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
+ || bmp == 0xF929
|| bmp == 0xF9DC
{
// Guaranteed to be found in IBM_KANJI
diff --git a/src/euc_kr.rs b/src/euc_kr.rs
index d27a1ef..51939d1 100644
--- a/src/euc_kr.rs
+++ b/src/euc_kr.rs
@@ -205,7 +205,8 @@ fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
return Some((0x81 + 0x25, 0xA1 + pos));
}
}
- if in_inclusive_range16(bmp, 0x2015, 0x266D) || in_inclusive_range16(bmp, 0x321C, 0x33D8)
+ if in_inclusive_range16(bmp, 0x2015, 0x266D)
+ || in_inclusive_range16(bmp, 0x321C, 0x33D8)
|| in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
|| in_inclusive_range16(bmp, 0x00A1, 0x00F7)
|| in_inclusive_range16(bmp, 0x02C7, 0x02DD)
diff --git a/src/handles.rs b/src/handles.rs
index be481c5..5b46d14 100644
--- a/src/handles.rs
+++ b/src/handles.rs
@@ -1477,12 +1477,14 @@ impl<'a> Utf8Source<'a> {
return unsafe { ::std::mem::transmute(point) };
}
if unit < 0xF0u32 {
- let point = ((unit & 0xFu32) << 12) | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
+ let point = ((unit & 0xFu32) << 12)
+ | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 2] as u32 & 0x3Fu32);
self.pos += 3;
return unsafe { ::std::mem::transmute(point) };
}
- let point = ((unit & 0x7u32) << 18) | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
+ let point = ((unit & 0x7u32) << 18)
+ | ((self.slice[self.pos + 1] as u32 & 0x3Fu32) << 12)
| ((self.slice[self.pos + 2] as u32 & 0x3Fu32) << 6)
| (self.slice[self.pos + 3] as u32 & 0x3Fu32);
self.pos += 4;
diff --git a/src/iso_2022_jp.rs b/src/iso_2022_jp.rs
index 32a088a..23c53ff 100644
--- a/src/iso_2022_jp.rs
+++ b/src/iso_2022_jp.rs
@@ -667,7 +667,8 @@ impl Iso2022JpEncoder {
let trail = (pointer % 94) + 0x21;
handle.write_two(lead as u8, trail as u8);
continue;
- } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929
+ } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
+ || bmp == 0xF929
|| bmp == 0xF9DC
{
// Guaranteed to be found in IBM_KANJI
diff --git a/src/lib.rs b/src/lib.rs
index dd608c7..e2e9af2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -561,8 +561,14 @@
//!
| Encoding | Code Page | PUA | Remarks |
//!
//!
+//! | Shift_JIS | 932 | | |
+//! | GBK | 936 | | |
+//! | EUC-KR | 949 | | |
+//! | Big5 | 950 | | |
//! | IBM866 | 866 | | |
//! | windows-874 | 874 | • | |
+//! | UTF-16LE | 1200 | | |
+//! | UTF-16BE | 1201 | | |
//! | windows-1250 | 1250 | | |
//! | windows-1251 | 1251 | | |
//! | windows-1252 | 1252 | | |
@@ -575,6 +581,7 @@
//! | macintosh | 10000 | | 1 |
//! | x-mac-cyrillic | 10017 | | 2 |
//! | KOI8-R | 20866 | | |
+//! | EUC-JP | 20932 | | |
//! | KOI8-U | 21866 | | |
//! | ISO-8859-2 | 28592 | | |
//! | ISO-8859-3 | 28593 | | |
@@ -586,6 +593,9 @@
//! | ISO-8859-13 | 28603 | • | |
//! | ISO-8859-15 | 28605 | | |
//! | ISO-8859-8-I | 38598 | | 5 |
+//! | ISO-2022-JP | 50220 | | |
+//! | gb18030 | 54936 | | |
+//! | UTF-8 | 65001 | | |
//!
//!
//!
@@ -739,7 +749,7 @@ const NCR_EXTRA: usize = 10; //
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
-/// The initializer for the Big5 encoding.
+/// The initializer for the [Big5](static.BIG5.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -758,13 +768,30 @@ pub static BIG5_INIT: Encoding = Encoding {
/// The Big5 encoding.
///
+/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
+/// instead of the Private Use Area code points that have been used historically.
+/// It is believed to be able to decode existing Web content in a way that makes
+/// sense.
+///
+/// To avoid form submissions generating data that Web servers don't understand,
+/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
+/// Big5 in the lexical order.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
+///
+/// This encoding is designed to be suited for decoding the Windows code page 950
+/// and its HKSCS patched "951" variant such that the text makes sense, given
+/// assignments that Unicode has made after those encodings used Private Use
+/// Area characters.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static BIG5: &'static Encoding = &BIG5_INIT;
-/// The initializer for the EUC-JP encoding.
+/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -783,13 +810,26 @@ pub static EUC_JP_INIT: Encoding = Encoding {
/// The EUC-JP encoding.
///
+/// This is the legacy Unix encoding for Japanese.
+///
+/// For compatibility with Web servers that don't expect three-byte sequences
+/// in form submissions, the encoder doesn't generate three-byte sequences.
+/// That is, the JIS X 0212 support is decode-only.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 20932. There are error
+/// handling differences and a handful of 2-byte sequences that decode differently.
+/// Additionally, Windows doesn't support 3-byte sequences.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
-/// The initializer for the EUC-KR encoding.
+/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -808,13 +848,24 @@ pub static EUC_KR_INIT: Encoding = Encoding {
/// The EUC-KR encoding.
///
+/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
+/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
+/// Classic), with all the characters from the Hangul Syllables block of Unicode.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
+///
+/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
+/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
+/// the question mark or the Private Use Area.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
-/// The initializer for the GBK encoding.
+/// The initializer for the [GBK](static.GBK.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -833,13 +884,30 @@ pub static GBK_INIT: Encoding = Encoding {
/// The GBK encoding.
///
+/// The decoder for this encoding is the same as the decoder for gb18030.
+/// The encoder side of this encoding is GBK with Windows code page 936 euro
+/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
+/// Unicode block as well as a handful of ideographs from the CJK Unified
+/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
+///
+/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
+/// unified with the gb18030 encoder in the Encoding Standard out of concern
+/// that servers that expect GBK form submissions might not be able to handle
+/// the four-byte sequences.
+///
+/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
+/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
+///
+/// The encoder of this encoding roughly matches the Windows code page 936.
+/// The decoder side is a superset.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static GBK: &'static Encoding = &GBK_INIT;
-/// The initializer for the IBM866 encoding.
+/// The initializer for the [IBM866](static.IBM866.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -858,13 +926,22 @@ pub static IBM866_INIT: Encoding = Encoding {
/// The IBM866 encoding.
///
+/// This is the most notable one of the DOS Cyrillic code pages. It has the same
+/// box drawing characters as code page 437, so it can be used for decoding
+/// DOS-era ASCII + box drawing data.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
+///
+/// This encoding matches the Windows code page 866.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static IBM866: &'static Encoding = &IBM866_INIT;
-/// The initializer for the ISO-2022-JP encoding.
+/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -883,13 +960,24 @@ pub static ISO_2022_JP_INIT: Encoding = Encoding {
/// The ISO-2022-JP encoding.
///
+/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
+/// byte range to encode non-Basic Latin characters. It's the only encoding
+/// supported by this crate whose encoder is stateful.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 50220. Notably, Windows
+/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
+/// error handling.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
-/// The initializer for the ISO-8859-10 encoding.
+/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -908,13 +996,22 @@ pub static ISO_8859_10_INIT: Encoding = Encoding {
/// The ISO-8859-10 encoding.
///
+/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
+/// is also known as Latin 6.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
+///
+/// The Windows code page number for this encoding is 28600, but kernel32.dll
+/// does not support this encoding.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
-/// The initializer for the ISO-8859-13 encoding.
+/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -933,13 +1030,22 @@ pub static ISO_8859_13_INIT: Encoding = Encoding {
/// The ISO-8859-13 encoding.
///
+/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
+/// is also known as Latin 7.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
+///
+/// This encoding matches the Windows code page 28603, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
-/// The initializer for the ISO-8859-14 encoding.
+/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -958,13 +1064,22 @@ pub static ISO_8859_14_INIT: Encoding = Encoding {
/// The ISO-8859-14 encoding.
///
+/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
+/// is also known as Latin 8.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
+///
+/// The Windows code page number for this encoding is 28604, but kernel32.dll
+/// does not support this encoding.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
-/// The initializer for the ISO-8859-15 encoding.
+/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -983,13 +1098,21 @@ pub static ISO_8859_15_INIT: Encoding = Encoding {
/// The ISO-8859-15 encoding.
///
+/// This is the revised Western European part of the ISO/IEC 8859 encoding
+/// family. This encoding is also known as Latin 9.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
+///
+/// This encoding matches the Windows code page 28605.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
-/// The initializer for the ISO-8859-16 encoding.
+/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1008,13 +1131,22 @@ pub static ISO_8859_16_INIT: Encoding = Encoding {
/// The ISO-8859-16 encoding.
///
+/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
+/// family. This encoding is also known as Latin 10.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
+///
+/// The Windows code page number for this encoding is 28606, but kernel32.dll
+/// does not support this encoding.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
-/// The initializer for the ISO-8859-2 encoding.
+/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1033,13 +1165,20 @@ pub static ISO_8859_2_INIT: Encoding = Encoding {
/// The ISO-8859-2 encoding.
///
+/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
+///
+/// This encoding matches the Windows code page 28592.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
-/// The initializer for the ISO-8859-3 encoding.
+/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1058,13 +1197,20 @@ pub static ISO_8859_3_INIT: Encoding = Encoding {
/// The ISO-8859-3 encoding.
///
+/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
+///
+/// This encoding matches the Windows code page 28593.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
-/// The initializer for the ISO-8859-4 encoding.
+/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1083,13 +1229,20 @@ pub static ISO_8859_4_INIT: Encoding = Encoding {
/// The ISO-8859-4 encoding.
///
+/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
+///
+/// This encoding matches the Windows code page 28594.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
-/// The initializer for the ISO-8859-5 encoding.
+/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1108,13 +1261,20 @@ pub static ISO_8859_5_INIT: Encoding = Encoding {
/// The ISO-8859-5 encoding.
///
+/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
+///
+/// This encoding matches the Windows code page 28595.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
-/// The initializer for the ISO-8859-6 encoding.
+/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1133,13 +1293,21 @@ pub static ISO_8859_6_INIT: Encoding = Encoding {
/// The ISO-8859-6 encoding.
///
+/// This is the Arabic part of the ISO/IEC 8859 encoding family.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
+///
+/// This encoding matches the Windows code page 28596, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
-/// The initializer for the ISO-8859-7 encoding.
+/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1158,13 +1326,25 @@ pub static ISO_8859_7_INIT: Encoding = Encoding {
/// The ISO-8859-7 encoding.
///
+/// This is the Greek part of the ISO/IEC 8859 encoding family.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 28597. Windows decodes
+/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
+/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
+/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
+/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
+/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
-/// The initializer for the ISO-8859-8 encoding.
+/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1183,13 +1363,23 @@ pub static ISO_8859_8_INIT: Encoding = Encoding {
/// The ISO-8859-8 encoding.
///
+/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 28598. Windows decodes
+/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
+/// Area instead of LRM and RLM. Windows decodes unassigned code points to
+/// the private use area.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
-/// The initializer for the ISO-8859-8-I encoding.
+/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1208,13 +1398,23 @@ pub static ISO_8859_8_I_INIT: Encoding = Encoding {
/// The ISO-8859-8-I encoding.
///
+/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
+///
+/// This encoding roughly matches the Windows code page 38598. Windows decodes
+/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
+/// Area instead of LRM and RLM. Windows decodes unassigned code points to
+/// the private use area.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
-/// The initializer for the KOI8-R encoding.
+/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1233,13 +1433,20 @@ pub static KOI8_R_INIT: Encoding = Encoding {
/// The KOI8-R encoding.
///
+/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
+///
+/// This encoding matches the Windows code page 20866.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
-/// The initializer for the KOI8-U encoding.
+/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1258,13 +1465,20 @@ pub static KOI8_U_INIT: Encoding = Encoding {
/// The KOI8-U encoding.
///
+/// This is an encoding for Ukrainian adapted from KOI8-R.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
+///
+/// This encoding matches the Windows code page 21866.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
-/// The initializer for the Shift_JIS encoding.
+/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1283,13 +1497,22 @@ pub static SHIFT_JIS_INIT: Encoding = Encoding {
/// The Shift_JIS encoding.
///
+/// This is the Japanese encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
+///
+/// This encoding matches the Windows code page 932, except Windows decodes some byte
+/// sequences that are errors per the Encoding Standard to the question mark or the
+/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
-/// The initializer for the UTF-16BE encoding.
+/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1308,13 +1531,22 @@ pub static UTF_16BE_INIT: Encoding = Encoding {
/// The UTF-16BE encoding.
///
+/// This decode-only encoding uses 16-bit code units due to Unicode originally
+/// having been designed as a 16-bit repertoire. In the absence of a byte order
+/// mark the big endian byte order is assumed.
+///
+/// There is no corresponding encoder in this crate or in the Encoding
+/// Standard. The output encoding of this encoding is UTF-8.
+///
+/// This encoding matches the Windows code page 1201.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
-/// The initializer for the UTF-16LE encoding.
+/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1333,13 +1565,22 @@ pub static UTF_16LE_INIT: Encoding = Encoding {
/// The UTF-16LE encoding.
///
+/// This decode-only encoding uses 16-bit code units due to Unicode originally
+/// having been designed as a 16-bit repertoire. In the absence of a byte order
+/// mark the little endian byte order is assumed.
+///
+/// There is no corresponding encoder in this crate or in the Encoding
+/// Standard. The output encoding of this encoding is UTF-8.
+///
+/// This encoding matches the Windows code page 1200.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
-/// The initializer for the UTF-8 encoding.
+/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1358,13 +1599,19 @@ pub static UTF_8_INIT: Encoding = Encoding {
/// The UTF-8 encoding.
///
+/// This is the encoding that should be used for all new development. It can
+/// represent all of Unicode.
+///
+/// This encoding matches the Windows code page 65001, except Windows differs
+/// in the number of errors generated for some erroneous byte sequences.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static UTF_8: &'static Encoding = &UTF_8_INIT;
-/// The initializer for the gb18030 encoding.
+/// The initializer for the [gb18030](static.GB18030.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1383,13 +1630,23 @@ pub static GB18030_INIT: Encoding = Encoding {
/// The gb18030 encoding.
///
+/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
+/// maps to U+3000 for compatibility with existing Web content. As a result,
+/// this encoding can represent all of Unicode except for the private-use
+/// character U+E5E5.
+///
+/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
+/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
+///
+/// This encoding matches the Windows code page 54936.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static GB18030: &'static Encoding = &GB18030_INIT;
-/// The initializer for the macintosh encoding.
+/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1408,13 +1665,21 @@ pub static MACINTOSH_INIT: Encoding = Encoding {
/// The macintosh encoding.
///
+/// This is the MacRoman encoding from Mac OS Classic.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
+///
+/// This encoding matches the Windows code page 10000, except Windows decodes
+/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
-/// The initializer for the replacement encoding.
+/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1433,13 +1698,24 @@ pub static REPLACEMENT_INIT: Encoding = Encoding {
/// The replacement encoding.
///
+/// This decode-only encoding decodes all non-zero-length streams to a single
+/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
+/// ASCII-compatible fallback encoding (typically windows-1252) for some
+/// encodings that are no longer supported by the Web Platform and that
+/// would be dangerous to treat as ASCII-compatible.
+///
+/// There is no corresponding encoder. The output encoding of this encoding
+/// is UTF-8.
+///
+/// This encoding does not have a Windows code page number.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
-/// The initializer for the windows-1250 encoding.
+/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1458,13 +1734,20 @@ pub static WINDOWS_1250_INIT: Encoding = Encoding {
/// The windows-1250 encoding.
///
+/// This is the Central European encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
+///
+/// This encoding matches the Windows code page 1250.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
-/// The initializer for the windows-1251 encoding.
+/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1483,13 +1766,20 @@ pub static WINDOWS_1251_INIT: Encoding = Encoding {
/// The windows-1251 encoding.
///
+/// This is the Cyrillic encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
+///
+/// This encoding matches the Windows code page 1251.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
-/// The initializer for the windows-1252 encoding.
+/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1508,13 +1798,21 @@ pub static WINDOWS_1252_INIT: Encoding = Encoding {
/// The windows-1252 encoding.
///
+/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
+/// which is known as Latin 1.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
+///
+/// This encoding matches the Windows code page 1252.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
-/// The initializer for the windows-1253 encoding.
+/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1533,13 +1831,22 @@ pub static WINDOWS_1253_INIT: Encoding = Encoding {
/// The windows-1253 encoding.
///
+/// This is the Greek encoding for Windows. It is mostly an extension of
+/// ISO-8859-7, but U+0386 is mapped to a different byte.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
+///
+/// This encoding matches the Windows code page 1253, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
-/// The initializer for the windows-1254 encoding.
+/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1558,13 +1865,21 @@ pub static WINDOWS_1254_INIT: Encoding = Encoding {
/// The windows-1254 encoding.
///
+/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
+/// which is known as Latin 5.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
+///
+/// This encoding matches the Windows code page 1254.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
-/// The initializer for the windows-1255 encoding.
+/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1583,13 +1898,22 @@ pub static WINDOWS_1255_INIT: Encoding = Encoding {
/// The windows-1255 encoding.
///
+/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
+/// except for a currency sign swap.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
+///
+/// This encoding matches the Windows code page 1255, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
-/// The initializer for the windows-1256 encoding.
+/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1608,13 +1932,20 @@ pub static WINDOWS_1256_INIT: Encoding = Encoding {
/// The windows-1256 encoding.
///
+/// This is the Arabic encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
+///
+/// This encoding matches the Windows code page 1256.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
-/// The initializer for the windows-1257 encoding.
+/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1633,13 +1964,21 @@ pub static WINDOWS_1257_INIT: Encoding = Encoding {
/// The windows-1257 encoding.
///
+/// This is the Baltic encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
+///
+/// This encoding matches the Windows code page 1257, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
-/// The initializer for the windows-1258 encoding.
+/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1658,13 +1997,25 @@ pub static WINDOWS_1258_INIT: Encoding = Encoding {
/// The windows-1258 encoding.
///
+/// This is the Vietnamese encoding for Windows.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
+///
+/// This encoding matches the Windows code page 1258 when used in the
+/// non-normalizing mode. Unlike with the other single-byte encodings, the
+/// result of decoding is not necessarily in Normalization Form C. Conversely,
+/// input in Normalization Form C is not necessarily encodable without
+/// replacement. In general, it's a bad idea to encode to encodings other
+/// than UTF-8, but this encoding is especially hazardous to encode to.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
-/// The initializer for the windows-874 encoding.
+/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1683,13 +2034,21 @@ pub static WINDOWS_874_INIT: Encoding = Encoding {
/// The windows-874 encoding.
///
+/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
+///
+/// This encoding matches the Windows code page 874, except Windows decodes
+/// unassigned code points to the Private Use Area of Unicode.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
-/// The initializer for the x-mac-cyrillic encoding.
+/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1708,13 +2067,20 @@ pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
/// The x-mac-cyrillic encoding.
///
+/// This is the MacUkrainian encoding from Mac OS Classic.
+///
+/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
+/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
+///
+/// This encoding matches the Windows code page 10017.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
-/// The initializer for the x-user-defined encoding.
+/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
@@ -1733,6 +2099,13 @@ pub static X_USER_DEFINED_INIT: Encoding = Encoding {
/// The x-user-defined encoding.
///
+/// This encoding offsets the non-ASCII bytes by `0xF700`, thereby decoding
+/// them to the Private Use Area of Unicode. It was used for loading binary
+/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
+/// the `"arraybuffer"` response type.
+///
+/// This encoding does not have a Windows code page number.
+///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
@@ -3347,7 +3720,8 @@ impl Decoder {
| DecoderLifeCycle::AtUtf8Start
| DecoderLifeCycle::AtUtf16LeStart
| DecoderLifeCycle::AtUtf16BeStart => {
- return self.variant
+ return self
+ .variant
.max_utf8_buffer_length_without_replacement(byte_length)
}
DecoderLifeCycle::AtStart => {
@@ -3362,7 +3736,8 @@ impl Decoder {
// No need to consider the internal state of the underlying decoder,
// because it is at start, because no data has reached it yet.
return Some(utf_bom);
- } else if let Some(non_bom) = self.variant
+ } else if let Some(non_bom) = self
+ .variant
.max_utf8_buffer_length_without_replacement(byte_length)
{
return Some(std::cmp::max(utf_bom, non_bom));
diff --git a/src/mem.rs b/src/mem.rs
index 7e84ecb..81c5b6e 100644
--- a/src/mem.rs
+++ b/src/mem.rs
@@ -195,9 +195,8 @@ macro_rules! by_unit_check_simd {
}
let mut simd_accu = $splat;
while offset <= len_minus_stride {
- simd_accu = simd_accu | unsafe {
- *(src.offset(offset as isize) as *const $simd_ty)
- };
+ simd_accu = simd_accu
+ | unsafe { *(src.offset(offset as isize) as *const $simd_ty) };
offset += SIMD_STRIDE_SIZE / unit_size;
}
if !$func(simd_accu) {
@@ -1279,7 +1278,9 @@ pub fn is_char_bidi(c: char) -> bool {
// Above Arabic Extended-A and below Arabic Presentation Forms
if in_inclusive_range32(code_point, 0x200F, 0x2067) {
// In the range that contains the RTL controls
- return code_point == 0x200F || code_point == 0x202B || code_point == 0x202E
+ return code_point == 0x200F
+ || code_point == 0x202B
+ || code_point == 0x202E
|| code_point == 0x2067;
}
return false;
@@ -1514,7 +1515,8 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
// Three-byte
let second = bytes[read + 1];
let third = bytes[read + 2];
- let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6)
+ let point = (((byte as u32) & 0xFu32) << 12)
+ | ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read += 3;
@@ -1524,7 +1526,8 @@ pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
let second = bytes[read + 1];
let third = bytes[read + 2];
let fourth = bytes[read + 3];
- let point = (((byte as u32) & 0x7u32) << 18) | ((second as u32 & 0x3Fu32) << 12)
+ let point = (((byte as u32) & 0x7u32) << 18)
+ | ((second as u32 & 0x3Fu32) << 12)
| ((third as u32 & 0x3Fu32) << 6)
| (fourth as u32 & 0x3Fu32);
dst[written] = (0xD7C0 + (point >> 10)) as u16;
diff --git a/src/shift_jis.rs b/src/shift_jis.rs
index e93ae2c..1aea7c3 100644
--- a/src/shift_jis.rs
+++ b/src/shift_jis.rs
@@ -248,7 +248,8 @@ impl ShiftJisEncoder {
10716 + bmp_minus_roman as usize
} else if let Some(pointer) = jis0208_range_encode(bmp) {
pointer
- } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929
+ } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
+ || bmp == 0xF929
|| bmp == 0xF9DC
{
// Guaranteed to be found in IBM_KANJI
diff --git a/src/simd_funcs.rs b/src/simd_funcs.rs
index 3c2ec34..e90343a 100644
--- a/src/simd_funcs.rs
+++ b/src/simd_funcs.rs
@@ -277,10 +277,15 @@ pub fn is_u16x8_bidi(s: u16x8) -> bool {
// Quick refutation failed. Let's do the full check.
- (in_range16x8!(s, 0x0590, 0x0900) | in_range16x8!(s, 0xFB50, 0xFE00)
- | in_range16x8!(s, 0xFE70, 0xFF00) | in_range16x8!(s, 0xD802, 0xD804)
- | in_range16x8!(s, 0xD83A, 0xD83C) | s.eq(u16x8::splat(0x200F))
- | s.eq(u16x8::splat(0x202B)) | s.eq(u16x8::splat(0x202E)) | s.eq(u16x8::splat(0x2067)))
+ (in_range16x8!(s, 0x0590, 0x0900)
+ | in_range16x8!(s, 0xFB50, 0xFE00)
+ | in_range16x8!(s, 0xFE70, 0xFF00)
+ | in_range16x8!(s, 0xD802, 0xD804)
+ | in_range16x8!(s, 0xD83A, 0xD83C)
+ | s.eq(u16x8::splat(0x200F))
+ | s.eq(u16x8::splat(0x202B))
+ | s.eq(u16x8::splat(0x202E))
+ | s.eq(u16x8::splat(0x2067)))
.any()
}
diff --git a/src/utf_16.rs b/src/utf_16.rs
index f3ec16b..8f82010 100644
--- a/src/utf_16.rs
+++ b/src/utf_16.rs
@@ -29,11 +29,9 @@ impl Utf16Decoder {
}
pub fn additional_from_state(&self) -> usize {
- 1 + if self.lead_byte.is_some() { 1 } else { 0 } + if self.lead_surrogate == 0 {
- 0
- } else {
- 2
- }
+ 1
+ + if self.lead_byte.is_some() { 1 } else { 0 }
+ + if self.lead_surrogate == 0 { 0 } else { 2 }
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
diff --git a/src/utf_8.rs b/src/utf_8.rs
index f9c02d3..db5c62a 100644
--- a/src/utf_8.rs
+++ b/src/utf_8.rs
@@ -372,7 +372,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz
{
break 'outer;
}
- let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6)
+ let point = (((byte as u32) & 0xFu32) << 12)
+ | ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read = new_read;
@@ -393,7 +394,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz
{
break 'outer;
}
- let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6)
+ let point = (((byte as u32) & 0xFu32) << 12)
+ | ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read = new_read;
@@ -414,7 +416,8 @@ pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usiz
{
break 'outer;
}
- let point = (((byte as u32) & 0xFu32) << 12) | ((second as u32 & 0x3Fu32) << 6)
+ let point = (((byte as u32) & 0xFu32) << 12)
+ | ((second as u32 & 0x3Fu32) << 6)
| (third as u32 & 0x3Fu32);
dst[written] = point as u16;
read = new_read;