[libc++][format] Improves Unicode decoders.

During the implementation of P2286 a second Unicode decoder was added.
The original decoder was only used for the width estimation. Changing
an ill-formed Unicode sequence to the replacement character, works
properly for this use case. For P2286 an ill-formed Unicode sequence
needs to be formatted as a sequence of code units. The exact wording in
the Standard was a bit unclear and there was an odd example in the WP. This
made it hard to use the same decoder. SG16 determined the odd example in
the WP was a bug and this has been fixed in the WP.

This made it possible to combine the two decoders. The P2286 decoder
kept track of the size of the ill-formed sequence. However this was not
needed since the output algorithm needs to keep track of the size of a
well-formed and an ill-formed sequence. So this feature has been
removed.

The error status remains since it's needed for P2286; the grapheme
clustering can ignore this unneeded value. (In general, grapheme
clustering only has specified behaviour for Unicode. When the string
is in a non-Unicode encoding there are no requirements. Ill-formed
Unicode is a non-Unicode encoding. Still libc++ does a best effort
estimation.)

The UTF-8 decoder accepted several ill-formed sequences:
- Values in the surrogate range U+D800..U+DFFF.
- Values encoded in more code units than required, for example U+0020
  in theory can be encoded using 1, 2, 3, or 4 code units; all of these
  were accepted. This is not allowed by the Unicode Standard.
- Values larger than U+10FFFF were not always rejected.

Reviewed By: #libc, ldionne, tahonermann, Mordante

Differential Revision: https://reviews.llvm.org/D144346
This commit is contained in:
Mark de Wever 2023-02-09 21:38:42 +01:00
parent 95bc01dbec
commit c866855b42
3 changed files with 202 additions and 206 deletions

View File

@ -12,6 +12,7 @@
#include <__algorithm/ranges_copy.h>
#include <__algorithm/ranges_fill_n.h>
#include <__algorithm/ranges_for_each.h>
#include <__algorithm/ranges_transform.h>
#include <__chrono/statically_widen.h>
#include <__concepts/same_as.h>
@ -503,36 +504,17 @@ __escape(basic_string<_CharT>& __str, basic_string_view<_CharT> __values, __esca
__unicode::__code_point_view<_CharT> __view{__values.begin(), __values.end()};
while (!__view.__at_end()) {
auto __first = __view.__position();
typename __unicode::__consume_p2286_result __result = __view.__consume_p2286();
if (__result.__ill_formed_size == 0) {
if (!__formatter::__is_escaped_sequence_written(__str, __result.__value, __mark))
auto __first = __view.__position();
typename __unicode::__consume_result __result = __view.__consume();
if (__result.__status == __unicode::__consume_result::__ok) {
if (!__formatter::__is_escaped_sequence_written(__str, __result.__code_point, __mark))
// 2.2.1.3 - Add the character
ranges::copy(__first, __view.__position(), std::back_insert_iterator(__str));
} else {
// 2.2.3 sequence of ill-formed code units
// The number of code-units in __result.__value depends on the character type being used.
if constexpr (sizeof(_CharT) == 1) {
_LIBCPP_ASSERT(__result.__ill_formed_size == 1 || __result.__ill_formed_size == 4,
"illegal number of invalid code units.");
if (__result.__ill_formed_size == 1) // ill-formed, one code unit
__formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xff);
else { // out of valid range, four code units
// The code point was properly encoded, decode the value.
__formatter::__write_escape_ill_formed_code_unit(__str, __result.__value >> 18 | 0xf0);
__formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 12 & 0x3f) | 0x80);
__formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 6 & 0x3f) | 0x80);
__formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value & 0x3f) | 0x80);
}
} else if constexpr (sizeof(_CharT) == 2) {
_LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-16 at most one invalid code unit");
__formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xffff);
} else {
static_assert(sizeof(_CharT) == 4, "unsupported character width");
_LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-32 one code unit is one code point");
__formatter::__write_escape_ill_formed_code_unit(__str, __result.__value);
}
ranges::for_each(__first, __view.__position(), [&](_CharT __value) {
__formatter::__write_escape_ill_formed_code_unit(__str, __formatter::__to_char32(__value));
});
}
}
}

View File

@ -31,23 +31,28 @@ _LIBCPP_BEGIN_NAMESPACE_STD
namespace __unicode {
# if _LIBCPP_STD_VER >= 23
// Helper struct for the result of a consume operation.
//
// The status value for a correct code point is 0. This allows a valid value to
// be used without masking.
// When the decoding fails it knows the number of code units affected. For the
// current use-cases that value is not needed, therefore it is not stored.
// The escape routine needs the number of code units for both a valid and
// invalid character and keeps track of it itself. Doing it in this result
// unconditionally would give some overhead when the value is unneeded.
struct __consume_result {
// When __status == __ok it contains the decoded code point.
// Else it contains the replacement character U+FFFD
char32_t __code_point : 31;
/// The result of consuming a code point using P2286' semantics
///
/// TODO FMT Combine __consume and __consume_p2286 in one function.
struct __consume_p2286_result {
// A size of 0 means well formed. This to differenciate between
// a valid code point and a code unit that's invalid like 0b11111xxx.
int __ill_formed_size;
// If well formed the consumed code point.
// Otherwise the ill-formed code units as unsigned 8-bit values. They are
// stored in reverse order, to make it easier to extract the values.
char32_t __value;
enum : char32_t {
// Consumed a well-formed code point.
__ok = 0,
// Encountered invalid UTF-8
__error = 1
} __status : 1 {__ok};
};
# endif // _LIBCPP_STD_VER >= 23
static_assert(sizeof(__consume_result) == sizeof(char32_t));
# ifndef _LIBCPP_HAS_NO_UNICODE
@ -66,6 +71,36 @@ struct __consume_p2286_result {
inline constexpr char32_t __replacement_character = U'\ufffd';
// The error of a consume operation.
//
// This sets the code point to the replacement character. This code point does
// not participate in the grapheme clustering, so grapheme clustering code can
// ignore the error status and always use the code point.
inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
// Returns true when __value lies in the range U+D800..U+DBFF (both bounds
// inclusive), i.e. the values tested for the first half of a UTF-16 pair.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
  constexpr char32_t __lowest  = 0xd800;
  constexpr char32_t __highest = 0xdbff;
  return !(__value < __lowest || __highest < __value);
}
// Returns true when __value lies in the range U+DC00..U+DFFF (both bounds
// inclusive), i.e. the values tested for the second half of a UTF-16 pair.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
  constexpr char32_t __lowest  = 0xdc00;
  constexpr char32_t __highest = 0xdfff;
  return !(__value < __lowest || __highest < __value);
}
// https://www.unicode.org/glossary/#surrogate_code_point
//
// Returns true when __value lies in the range U+D800..U+DFFF (both bounds
// inclusive).
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
  constexpr char32_t __lowest  = 0xd800;
  constexpr char32_t __highest = 0xdfff;
  return !(__value < __lowest || __highest < __value);
}
// https://www.unicode.org/glossary/#code_point
//
// Returns true when __value does not exceed U+10FFFF.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
  constexpr char32_t __largest = 0x10ffff;
  return !(__largest < __value);
}
// https://www.unicode.org/glossary/#unicode_scalar_value
//
// Returns true when __value passes __is_code_point and is rejected by
// __is_surrogate. The surrogate test is only evaluated for values that pass
// the code point test.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
  if (!__unicode::__is_code_point(__value))
    return false;
  return !__unicode::__is_surrogate(__value);
}
template <contiguous_iterator _Iterator>
requires same_as<iter_value_t<_Iterator>, char>
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
@ -97,61 +132,29 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
_LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
// Based on the number of leading 1 bits the number of code units in the
// code point can be determined. See
// https://en.wikipedia.org/wiki/UTF-8#Encoding
switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
case 0:
return *__first_++;
case 2:
if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
break;
else {
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
return __value;
}
case 3:
if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
break;
else {
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
return __value;
}
case 4:
if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
break;
else {
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
return __value;
}
}
// An invalid number of leading ones can be garbage or a code unit in the
// middle of a code point. By consuming one code unit the parser may get
// "in sync" after a few code units.
++__first_;
return __replacement_character;
}
# if _LIBCPP_STD_VER >= 23
_LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
// https://www.unicode.org/versions/latest/ch03.pdf#G7404
// Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
//
// Code Points First Byte Second Byte Third Byte Fourth Byte Remarks
// U+0000..U+007F 00..7F U+0000..U+007F 1 code unit range
// C0..C1 80..BF invalid overlong encoding
// U+0080..U+07FF C2..DF 80..BF U+0080..U+07FF 2 code unit range
// E0 80..9F 80..BF invalid overlong encoding
// U+0800..U+0FFF E0 A0..BF 80..BF U+0800..U+FFFF 3 code unit range
// U+1000..U+CFFF E1..EC 80..BF 80..BF
// U+D000..U+D7FF ED 80..9F 80..BF
// U+D800..U+DFFF ED A0..BF 80..BF invalid encoding of surrogate code point
// U+E000..U+FFFF EE..EF 80..BF 80..BF
// F0 80..8F 80..BF 80..BF invalid overlong encoding
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF U+10000..U+10FFFF 4 code unit range
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
// F4 90..BF 80..BF 80..BF U+110000.. invalid code point range
//
// Unlike other parsers, these invalid entries are tested after decoding.
// - The parser always needs to consume these code units
// - The code is optimized for well-formed UTF-8
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
// Based on the number of leading 1 bits the number of code units in the
@ -159,60 +162,73 @@ public:
// https://en.wikipedia.org/wiki/UTF-8#Encoding
switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
case 0:
return {0, static_cast<unsigned char>(*__first_++)};
return {static_cast<unsigned char>(*__first_++)};
case 2:
if (__last_ - __first_ < 2) [[unlikely]]
case 2: {
if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
break;
if (__unicode::__is_continuation(__first_ + 1, 1)) {
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
return {0, __value};
}
break;
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
case 3:
if (__last_ - __first_ < 3) [[unlikely]]
// These values should be encoded in 1 UTF-8 code unit.
if (__value < 0x0080) [[unlikely]]
return __consume_result_error;
return {__value};
}
case 3: {
if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
break;
if (__unicode::__is_continuation(__first_ + 1, 2)) {
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
return {0, __value};
}
break;
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
case 4:
if (__last_ - __first_ < 4) [[unlikely]]
// These values should be encoded in 1 or 2 UTF-8 code units.
if (__value < 0x0800) [[unlikely]]
return __consume_result_error;
// A surrogate value is always encoded in 3 UTF-8 code units.
if (__unicode::__is_surrogate(__value)) [[unlikely]]
return __consume_result_error;
return {__value};
}
case 4: {
if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
break;
if (__unicode::__is_continuation(__first_ + 1, 3)) {
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
__value <<= 6;
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
if (__value > 0x10FFFF) // Outside the valid Unicode range?
return {4, __value};
// These values should be encoded in 1, 2, or 3 UTF-8 code units.
if (__value < 0x10000) [[unlikely]]
return __consume_result_error;
return {0, __value};
}
break;
// A value too large is always encoded in 4 UTF-8 code units.
if (!__unicode::__is_code_point(__value)) [[unlikely]]
return __consume_result_error;
return {__value};
}
}
// An invalid number of leading ones can be garbage or a code unit in the
// middle of a code point. By consuming one code unit the parser may get
// "in sync" after a few code units.
return {1, static_cast<unsigned char>(*__first_++)};
++__first_;
return __consume_result_error;
}
# endif // _LIBCPP_STD_VER >= 23
private:
_Iterator __first_;
@ -244,62 +260,33 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
_LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
char32_t __value = static_cast<char32_t>(*__first_++);
if constexpr (sizeof(wchar_t) == 2) {
char32_t __result = *__first_++;
// Is the code unit part of a surrogate pair? See
// https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
if (__result >= 0xd800 && __result <= 0xDfff) {
// Malformed Unicode.
if (__first_ == __last_) [[unlikely]]
return __replacement_character;
if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
return __consume_result_error;
__result -= 0xd800;
__result <<= 10;
__result += *__first_++ - 0xdc00;
__result += 0x10000;
if (__unicode::__is_high_surrogate(__value)) {
if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
return __consume_result_error;
__value -= 0xd800;
__value <<= 10;
__value += static_cast<char32_t>(*__first_++) - 0xdc00;
__value += 0x10000;
if (!__unicode::__is_code_point(__value)) [[unlikely]]
return __consume_result_error;
}
return __result;
} else if constexpr (sizeof(wchar_t) == 4) {
char32_t __result = *__first_++;
if (__result > 0x10FFFF) [[unlikely]]
return __replacement_character;
return __result;
} else {
__libcpp_unreachable();
}
}
# if _LIBCPP_STD_VER >= 23
_LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
char32_t __result = *__first_++;
if constexpr (sizeof(wchar_t) == 2) {
// https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
if (__is_surrogate_pair_high(__result)) {
// Malformed Unicode.
if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
return {1, __result};
__result -= 0xd800;
__result <<= 10;
__result += *__first_++ - 0xdc00;
__result += 0x10000;
} else if (__is_surrogate_pair_low(__result))
// A code point shouldn't start with the low surrogate pair
return {1, __result};
} else {
if (__result > 0x10FFFF) [[unlikely]]
return {1, __result};
if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
return __consume_result_error;
}
return {0, __result};
return {__value};
}
# endif // _LIBCPP_STD_VER >= 23
private:
_Iterator __first_;
@ -399,7 +386,7 @@ class __extended_grapheme_cluster_view {
public:
_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
: __code_point_view_(__first, __last),
__next_code_point_(__code_point_view_.__consume()),
__next_code_point_(__code_point_view_.__consume().__code_point),
__next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
struct __cluster {
@ -420,6 +407,7 @@ public:
_LIBCPP_ASSERT(
__next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
"can't move beyond the end of input");
char32_t __code_point = __next_code_point_;
if (!__code_point_view_.__at_end())
return {__code_point, __get_break()};
@ -444,7 +432,7 @@ private:
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
return __result;
}
__next_code_point_ = __code_point_view_.__consume();
__next_code_point_ = __code_point_view_.__consume().__code_point;
__next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
__has_extened_pictographic |=
@ -474,19 +462,11 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
_LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
return *__first_++;
return {static_cast<char32_t>(*__first_++)};
}
# if _LIBCPP_STD_VER >= 23
_LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
return {0, std::make_unsigned_t<_CharT>(*__first_++)};
}
# endif // _LIBCPP_STD_VER >= 23
private:
_Iterator __first_;
_Iterator __last_;

View File

@ -202,14 +202,9 @@ void test_char() {
test_format(V{L"'\\u{600}'"}, L"{:?}", L'\x600'); // ARABIC NUMBER SIGN
test_format(V{L"'\\u{feff}'"}, L"{:?}", L'\xfeff'); // ZERO WIDTH NO-BREAK SPACE
if constexpr (sizeof(CharT) == 2) {
// Incomplete surrogate pair in UTF-16
test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
} else {
test_format(V{L"'\\u{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
test_format(V{L"'\\u{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
}
// Incomplete surrogate pair in UTF-16
test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
// Private_Use
test_format(V{L"'\\u{e000}'"}, L"{:?}", L'\xe000'); // <private-use-E000>
@ -277,6 +272,48 @@ void test_string() {
// Ill-formed UTF-8
test_format(SV(R"(["\x{c3}"])"), SV("[{:?}]"), "\xc3");
test_format(SV(R"(["\x{c3}("])"), SV("[{:?}]"), "\xc3\x28");
/* U+0000..U+007F 1 code unit range, encoded in 2 code units. */
test_format(SV(R"(["\x{c0}\x{80}"])"), SV("[{:?}]"), "\xc0\x80"); // U+0000
test_format(SV(R"(["\x{c1}\x{bf}"])"), SV("[{:?}]"), "\xc1\xbf"); // U+007F
test_format(SV(R"(["\u{80}"])"), SV("[{:?}]"), "\xc2\x80"); // U+0080 first valid (General_Category=Control)
/* U+0000..U+07FF 1 and 2 code unit range, encoded in 3 code units. */
test_format(SV(R"(["\x{e0}\x{80}\x{80}"])"), SV("[{:?}]"), "\xe0\x80\x80"); // U+0000
test_format(SV(R"(["\x{e0}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xe0\x81\xbf"); // U+007F
test_format(SV(R"(["\x{e0}\x{82}\x{80}"])"), SV("[{:?}]"), "\xe0\x82\x80"); // U+0080
test_format(SV(R"(["\x{e0}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xe0\x9f\xbf"); // U+07FF
test_format(SV("[\"\u0800\"]"), SV("[{:?}]"), "\xe0\xa0\x80"); // U+0800 first valid
#if 0
// This code point is in the Hangul Jamo Extended-B block and at the time of writing
// it's unassigned. When it becomes defined, this branch might become true.
test_format(SV("[\"\ud7ff\"]"), SV("[{:?}]"), "\xed\x9f\xbf"); // U+D7FF last valid
#else
/* U+D800..U+DFFF surrogate range */
test_format(SV(R"(["\u{d7ff}"])"), SV("[{:?}]"), "\xed\x9f\xbf"); // U+D7FF last valid
#endif
test_format(SV(R"(["\x{ed}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xed\xa0\x80"); // U+D800
test_format(SV(R"(["\x{ed}\x{af}\x{bf}"])"), SV("[{:?}]"), "\xed\xaf\xbf"); // U+DBFF
test_format(SV(R"(["\x{ed}\x{bf}\x{80}"])"), SV("[{:?}]"), "\xed\xbf\x80"); // U+DC00
test_format(SV(R"(["\x{ed}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xed\xbf\xbf"); // U+DFFF
test_format(SV(R"(["\u{e000}"])"), SV("[{:?}]"), "\xee\x80\x80"); // U+E000 first valid
// (in the Private Use Area block)
/* U+0000..U+FFFF 1, 2, and 3 code unit range, encoded in 4 code units. */
test_format(SV(R"(["\x{f0}\x{80}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x80\x80"); // U+0000
test_format(SV(R"(["\x{f0}\x{80}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x81\xbf"); // U+007F
test_format(SV(R"(["\x{f0}\x{80}\x{82}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x82\x80"); // U+0080
test_format(SV(R"(["\x{f0}\x{80}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x9f\xbf"); // U+07FF
test_format(SV(R"(["\x{f0}\x{80}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\xa0\x80"); // U+0800
test_format(SV(R"(["\x{f0}\x{8f}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf0\x8f\xbf\xbf"); // U+FFFF
test_format(SV("[\"\U00010000\"]"), SV("[{:?}]"), "\xf0\x90\x80\x80"); // U+10000 first valid
/* U+110000..U+1FFFFF invalid range */
test_format(SV(R"(["\u{10ffff}"])"), SV("[{:?}]"), "\xf4\x8f\xbf\xbf"); // U+10FFFF last valid
// (in Supplementary Private Use Area-B)
test_format(SV(R"(["\x{f4}\x{90}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf4\x90\x80\x80"); // U+110000
test_format(SV(R"(["\x{f4}\x{bf}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf4\xbf\xbf\xbf"); // U+11FFFF
} else {
// Valid UTF-16 and UTF-32
test_format(SV("[\"\u00c3\"]"), SV("[{:?}]"), L"\xc3"); // LATIN CAPITAL LETTER A WITH TILDE
@ -320,11 +357,8 @@ void test_string() {
// Format
test_format(V{LR"("\u{ad}\u{600}\u{feff}")"}, L"{:?}", L"\xad\x600\xfeff");
if constexpr (sizeof(CharT) == 2)
// Incomplete surrogate pair in UTF-16
test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
else
test_format(V{LR"("\u{d800}")"}, L"{:?}", L"\xd800");
// Incomplete surrogate pair in UTF-16
test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
// Private_Use
test_format(V{LR"("\u{e000}\u{f8ff}")"}, L"{:?}", L"\xe000\xf8ff");