[libc++][format] Improves Unicode decoders.

During the implementation of P2286 a second Unicode decoder was added. The original decoder was only used for the width estimation. Changing an ill-formed Unicode sequence to the replacement character, works properly for this use case. For P2286 an ill-formed Unicode sequence needs to be formatted as a sequence of code units. The exact wording in the Standard as a bit unclear and there was odd example in the WP. This made it hard to use the same decoder. SG16 determined the odd example in the WP was a bug and this has been fixed in the WP. This made it possible to combine the two decoders. The P2286 decoder kept track of the size of the ill-formed sequence. However this was not needed since the output algorithm needs to keep track of size of a well-formed and an ill-formed sequence. So this feature has been removed. The error status remains since it's needed for P2286, the grapheme clustering can ignore this unneeded value. (In general, grapheme clustering is only has specified behaviour for Unicode. When the string is in a non-Unicode encoding there are no requirements. Ill-formed Unicode is a non-Unicode encoding. Still libc++ does a best effort estimation.) There UTF-8 decoder accepted several ill-formed sequences: - Values in the surrogate range U+D800..U+DFFF. - Values encoded in more code units than required, for example 0+0020 in theory can be encoded using 1, 2, 3, or 4 were accepted. This is not allowed by the Unicode Standard. - Values larger than U+10FFFF were not always rejected. Reviewed By: #libc, ldionne, tahonermann, Mordante Differential Revision: https://reviews.llvm.org/D144346
2024-12-13 19:24:21 +00:00 · 2023-02-09 21:38:42 +01:00 · 2023-02-09 21:38:42 +01:00 · c866855b42
commit c866855b42
parent 95bc01dbec
3 changed files with 202 additions and 206 deletions
--- a/libcxx/include/__format/formatter_output.h
+++ b/libcxx/include/__format/formatter_output.h
@ -12,6 +12,7 @@

 #include <__algorithm/ranges_copy.h>
 #include <__algorithm/ranges_fill_n.h>
+#include <__algorithm/ranges_for_each.h>
 #include <__algorithm/ranges_transform.h>
 #include <__chrono/statically_widen.h>
 #include <__concepts/same_as.h>
@ -503,36 +504,17 @@ __escape(basic_string<_CharT>& __str, basic_string_view<_CharT> __values, __esca
  __unicode::__code_point_view<_CharT> __view{__values.begin(), __values.end()};

  while (!__view.__at_end()) {
-    auto __first                                        = __view.__position();
-    typename __unicode::__consume_p2286_result __result = __view.__consume_p2286();
-    if (__result.__ill_formed_size == 0) {
-      if (!__formatter::__is_escaped_sequence_written(__str, __result.__value, __mark))
+    auto __first                                  = __view.__position();
+    typename __unicode::__consume_result __result = __view.__consume();
+    if (__result.__status == __unicode::__consume_result::__ok) {
+      if (!__formatter::__is_escaped_sequence_written(__str, __result.__code_point, __mark))
        // 2.2.1.3 - Add the character
        ranges::copy(__first, __view.__position(), std::back_insert_iterator(__str));
-
    } else {
      // 2.2.3 sequence of ill-formed code units
-      // The number of code-units in __result.__value depends on the character type being used.
-      if constexpr (sizeof(_CharT) == 1) {
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1 || __result.__ill_formed_size == 4,
-                       "illegal number of invalid code units.");
-        if (__result.__ill_formed_size == 1) // ill-formed, one code unit
-          __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xff);
-        else { // out of valid range, four code units
-               // The code point was properly encoded, decode the value.
-          __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value >> 18 | 0xf0);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 12 & 0x3f) | 0x80);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 6 & 0x3f) | 0x80);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value & 0x3f) | 0x80);
-        }
-      } else if constexpr (sizeof(_CharT) == 2) {
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-16 at most one invalid code unit");
-        __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xffff);
-      } else {
-        static_assert(sizeof(_CharT) == 4, "unsupported character width");
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-32 one code unit is one code point");
-        __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value);
-      }
+      ranges::for_each(__first, __view.__position(), [&](_CharT __value) {
+        __formatter::__write_escape_ill_formed_code_unit(__str, __formatter::__to_char32(__value));
+      });
    }
  }
 }
--- a/libcxx/include/__format/unicode.h
+++ b/libcxx/include/__format/unicode.h
@ -31,23 +31,28 @@ _LIBCPP_BEGIN_NAMESPACE_STD

 namespace __unicode {

-#  if _LIBCPP_STD_VER >= 23
+// Helper struct for the result of a consume operation.
+//
+// The status value for a correct code point is 0. This allows a valid value to
+// be used without masking.
+// When the decoding fails it know the number of code units affected. For the
+// current use-cases that value is not needed, therefore it is not stored.
+// The escape routine needs the number of code units for both a valid and
+// invalid character and keeps track of it itself. Doing it in this result
+// unconditionally would give some overhead when the value is unneeded.
+struct __consume_result {
+  // When __status == __ok it contains the decoded code point.
+  // Else it contains the replacement character U+FFFD
+  char32_t __code_point : 31;

-/// The result of consuming a code point using P2286' semantics
-///
-/// TODO FMT Combine __consume and  __consume_p2286 in one function.
-struct __consume_p2286_result {
-  // A size of 0 means well formed. This to differenciate between
-  // a valid code point and a code unit that's invalid like 0b11111xxx.
-  int __ill_formed_size;
-
-  // If well formed the consumed code point.
-  // Otherwise the ill-formed code units as unsigned 8-bit values. They are
-  // stored in reverse order, to make it easier to extract the values.
-  char32_t __value;
+  enum : char32_t {
+    // Consumed a well-formed code point.
+    __ok = 0,
+    // Encountered invalid UTF-8
+    __error = 1
+  } __status : 1 {__ok};
 };
-
-#  endif // _LIBCPP_STD_VER >= 23
+static_assert(sizeof(__consume_result) == sizeof(char32_t));

 #  ifndef _LIBCPP_HAS_NO_UNICODE

@ -66,6 +71,36 @@ struct __consume_p2286_result {

 inline constexpr char32_t __replacement_character = U'\ufffd';

+// The error of a consume operation.
+//
+// This sets the code point to the replacement character. This code point does
+// not participate in the grapheme clustering, so grapheme clustering code can
+// ignore the error status and always use the code point.
+inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
+  return __value >= 0xd800 && __value <= 0xdbff;
+}
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
+  return __value >= 0xdc00 && __value <= 0xdfff;
+}
+
+// https://www.unicode.org/glossary/#surrogate_code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
+  return __value >= 0xd800 && __value <= 0xdfff;
+}
+
+// https://www.unicode.org/glossary/#code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
+  return __value <= 0x10ffff;
+}
+
+// https://www.unicode.org/glossary/#unicode_scalar_value
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
+  return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value);
+}
+
 template <contiguous_iterator _Iterator>
  requires same_as<iter_value_t<_Iterator>, char>
 _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
@ -97,61 +132,29 @@ public:
  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }

-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-
-    // Based on the number of leading 1 bits the number of code units in the
-    // code point can be determined. See
-    // https://en.wikipedia.org/wiki/UTF-8#Encoding
-    switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
-    case 0:
-      return *__first_++;
-
-    case 2:
-      if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
-        break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
-
-    case 3:
-      if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
-        break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
-
-    case 4:
-      if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
-        break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
-    }
-    // An invalid number of leading ones can be garbage or a code unit in the
-    // middle of a code point. By consuming one code unit the parser may get
-    // "in sync" after a few code units.
-    ++__first_;
-    return __replacement_character;
-  }
-
-#    if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
+  // https://www.unicode.org/versions/latest/ch03.pdf#G7404
+  // Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
+  //
+  // Code Points        First Byte Second Byte Third Byte Fourth Byte  Remarks
+  // U+0000..U+007F     00..7F                                         U+0000..U+007F 1 code unit range
+  //                    C0..C1     80..BF                              invalid overlong encoding
+  // U+0080..U+07FF     C2..DF     80..BF                              U+0080..U+07FF 2 code unit range
+  //                    E0         80..9F      80..BF                  invalid overlong encoding
+  // U+0800..U+0FFF     E0         A0..BF      80..BF                  U+0800..U+FFFF 3 code unit range
+  // U+1000..U+CFFF     E1..EC     80..BF      80..BF
+  // U+D000..U+D7FF     ED         80..9F      80..BF
+  // U+D800..U+DFFF     ED         A0..BF      80..BF                  invalid encoding of surrogate code point
+  // U+E000..U+FFFF     EE..EF     80..BF      80..BF
+  //                    F0         80..8F      80..BF     80..BF       invalid overlong encoding
+  // U+10000..U+3FFFF   F0         90..BF      80..BF     80..BF       U+10000..U+10FFFF 4 code unit range
+  // U+40000..U+FFFFF   F1..F3     80..BF      80..BF     80..BF
+  // U+100000..U+10FFFF F4         80..8F      80..BF     80..BF
+  //                    F4         90..BF      80..BF     80..BF       U+110000.. invalid code point range
+  //
+  // Unlike other parsers, these invalid entries are tested after decoding.
+  // - The parser always needs to consume these code units
+  // - The code is optimized for well-formed UTF-8
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");

    // Based on the number of leading 1 bits the number of code units in the
@ -159,60 +162,73 @@ public:
    // https://en.wikipedia.org/wiki/UTF-8#Encoding
    switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
    case 0:
-      return {0, static_cast<unsigned char>(*__first_++)};
+      return {static_cast<unsigned char>(*__first_++)};

-    case 2:
-      if (__last_ - __first_ < 2) [[unlikely]]
+    case 2: {
+      if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
        break;

-      if (__unicode::__is_continuation(__first_ + 1, 1)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return {0, __value};
-      }
-      break;
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;

-    case 3:
-      if (__last_ - __first_ < 3) [[unlikely]]
+      // These values should be encoded in 1 UTF-8 code unit.
+      if (__value < 0x0080) [[unlikely]]
+        return __consume_result_error;
+
+      return {__value};
+    }
+
+    case 3: {
+      if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
        break;

-      if (__unicode::__is_continuation(__first_ + 1, 2)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return {0, __value};
-      }
-      break;
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;

-    case 4:
-      if (__last_ - __first_ < 4) [[unlikely]]
+      // These values should be encoded in 1 or 2 UTF-8 code units.
+      if (__value < 0x0800) [[unlikely]]
+        return __consume_result_error;
+
+      // A surrogate value is always encoded in 3 UTF-8 code units.
+      if (__unicode::__is_surrogate(__value)) [[unlikely]]
+        return __consume_result_error;
+
+      return {__value};
+    }
+
+    case 4: {
+      if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
        break;

-      if (__unicode::__is_continuation(__first_ + 1, 3)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;

-        if (__value > 0x10FFFF) // Outside the valid Unicode range?
-          return {4, __value};
+      // These values should be encoded in 1, 2, or 3 UTF-8 code units.
+      if (__value < 0x10000) [[unlikely]]
+        return __consume_result_error;

-        return {0, __value};
-      }
-      break;
+      // A value too large is always encoded in 4 UTF-8 code units.
+      if (!__unicode::__is_code_point(__value)) [[unlikely]]
+        return __consume_result_error;
+
+      return {__value};
+    }
    }
    // An invalid number of leading ones can be garbage or a code unit in the
    // middle of a code point. By consuming one code unit the parser may get
    // "in sync" after a few code units.
-    return {1, static_cast<unsigned char>(*__first_++)};
+    ++__first_;
+    return __consume_result_error;
  }
-#    endif // _LIBCPP_STD_VER >= 23

 private:
  _Iterator __first_;
@ -244,62 +260,33 @@ public:
  _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }

-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");

+    char32_t __value = static_cast<char32_t>(*__first_++);
    if constexpr (sizeof(wchar_t) == 2) {
-      char32_t __result = *__first_++;
-      // Is the code unit part of a surrogate pair? See
-      // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
-      if (__result >= 0xd800 && __result <= 0xDfff) {
-        // Malformed Unicode.
-        if (__first_ == __last_) [[unlikely]]
-          return __replacement_character;
+      if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
+        return __consume_result_error;

-        __result -= 0xd800;
-        __result <<= 10;
-        __result += *__first_++ - 0xdc00;
-        __result += 0x10000;
+      if (__unicode::__is_high_surrogate(__value)) {
+        if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
+          return __consume_result_error;
+
+        __value -= 0xd800;
+        __value <<= 10;
+        __value += static_cast<char32_t>(*__first_++) - 0xdc00;
+        __value += 0x10000;
+
+        if (!__unicode::__is_code_point(__value)) [[unlikely]]
+          return __consume_result_error;
      }
-      return __result;
-
-    } else if constexpr (sizeof(wchar_t) == 4) {
-      char32_t __result = *__first_++;
-      if (__result > 0x10FFFF) [[unlikely]]
-        return __replacement_character;
-      return __result;
    } else {
-      __libcpp_unreachable();
-    }
-  }
-
-#      if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-
-    char32_t __result = *__first_++;
-    if constexpr (sizeof(wchar_t) == 2) {
-      // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
-      if (__is_surrogate_pair_high(__result)) {
-        // Malformed Unicode.
-        if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
-          return {1, __result};
-
-        __result -= 0xd800;
-        __result <<= 10;
-        __result += *__first_++ - 0xdc00;
-        __result += 0x10000;
-      } else if (__is_surrogate_pair_low(__result))
-        // A code point shouldn't start with the low surrogate pair
-        return {1, __result};
-    } else {
-      if (__result > 0x10FFFF) [[unlikely]]
-        return {1, __result};
+      if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
+        return __consume_result_error;
    }

-    return {0, __result};
+    return {__value};
  }
-#      endif // _LIBCPP_STD_VER >= 23

 private:
  _Iterator __first_;
@ -399,7 +386,7 @@ class __extended_grapheme_cluster_view {
 public:
  _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
      : __code_point_view_(__first, __last),
-        __next_code_point_(__code_point_view_.__consume()),
+        __next_code_point_(__code_point_view_.__consume().__code_point),
        __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}

  struct __cluster {
@ -420,6 +407,7 @@ public:
    _LIBCPP_ASSERT(
        __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
        "can't move beyond the end of input");
+
    char32_t __code_point = __next_code_point_;
    if (!__code_point_view_.__at_end())
      return {__code_point, __get_break()};
@ -444,7 +432,7 @@ private:
        __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
        return __result;
      }
-      __next_code_point_ = __code_point_view_.__consume();
+      __next_code_point_ = __code_point_view_.__consume().__code_point;
      __next_prop_       = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);

      __has_extened_pictographic |=
@ -474,19 +462,11 @@ public:
  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }

-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-    return *__first_++;
+    return {static_cast<char32_t>(*__first_++)};
  }

-#    if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-
-    return {0, std::make_unsigned_t<_CharT>(*__first_++)};
-  }
-#    endif // _LIBCPP_STD_VER >= 23
-
 private:
  _Iterator __first_;
  _Iterator __last_;
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@ -202,14 +202,9 @@ void test_char() {
    test_format(V{L"'\\u{600}'"}, L"{:?}", L'\x600');   // ARABIC NUMBER SIGN
    test_format(V{L"'\\u{feff}'"}, L"{:?}", L'\xfeff'); // ZERO WIDTH NO-BREAK SPACE

-    if constexpr (sizeof(CharT) == 2) {
-      // Incomplete surrogate pair in UTF-16
-      test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
-      test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
-    } else {
-      test_format(V{L"'\\u{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
-      test_format(V{L"'\\u{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
-    }
+    // Incomplete surrogate pair in UTF-16
+    test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
+    test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>

    // Private_Use
    test_format(V{L"'\\u{e000}'"}, L"{:?}", L'\xe000'); // <private-use-E000>
@ -277,6 +272,48 @@ void test_string() {
    // Ill-formend UTF-8
    test_format(SV(R"(["\x{c3}"])"), SV("[{:?}]"), "\xc3");
    test_format(SV(R"(["\x{c3}("])"), SV("[{:?}]"), "\xc3\x28");
+
+    /* U+0000..U+0007F 1 code unit range, encoded in 2 code units. */
+    test_format(SV(R"(["\x{c0}\x{80}"])"), SV("[{:?}]"), "\xc0\x80"); // U+0000
+    test_format(SV(R"(["\x{c1}\x{bf}"])"), SV("[{:?}]"), "\xc1\xbf"); // U+007F
+    test_format(SV(R"(["\u{80}"])"), SV("[{:?}]"), "\xc2\x80");       // U+0080 first valid (General_Category=Control)
+
+    /* U+0000..U+07FFF 1 and 2 code unit range, encoded in 3 code units. */
+    test_format(SV(R"(["\x{e0}\x{80}\x{80}"])"), SV("[{:?}]"), "\xe0\x80\x80"); // U+0000
+    test_format(SV(R"(["\x{e0}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xe0\x81\xbf"); // U+007F
+    test_format(SV(R"(["\x{e0}\x{82}\x{80}"])"), SV("[{:?}]"), "\xe0\x82\x80"); // U+0080
+    test_format(SV(R"(["\x{e0}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xe0\x9f\xbf"); // U+07FF
+    test_format(SV("[\"\u0800\"]"), SV("[{:?}]"), "\xe0\xa0\x80");              // U+0800 first valid
+
+#if 0
+	// This code point is in the Hangul Jamo Extended-B block and at the time of writing
+	// it's unassigned. When it comes defined, this branch might become true.
+    test_format(SV("[\"\ud7ff\"]"), SV("[{:?}]"), "\xed\x9f\xbf");              // U+D7FF last valid
+#else
+    /* U+D800..D+DFFFF surrogate range */
+    test_format(SV(R"(["\u{d7ff}"])"), SV("[{:?}]"), "\xed\x9f\xbf");           // U+D7FF last valid
+#endif
+    test_format(SV(R"(["\x{ed}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xed\xa0\x80"); // U+D800
+    test_format(SV(R"(["\x{ed}\x{af}\x{bf}"])"), SV("[{:?}]"), "\xed\xaf\xbf"); // U+DBFF
+    test_format(SV(R"(["\x{ed}\x{bf}\x{80}"])"), SV("[{:?}]"), "\xed\xbf\x80"); // U+DC00
+    test_format(SV(R"(["\x{ed}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xed\xbf\xbf"); // U+DFFF
+    test_format(SV(R"(["\u{e000}"])"), SV("[{:?}]"), "\xee\x80\x80");           // U+E000 first valid
+                                                                                // (in the Private Use Area block)
+
+    /* U+0000..U+FFFF 1, 2, and 3 code unit range */
+    test_format(SV(R"(["\x{f0}\x{80}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x80\x80"); // U+0000
+    test_format(SV(R"(["\x{f0}\x{80}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x81\xbf"); // U+007F
+    test_format(SV(R"(["\x{f0}\x{80}\x{82}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x82\x80"); // U+0080
+    test_format(SV(R"(["\x{f0}\x{80}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x9f\xbf"); // U+07FF
+    test_format(SV(R"(["\x{f0}\x{80}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\xa0\x80"); // U+0800
+    test_format(SV(R"(["\x{f0}\x{8f}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf0\x8f\xbf\xbf"); // U+FFFF
+    test_format(SV("[\"\U00010000\"]"), SV("[{:?}]"), "\xf0\x90\x80\x80");                // U+10000 first valid
+
+    /* U+10FFFF..U+1FFFFF invalid range */
+    test_format(SV(R"(["\u{10ffff}"])"), SV("[{:?}]"), "\xf4\x8f\xbf\xbf"); // U+10FFFF last valid
+                                                                            // (in Supplementary Private Use Area-B)
+    test_format(SV(R"(["\x{f4}\x{90}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf4\x90\x80\x80"); // U+110000
+    test_format(SV(R"(["\x{f4}\x{bf}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf4\xbf\xbf\xbf"); // U+11FFFF
  } else {
    // Valid UTF-16 and UTF-32
    test_format(SV("[\"\u00c3\"]"), SV("[{:?}]"), L"\xc3"); // LATIN CAPITAL LETTER A WITH TILDE
@ -320,11 +357,8 @@ void test_string() {
    // Format
    test_format(V{LR"("\u{ad}\u{600}\u{feff}")"}, L"{:?}", L"\xad\x600\xfeff");

-    if constexpr (sizeof(CharT) == 2)
-      // Incomplete surrogate pair in UTF-16
-      test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
-    else
-      test_format(V{LR"("\u{d800}")"}, L"{:?}", L"\xd800");
+    // Incomplete surrogate pair in UTF-16
+    test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");

    // Private_Use
    test_format(V{LR"("\u{e000}\u{f8ff}")"}, L"{:?}", L"\xe000\xf8ff");