[libc++][format] Adds a UTF transcoder.

This is a preparation for P2093R14 "Formatted output". When the output of print is to the terminal it needs to use the native API. This means transcoding UTF-8 to UTF-16 on Windows. The encoder's interface is modeled after P2728 "Unicode in the Library, Part 1: UTF Transcoding", but only the part required for P2093R14 is implemented. On Windows wchar_t is 16 bits; in order to test on platforms where wchar_t is 32 bits the transcoder has support for char16_t. It also adds a UTF-8 to UTF-32 encoder, which is useful for other tests. Note it is possible to use <codecvt> for transcoding, but that header is deprecated. So rather write new code that is not deprecated; the hard part, decoding, has already been done. The <codecvt> header also requires locale support while the new code works without including <locale>. Note the current transcoder implementation can be optimized since it basically does UTF-8 -> UTF-32 -> UTF-16. The first goal is to have a working implementation. Since it's not part of the ABI it's possible to do the optimization later. Depends on D149672. Reviewed By: ldionne, tahonermann, #libc Differential Revision: https://reviews.llvm.org/D150031
2024-10-08 11:44:05 +00:00 · 2023-04-21 08:09:06 +02:00 · 2023-04-21 08:09:06 +02:00 · 20341c3ad6
commit 20341c3ad6
parent 2cb4731902
13 changed files with 255 additions and 4 deletions
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@ -956,6 +956,7 @@ set(files
  numeric
  optional
  ostream
+  print
  queue
  random
  ranges
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@ -1368,6 +1368,10 @@ module std [system] {
    // FIXME: should re-export ios, streambuf?
    export *
  }
+  module print {
+    header "print"
+    export *
+  }
  module queue {
    header "queue"
    export *
--- a/libcxx/include/print
+++ b/libcxx/include/print
@ -0,0 +1,119 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_PRINT
+#define _LIBCPP_PRINT
+
+#include <__assert> // all public C++ headers provide the assertion handler
+#include <__concepts/same_as.h>
+#include <__config>
+#include <__format/unicode.h>
+#include <string_view>
+#include <version>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 23
+
+#  ifndef _LIBCPP_HAS_NO_UNICODE
+// This is the code to transcode UTF-8 to UTF-16. This is used on
+// Windows for the native Unicode API. The code is modeled to make it
+// easier to extend to
+//
+//  P2728R0 Unicode in the Library, Part 1: UTF Transcoding
+//
+// This paper is still under heavy development so it makes no sense yet
+// to strictly follow the paper.
+namespace __unicode {
+
+// The names of these concepts are modelled after P2728R0, but the
+// implementation is not. char16_t may contain 32 bits, so depending on the
+// number of bits is an issue; the concepts match on the character type instead.
+#    ifdef _LIBCPP_SHORT_WCHAR
+template <class _Tp>
+concept __utf16_code_unit =
+    same_as<_Tp, char16_t>
+#      ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+    || same_as<_Tp, wchar_t> // wchar_t is a UTF-16 code unit when _LIBCPP_SHORT_WCHAR is defined
+#      endif
+    ;
+template <class _Tp>
+concept __utf32_code_unit = same_as<_Tp, char32_t>;
+#    else // _LIBCPP_SHORT_WCHAR
+template <class _Tp>
+concept __utf16_code_unit = same_as<_Tp, char16_t>;
+template <class _Tp>
+concept __utf32_code_unit =
+    same_as<_Tp, char32_t>
+#      ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+    || same_as<_Tp, wchar_t> // otherwise wchar_t is a UTF-32 code unit
+#      endif
+    ;
+#    endif // _LIBCPP_SHORT_WCHAR
+
+// Pass by reference since an output_iterator may not be copyable.
+template <class _OutIt>
+_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt&, char32_t) = delete; // fallback: no UTF-16/UTF-32 overload matches
+
+template <class _OutIt> // UTF-16 overload: writes __value to __out_it as one or two code units.
+  requires __utf16_code_unit<iter_value_t<_OutIt>>
+_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
+  _LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-16");
+
+  if (__value < 0x10000) { // values below 0x10000 are written as a single code unit
+    *__out_it++ = __value;
+    return;
+  }
+
+  __value -= 0x10000; // the offset is split in two 10-bit halves forming a surrogate pair
+  *__out_it++ = 0xd800 + (__value >> 10); // high surrogate: upper bits of the offset
+  *__out_it++ = 0xdc00 + (__value & 0x3FF); // low surrogate: lower 10 bits of the offset
+}
+
+template <class _OutIt> // UTF-32 overload: writes __value to __out_it as a single code unit.
+  requires __utf32_code_unit<iter_value_t<_OutIt>>
+_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
+  _LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-32");
+  *__out_it++ = __value; // no conversion needed, the code point is stored as-is
+}
+
+template <class _OutIt, input_iterator _InIt>
+  requires output_iterator<_OutIt, const iter_value_t<_OutIt>&> && (!same_as<iter_value_t<_OutIt>, iter_value_t<_InIt>>)
+_LIBCPP_HIDE_FROM_ABI constexpr _OutIt __transcode(_InIt __first, _InIt __last, _OutIt __out_it) {
+  // Decodes every code point in [__first, __last) and encodes it to __out_it.
+  // Returns the output iterator after the last code unit written.
+  //
+  // The __code_point_view has a basic_string_view interface.
+  // When transcoding becomes part of the standard we probably want to look at smarter algorithms.
+  // For example, when processing a code point that is encoded in 1 to 3 code units in UTF-8, the result
+  // will always be encoded in 1 code unit in UTF-16 (code points that require 4 code units in UTF-8 will
+  // require 2 code units in UTF-16).
+  //
+  // Note if P2728 is accepted types like int may become valid. In that case the __code_point_view should
+  // use a span. Libc++ will remove support for char_traits<int>.
+  basic_string_view<iter_value_t<_InIt>> __data{__first, __last}; // NOTE(review): needs contiguous iterators
+  __code_point_view<iter_value_t<_InIt>> __view{__data.begin(), __data.end()};
+  while (!__view.__at_end())
+    __unicode::__encode(__out_it, __view.__consume().__code_point); // __encode advances __out_it in place
+  return __out_it;
+}
+
+} // namespace __unicode
+
+#  endif //  _LIBCPP_HAS_NO_UNICODE
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_PRINT
--- a/libcxx/modules/std/print.cppm
+++ b/libcxx/modules/std/print.cppm
@ -8,10 +8,7 @@
 //===----------------------------------------------------------------------===//

 module;
-#if __has_include(<print>)
-#  error "include this header unconditionally and uncomment the exported symbols"
 #include <print>
-#endif

 export module std:print;
 export namespace std {
--- a/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
+
+// <print>
+
+// Tests the UTF-8 to UTF-16/32 encoding.
+// UTF-16 is used on Windows to write to the Unicode API.
+// UTF-32 is used to test the Windows behaviour on Linux using 32-bit wchar_t.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <print>
+#include <string_view>
+
+#include "test_macros.h"
+#include "make_string.h"
+
+#define SV(S) MAKE_STRING_VIEW(CharT, S)
+
+template <class CharT> // Transcodes input to CharT code units and verifies the exact output.
+constexpr void test(std::basic_string_view<CharT> expected, std::string_view input) {
+  assert(expected.size() < 1024); // the expected output must fit in the buffer
+  std::array<CharT, 1024> buffer;
+  std::ranges::fill(buffer, CharT('*')); // sentinel used below to detect stray writes
+
+  CharT* out = std::__unicode::__transcode(input.begin(), input.end(), buffer.data());
+
+  assert(std::basic_string_view<CharT>(buffer.data(), out) == expected);
+
+  out = std::find_if(out, buffer.end(), [](CharT c) { return c != CharT('*'); }); // look for a clobbered sentinel
+  assert(out == buffer.end()); // nothing beyond the returned iterator was written
+}
+
+template <class CharT> // Runs all valid and invalid UTF-8 inputs with CharT as the output code unit type.
+constexpr void test() {
+  // *** Test valid UTF-8 ***
+#define TEST(S) test(SV(S), S)
+  TEST("hello world");
+  // copied from benchmarks/std_format_spec_string_unicode.bench.cpp
+  TEST("Lorem ipsum dolor sit amet, ne sensibus evertitur aliquando his. Iuvaret fabulas qui ex.");
+  TEST("Lōrem ipsūm dolor sīt æmeÞ, ea vel nostrud feuġǣit, muciūs tēmporiȝusrefērrēnÞur no mel.");
+  TEST("Лорем ипсум долор сит амет, еу диам тамяуам принципес вис, еяуидем цонцептам диспутандо");
+  TEST("入ト年媛ろ舗学ラロ準募ケカ社金スノ屋検れう策他セヲシ引口ぎ集7独ぱクふ出車ぽでぱ円輪ルノ受打わ。");
+  TEST("\U0001f636\u200d\U0001f32b\ufe0f");
+#undef TEST
+
+  // *** Test invalid UTF-8 ***
+  test(SV("\ufffd"), "\xc3"); // lead byte without its continuation byte
+  test(SV("\ufffd("), "\xc3\x28"); // lead byte followed by '(' instead of a continuation byte
+
+  // Surrogate range
+  test(SV("\ufffd"), "\xed\xa0\x80"); // U+D800
+  test(SV("\ufffd"), "\xed\xaf\xbf"); // U+DBFF
+  test(SV("\ufffd"), "\xed\xbf\x80"); // U+DC00
+  test(SV("\ufffd"), "\xed\xbf\xbf"); // U+DFFF
+
+  // Beyond valid values
+  test(SV("\ufffd"), "\xf4\x90\x80\x80"); // U+110000
+  test(SV("\ufffd"), "\xf4\xbf\xbf\xbf"); // U+11FFFF
+
+  // Validates http://unicode.org/review/pr-121.html option 3.
+  test(SV("\u0061\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0062"), "\x61\xF1\x80\x80\xE1\x80\xC2\x62");
+}
+
+constexpr bool test() { // Runs the transcoding checks for every tested output code unit type.
+  test<char16_t>();
+  test<char32_t>();
+#if !defined(TEST_HAS_NO_WIDE_CHARACTERS)
+  test<wchar_t>();
+#endif
+  return true; // a bool result lets main() use this function in a static_assert
+}
+
+int main(int, char**) {
+  test(); // run-time evaluation
+  static_assert(test()); // compile-time evaluation of the same checks
+
+  return 0;
+}
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@ -635,6 +635,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@ -636,6 +636,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@ -638,6 +638,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@ -638,6 +638,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@ -644,6 +644,12 @@ ostream string
 ostream type_traits
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue concepts
 queue cstddef
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@ -453,6 +453,12 @@ ostream streambuf
 ostream string
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue cstddef
 queue cstdint
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@ -453,6 +453,12 @@ ostream streambuf
 ostream string
 ostream typeinfo
 ostream version
+print cstddef
+print cstdint
+print initializer_list
+print limits
+print string_view
+print version
 queue compare
 queue cstddef
 queue cstdint
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@ -247,6 +247,7 @@ check-generated-output)
           --exclude 'locale-specific_form.pass.cpp' \
           --exclude 'ostream.pass.cpp' \
           --exclude 'std_format_spec_string_unicode.bench.cpp' \
+           --exclude 'transcoding.pass.cpp' \
           --exclude 'underflow.pass.cpp' \
           || false