[libc++][format] Adds a UTF transcoder.

This is a preparation for

  P2093R14 Formatted output

When the output of print is to the terminal it needs to use the native
API. This means transcoding UTF-8 to UTF-16 on Windows. The encoder's
interface is modeled after

 P2728 Unicode in the Library, Part 1: UTF Transcoding

But only the required part for P2093R14 is implemented.

On Windows wchar_t is 16 bits. In order to test on platforms where
wchar_t is 32 bits, the transcoder has support for char16_t. It also adds
a UTF-8 to UTF-32 encoder, which is useful for other tests.

Note it is possible to use <codecvt> for transcoding, but that header is
deprecated. It is therefore better to write new code that is not
deprecated; the hard part, decoding, has already been done. The <codecvt>
header also requires locale support, while the new code works without
including <locale>.

Note the current transcoder implementation can be optimized since it
basically does UTF-8 -> UTF-32 -> UTF-16. The first goal is to have a
working implementation. Since it's not part of the ABI it's possible to
do the optimization later.

Depends on D149672

Reviewed By: ldionne, tahonermann, #libc

Differential Revision: https://reviews.llvm.org/D150031
This commit is contained in:
Mark de Wever 2023-04-21 08:09:06 +02:00
parent 2cb4731902
commit 20341c3ad6
13 changed files with 255 additions and 4 deletions

View File

@ -956,6 +956,7 @@ set(files
numeric
optional
ostream
print
queue
random
ranges

View File

@ -1368,6 +1368,10 @@ module std [system] {
// FIXME: should re-export ios, streambuf?
export *
}
module print {
header "print"
export *
}
module queue {
header "queue"
export *

119
libcxx/include/print Normal file
View File

@ -0,0 +1,119 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP_PRINT
#define _LIBCPP_PRINT
#include <__assert> // all public C++ headers provide the assertion handler
#include <__concepts/same_as.h>
#include <__config>
#include <__format/unicode.h>
#include <string_view>
#include <version>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER >= 23
# ifndef _LIBCPP_HAS_NO_UNICODE
// This is the code to transcode UTF-8 to UTF-16. This is used on
// Windows for the native Unicode API. The code is modeled to make it
// easier to extend to
//
// P2728R0 Unicode in the Library, Part 1: UTF Transcoding
//
// This paper is still under heavy development so it makes no sense yet
// to strictly follow the paper.
namespace __unicode {
// The names of these concepts are modelled after P2728R0, but the
// implementation is not. The concepts here match on the exact character type
// instead of on the number of bits, since char16_t may contain 32 bits, so
// depending on the number of bits is an issue.
//
// wchar_t is classified according to _LIBCPP_SHORT_WCHAR: when the macro is
// defined wchar_t models __utf16_code_unit, otherwise it models
// __utf32_code_unit. When _LIBCPP_HAS_NO_WIDE_CHARACTERS is defined wchar_t
// models neither concept.
# ifdef _LIBCPP_SHORT_WCHAR
// wchar_t is grouped with char16_t.
template <class _Tp>
concept __utf16_code_unit =
same_as<_Tp, char16_t>
# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
|| same_as<_Tp, wchar_t>
# endif
;
template <class _Tp>
concept __utf32_code_unit = same_as<_Tp, char32_t>;
# else // _LIBCPP_SHORT_WCHAR
// wchar_t is grouped with char32_t.
template <class _Tp>
concept __utf16_code_unit = same_as<_Tp, char16_t>;
template <class _Tp>
concept __utf32_code_unit =
same_as<_Tp, char32_t>
# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
|| same_as<_Tp, wchar_t>
# endif
;
# endif // _LIBCPP_SHORT_WCHAR
// Writes the encoding of one code point through an output iterator.
//
// This unconstrained overload is deleted; it is the fallback that is selected
// when the iterator's value type satisfies neither __utf16_code_unit nor
// __utf32_code_unit, which turns such a call into a compile-time error.
//
// Pass by reference since an output_iterator may not be copyable.
template <class _OutIt>
_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt&, char32_t) = delete;
// Encodes __value as UTF-16 and writes the code unit(s) through __out_it,
// advancing the iterator past everything that was written.
//
// Values below 0x10000 are written as a single code unit. All other values
// are written as two code units: 0xd800 plus the upper bits and 0xdc00 plus
// the lower 10 bits of (__value - 0x10000).
//
// The caller is expected to pass a Unicode scalar value; this is asserted.
template <class _OutIt>
  requires __utf16_code_unit<iter_value_t<_OutIt>>
_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
  _LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-16");

  if (__value >= 0x10000) {
    // Two code units are needed: split the offset from 0x10000 into its
    // upper part and its lower 10 bits.
    const char32_t __offset = __value - 0x10000;
    *__out_it++             = 0xd800 + (__offset >> 10);
    *__out_it++             = 0xdc00 + (__offset & 0x3FF);
  } else {
    // The value fits in one code unit.
    *__out_it++ = __value;
  }
}
// Encodes __value as UTF-32 and writes it through __out_it, advancing the
// iterator by one position. The value is written unchanged as one code unit.
//
// The caller is expected to pass a Unicode scalar value; this is asserted.
template <class _OutIt>
  requires __utf32_code_unit<iter_value_t<_OutIt>>
_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
  _LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-32");

  *__out_it = __value;
  ++__out_it;
}
// Transcodes the code units in [__first, __last) to the encoding selected by
// the value type of __out_it and writes the result through __out_it.
//
// The input is decoded one code point at a time with __code_point_view and
// every code point is re-encoded with __unicode::__encode. The overload is
// only available when the input and output code unit types differ.
//
// Returns the output iterator after the last code unit has been written.
//
// NOTE(review): the input is wrapped in a basic_string_view, whose
// iterator-pair constructor requires contiguous iterators. That is stricter
// than the input_iterator constraint on this function.
template <class _OutIt, input_iterator _InIt>
  requires output_iterator<_OutIt, const iter_value_t<_OutIt>&> && (!same_as<iter_value_t<_OutIt>, iter_value_t<_InIt>>)
_LIBCPP_HIDE_FROM_ABI constexpr _OutIt __transcode(_InIt __first, _InIt __last, _OutIt __out_it) {
  using _InCharT = iter_value_t<_InIt>;

  // The __code_point_view has a basic_string_view interface, so the input is
  // first viewed as a basic_string_view.
  //
  // When transcoding becomes part of the standard it is probably worth
  // looking at smarter algorithms. For example, a code point encoded in 1 to
  // 3 UTF-8 code units always needs 1 UTF-16 code unit, and a code point
  // encoded in 4 UTF-8 code units needs 2 UTF-16 code units.
  //
  // Note if P2728 is accepted types like int may become valid. In that case
  // the __code_point_view should use a span. Libc++ will remove support for
  // char_traits<int>.
  basic_string_view<_InCharT> __input{__first, __last};

  for (__code_point_view<_InCharT> __code_points{__input.begin(), __input.end()}; !__code_points.__at_end();)
    __unicode::__encode(__out_it, __code_points.__consume().__code_point);

  return __out_it;
}
} // namespace __unicode
# endif // _LIBCPP_HAS_NO_UNICODE
#endif // _LIBCPP_STD_VER >= 23
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP_PRINT

View File

@ -8,10 +8,7 @@
//===----------------------------------------------------------------------===//
module;
#if __has_include(<print>)
# error "include this header unconditionally and uncomment the exported symbols"
#include <print>
#endif
export module std:print;
export namespace std {

View File

@ -0,0 +1,87 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
// <print>
// Tests the UTF-8 to UTF-16/32 encoding.
// UTF-16 is used on Windows to write to the Unicode API.
// UTF-32 is used to test the Windows behaviour on Linux using 32-bit wchar_t.
#include <algorithm>
#include <array>
#include <cassert>
#include <print>
#include <string_view>
#include "test_macros.h"
#include "make_string.h"
#define SV(S) MAKE_STRING_VIEW(CharT, S)
// Transcodes the UTF-8 `input` to CharT code units and verifies the result.
//
// The output is written to a fixed-size buffer that is pre-filled with '*'.
// The test asserts that
// - the code units written are equal to `expected`, and
// - every element after the returned output position still holds '*', so
//   nothing was written beyond the returned position.
//
// `expected` must be shorter than the buffer.
template <class CharT>
constexpr void test(std::basic_string_view<CharT> expected, std::string_view input) {
  std::array<CharT, 1024> buffer;
  assert(expected.size() < buffer.size());
  std::ranges::fill(buffer, CharT('*'));

  // Both ends of the buffer are expressed as raw pointers. The type of
  // std::array<CharT, N>::iterator is implementation-defined, so mixing
  // buffer.end() with a CharT* is not guaranteed to compile.
  CharT* const last = buffer.data() + buffer.size();

  CharT* out = std::__unicode::__transcode(input.begin(), input.end(), buffer.data());
  assert(std::basic_string_view<CharT>(buffer.data(), out) == expected);

  // Look for the first element behind the output that lost its fill value.
  out = std::find_if(out, last, [](CharT c) { return c != CharT('*'); });
  assert(out == last);
}
// Runs all transcoding test cases for one output code unit type CharT.
//
// Each case calls test(expected, input), where `input` is a narrow string
// containing UTF-8 and `expected` is the transcoded text as CharT code units.
template <class CharT>
constexpr void test() {
// *** Test valid UTF-8 ***
// For valid input the same literal is used for both arguments: S is passed as
// the narrow input and SV(S) as the expected output.
// (SV presumably yields the literal as a basic_string_view<CharT>; see
// make_string.h.)
#define TEST(S) test(SV(S), S)
TEST("hello world");
// copied from benchmarks/std_format_spec_string_unicode.bench.cpp
TEST("Lorem ipsum dolor sit amet, ne sensibus evertitur aliquando his. Iuvaret fabulas qui ex.");
TEST("Lōrem ipsūm dolor sīt æmeÞ, ea vel nostrud feuġǣit, muciūs tēmporiȝusrefērrēnÞur no mel.");
TEST("Лорем ипсум долор сит амет, еу диам тамяуам принципес вис, еяуидем цонцептам диспутандо");
TEST("入ト年媛ろ舗学ラロ準募ケカ社金ス屋検れう策他セヲシ引口ぎ集7独ぱクふ出車ぽでぱ円輪ル受打わ。");
// Contains code points above U+FFFF.
TEST("\U0001f636\u200d\U0001f32b\ufe0f");
#undef TEST
// *** Test invalid UTF-8 ***
// The expected output contains U+FFFD where the input is ill-formed.
test(SV("\ufffd"), "\xc3");
// 0x28 is '(' and is expected to be kept after the U+FFFD.
test(SV("\ufffd("), "\xc3\x28");
// Surrogate range
test(SV("\ufffd"), "\xed\xa0\x80"); // U+D800
test(SV("\ufffd"), "\xed\xaf\xbf"); // U+DBFF
test(SV("\ufffd"), "\xed\xbf\x80"); // U+DC00
test(SV("\ufffd"), "\xed\xbf\xbf"); // U+DFFF
// Beyond valid values
test(SV("\ufffd"), "\xf4\x90\x80\x80"); // U+110000
test(SV("\ufffd"), "\xf4\xbf\xbf\xbf"); // U+11FFFF
// Validates http://unicode.org/review/pr-121.html option 3.
// The expected output has one U+FFFD for each of the six bytes between
// 0x61 ('a') and 0x62 ('b').
test(SV("\u0061\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0062"), "\x61\xF1\x80\x80\xE1\x80\xC2\x62");
}
// Runs the transcoding tests for every supported output code unit type.
// wchar_t is only tested when the test suite has wide character support.
//
// Always returns true so the function can be used in a static_assert.
constexpr bool test() {
  test<char16_t>();
  test<char32_t>();
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
  test<wchar_t>();
#endif

  return true;
}
// Evaluates the tests during constant evaluation and executes them at run-time.
int main(int, char**) {
  static_assert(test()); // compile-time evaluation
  test();                // run-time evaluation

  return 0;
}

View File

@ -635,6 +635,12 @@ ostream string
ostream type_traits
ostream typeinfo
ostream version
print cstddef
print cstdint
print initializer_list
print limits
print string_view
print version
queue compare
queue concepts
queue cstddef

1 algorithm atomic
635 ostream type_traits
636 ostream typeinfo
637 ostream version
638 print cstddef
639 print cstdint
640 print initializer_list
641 print limits
642 print string_view
643 print version
644 queue compare
645 queue concepts
646 queue cstddef

View File

@ -636,6 +636,12 @@ ostream string
ostream type_traits
ostream typeinfo
ostream version
print cstddef
print cstdint
print initializer_list
print limits
print string_view
print version
queue compare
queue concepts
queue cstddef

1 algorithm atomic
636 ostream type_traits
637 ostream typeinfo
638 ostream version
639 print cstddef
640 print cstdint
641 print initializer_list
642 print limits
643 print string_view
644 print version
645 queue compare
646 queue concepts
647 queue cstddef

View File

@ -638,6 +638,12 @@ ostream string
ostream type_traits
ostream typeinfo
ostream version
print cstddef
print cstdint
print initializer_list
print limits
print string_view
print version
queue compare
queue concepts
queue cstddef

1 algorithm atomic
638 ostream type_traits
639 ostream typeinfo
640 ostream version
641 print cstddef
642 print cstdint
643 print initializer_list
644 print limits
645 print string_view
646 print version
647 queue compare
648 queue concepts
649 queue cstddef

View File

@ -638,6 +638,12 @@ ostream string
ostream type_traits
ostream typeinfo
ostream version
print cstddef
print cstdint
print initializer_list
print limits
print string_view
print version
queue compare
queue concepts
queue cstddef

1 algorithm atomic
638 ostream type_traits
639 ostream typeinfo
640 ostream version
641 print cstddef
642 print cstdint
643 print initializer_list
644 print limits
645 print string_view
646 print version
647 queue compare
648 queue concepts
649 queue cstddef

View File

@ -644,6 +644,12 @@ ostream string
ostream type_traits
ostream typeinfo
ostream version
print cstddef
print cstdint
print initializer_list
print limits
print string_view
print version
queue compare
queue concepts
queue cstddef

1 algorithm atomic
644 ostream type_traits
645 ostream typeinfo
646 ostream version
647 print cstddef
648 print cstdint
649 print initializer_list
650 print limits
651 print string_view
652 print version
653 queue compare
654 queue concepts
655 queue cstddef

View File

@ -453,6 +453,12 @@ ostream streambuf
ostream string
ostream typeinfo
ostream version
print cstddef
print cstdint
print initializer_list
print limits
print string_view
print version
queue compare
queue cstddef
queue cstdint

1 algorithm climits
453 ostream string
454 ostream typeinfo
455 ostream version
456 print cstddef
457 print cstdint
458 print initializer_list
459 print limits
460 print string_view
461 print version
462 queue compare
463 queue cstddef
464 queue cstdint

View File

@ -453,6 +453,12 @@ ostream streambuf
ostream string
ostream typeinfo
ostream version
print cstddef
print cstdint
print initializer_list
print limits
print string_view
print version
queue compare
queue cstddef
queue cstdint

1 algorithm climits
453 ostream string
454 ostream typeinfo
455 ostream version
456 print cstddef
457 print cstdint
458 print initializer_list
459 print limits
460 print string_view
461 print version
462 queue compare
463 queue cstddef
464 queue cstdint

View File

@ -247,6 +247,7 @@ check-generated-output)
--exclude 'locale-specific_form.pass.cpp' \
--exclude 'ostream.pass.cpp' \
--exclude 'std_format_spec_string_unicode.bench.cpp' \
--exclude 'transcoding.pass.cpp' \
--exclude 'underflow.pass.cpp' \
|| false