[libc++] Implements Unicode grapheme clustering

This implements the Grapheme clustering as required by
P1868R2 width: clarifying units of width and precision in std::format

This was omitted in the initial patch, but the paper was marked as completed. This really completes the paper.

Reviewed By: ldionne, #libc

Differential Revision: https://reviews.llvm.org/D126971
This commit is contained in:
Mark de Wever 2022-05-28 15:30:10 +02:00
parent 8e48783dcb
commit 857a78c04d
20 changed files with 4263 additions and 1076 deletions

View File

@ -6,183 +6,287 @@
#ifndef _LIBCPP_HAS_NO_UNICODE
#include <array>
#include <format>
# include <format>
# include <string_view>
#include "benchmark/benchmark.h"
# include "benchmark/benchmark.h"
#include "test_macros.h"
# include "make_string.h"
template <class CharT, size_t N>
class tester {
static constexpr size_t size_ = N - 1;
std::array<CharT, 100 * size_> data_;
# define SV(S) MAKE_STRING_VIEW(CharT, S)
public:
explicit constexpr tester(const CharT (&input)[N]) {
auto it = data_.begin();
for (int i = 0; i < 100; ++i)
it = std::copy_n(input, size_, it);
}
// generated with https://generator.lorem-ipsum.info/_latin
constexpr size_t size() const noexcept { return data_.size(); }
constexpr const CharT* begin() const noexcept { return data_.begin(); }
constexpr const CharT* end() const noexcept { return data_.end(); }
template <class CharT>
std::basic_string_view<CharT> ascii_text() {
return SV(
R"( Lorem ipsum dolor sit amet, ne sensibus evertitur aliquando his.
Iuvaret fabulas qui ex, ex iriure iisque nostrum mea. Solum
pericula qui ad. Elitr oporteat ius ad.
void test(benchmark::State& state) const {
Quas rationibus ad mel. Appellantur intellegebat ad mei, ius audire volumus
consectetuer id. Ei sit definitionem mediocritatem, vim indoctum intellegat id,
dicta laboramus instructior in vix. Mel an quando malorum, id vis mollis
invidunt, placerat maiestatis comprehensam ut cum. Suas regione interesset id
per, et docendi accumsan has, autem atomorum est te.
Cu debitis ancillae sea, alii definitiones ex cum, vim no erat antiopam. Eam et
unum quas scriptorem. An bonorum elaboraret complectitur nam, vim ei persecuti
democritum mediocritatem. Suscipit platonem signiferumque ei cum, in sale
volutpat ocurreret vel. Te vel nihil nominavi adipiscing, stet ancillae mel ea.
Sit detraxit menandri platonem ea, cum at tale viris virtute.
Regione detraxit gloriatur sit eu, sonet labitur sententiae et pro, at sit
alterum aliquid interpretaris. Sonet voluptua duo id, vix ea accumsan
liberavisse. Nam id commune probatus contentiones. Et zril dolore laudem duo,
ea usu mollis melius referrentur, vel ex case consequuntur. Id nam illum mollis
ponderum. Quis tamquam ullamcorper sed ne, legimus vituperatoribus est id.
Et eum probo consulatu. At eos errem aliquando theophrastus, sea ad eius omnis.
No vis iusto scriptorem adversarium, dicat viderer ea sit. Et veri euripidis
sea, justo putent iudicabit vim id. Sea suas tincidunt vituperatoribus in. Ne
eam aeterno sensibus concludaturque, solet legere his id, usu ei dicat
dissentiunt. Est et autem erant.
Per quod laboramus an. Dico voluptua at mea, an animal minimum eum. Pri an
option salutatus, causae feugiat menandri an sed. Voluptaria dissentiet vix ut,
alii solet te quo, in facer ceteros eos. Ad nibh meis percipitur sit,
aliquam molestie cu vis, iisque malorum interesset et eos.
Eos in feugiat insolens abhorreant. Ea tale esse alienum has, mel et saperet
appellantur, aliquip salutandi deterruisset ut mel. Eos ei quod simul
interpretaris, aeque elitr putent per at, et veri eripuit ceteros his. Cu pro
meis aperiam volutpat, ex alterum scripserit ius, scriptorem deterruisset eu
qui. Graeco debitis lobortis cu mea.
Alii corpora id ius, cu quo oblique eloquentiam. Et duis civibus atomorum sea,
veniam utroque scriptorem vim cu. Ut oratio eruditi mediocritatem est. Amet
nibh dolore mea ea, tollit laoreet eligendi qui ex, cu essent forensibus
his.
Usu ex ipsum apeirian, eos congue scripserit omittantur et. Ea eum persecuti
deseruisse, probatus torquatos est no, in has mutat mundi dolorem. Albucius
sensibus ex cum. Ferri virtute referrentur an per, est choro option bonorum ex.
Quando accusam vis te, tale mazim et pro. Magna dolorem tincidunt
nec te, albucius adipisci ad pri. Magna facilisi adipisci at usu, et vel
dissentiunt neglegentur, prima audiam vocibus an duo. Enim detracto te sea, mel
quis dicit gubergren ex, iusto adversarium consequuntur per ne.
)");
}
// Returns the benchmark's Latin-script sample text that mixes ASCII with
// non-ASCII letters (for example ō, æ, Þ, ð, ƿ and ȝ) as a
// std::basic_string_view<CharT>.
//
// The text is a raw string literal, so every character between R"( and )",
// including the line breaks, is part of the returned data.
// NOTE(review): SV expands to MAKE_STRING_VIEW(CharT, S); presumably that
// selects the literal with the encoding prefix matching CharT - confirm in
// make_string.h.
template <class CharT>
std::basic_string_view<CharT> unicode_text() {
return SV(
R"(Lōrem ipsūm dolor sīt æmeÞ, ea vel nostrud feuġǣit, muciūs tēmporiȝus
refērrēnÞur no mel, quo placērǽt consecÞetuer . Veri soƿet euripīðis id has,
sumo paulō dissentias duo , dētrāxīt neglēgeƿtur prī. Sēd option oporÞerē
no. Nec ēū nēmore mentitum. Veri prōȝo faċilis āt vīm.
Ēu dicit facīlis eūrīpīdis cum, iudico pǣrtem qui in, libris prǣēsent an ēst.
Æt sit quoðsi impētus, nec ex qūaeque honestǣtīs. Fiērēƿt ƿōluisse verterem iƿ
ēst. Meī apēriæm fierent peÞentīūm. Eæm officiīs reprehēndunt .
Ut vel quodsī contentioƿes, his dignissim īnstruċÞior. Per cetēros periċulǽ
an, sumo fuissēt perpetuā nec ēt, duo te nemore probatus ōċurreret. Mel ǣd
civībus ocūrreret. Ex nostro ǣliquam usu, ex Þātīon adipiscī qui. Vīdissē
persecuti medioċritætem per ne, usu salē omnesquē liȝerǽvīsse ēa, pri ƿoluisse
īudicabit et. No summo quiðǣm nec, vim ēi nūmqūam sænctus concepÞǣm. Reque
doceƿdi īn īus, porro eripuiÞ intērprētaris pri in.
Idquē hǣbēmus nominati vix . prō ǽmēt elit periculæ. Has virīs viderer ān.
Mel in suās pericūlīs āppellantur, nonumes deserūƿt ǽðversarium has. ĒliÞ
possīt commuƿe no ē, niȝh aċcusāmūs volūpÞatum no mel, ut quō ciȝo ðiceret.
Inǣni scripta quālīsque qūi, ad ipsūm persecuÞi mediōcritæÞēm vel.
Ǣppetere definitiōnes mel id. Leġerē āliquip nam , rēgione viderer pǣtrioque
duo te, meƿāƿdri prodēsseÞ ex hīs. Solum quidam iūs, mēl ǣt sapientem
expliċari. Īƿ ǣċcusǣm phǽedrum pro, ex pro dēleƿit detræxit hendrerīt, sit āgam
quidām pertinax . Ēssent rætionibus vēl, quo ān labore nusquæm nominǣti.
Te alii cōnseÞetur ƿam, eam ēt puteƿÞ ðissentiæs. Qūi alii dicānt repuðiære ēā,
mel ferri nūsquam. Ea vim impedīt vertērem, ǣn per veri Þīmeam. SiÞ ōmitÞǽm
necēssitǣÞibus ex, ƿe vis inǣni pærtem invenire. Īd ðolores ċonsēċÞeÞuer usu,
īd vis nisl dēnique luptǣtūm. Pro ǽd ēverti option dēserūƿt, nec te ōð
cīvībūs.
Ēæ nibh æccommodarē eum. Ne etiæm īudico dicunt duo, quo tēmpor populo insōlens
. Ēos ēirmod prǽēsēƿt. Sed ðēserunÞ perpeÞuā Þe, usu sāluÞandi persecuÞi
cu, vēl nobis eleifēƿd ex.
Ƿe zrīl ūtīnam lǣtīne eǣm, vim rebum omitÞǣm aðipisciƿg. Amet inermis
epiċūri ut est, eu duo hīnc periċulis. Mel no reque simul volupÞātum, ex mutat
lāudem tacīmatēs cum. Te hǣs summo iƿteġre recteque. No iūs dicerēt
ðisputǽtioƿi. Vim ōmnis deleƿiÞi honestātis ēǽ.
Nec detrǣcto pērcipitur ne. Ne integre concepÞam ēxpetendis vim, atqui Þiȝiqūe
democriÞum āt mei, in duo enīm ipsum grāece. Rebum ðefīnīÞionem āt pri, ēt sit
brute periculis. Ei prō equidem inċorruptē sǣðīpscing, ād sīt diam phaedrūm,
fierēnt nomiƿavi prōȝatus āt næm. Wisi ƿæÞūm coƿsecteÞuer usū ea.
)");
}
// Returns the benchmark's Cyrillic sample text as a
// std::basic_string_view<CharT>.
//
// The text is a raw string literal, so the line breaks inside it are part of
// the returned data; unlike the other sample texts visible here, the literal
// does not end with a line break.
// NOTE(review): SV expands to MAKE_STRING_VIEW(CharT, S); presumably that
// selects the literal with the encoding prefix matching CharT - confirm in
// make_string.h.
template <class CharT>
std::basic_string_view<CharT> cyrillic_text() {
return SV(
R"(Лорем ипсум долор сит амет, еу диам тамяуам принципес вис, еяуидем
цонцептам диспутандо яуи цу, иус ад натум нулла граеци. Цибо дицит омниум нец
цу, еу бруте номинави диссентиет яуо. Омниум лаборамус еу хас. Дицат
диспутатиони вис еу, цу еос миним атоморум инцидеринт. Пер хабео рецтеяуе
дигниссим ан, ех яуо сенсибус торяуатос, ан.
Ут перпетуа партиендо принципес хис. Ат симул ностер аппареат пер. Пурто вирис
ет хис, мазим дицерет при ет. Хис саперет тибияуе сцаевола еу, сит солет
вивендум цонсеяуат те. Ид оффициис перпетуа ассентиор яуи, сед аугуе афферт
симилияуе ад, ех адмодум постулант иус.
Про дицунт волуптатум диспутатиони ат. Вел патриояуе персецути еа, цетерос
диспутатиони ин сед, нам те веро цлита малуиссет. Цу неглегентур инструцтиор
интерпретарис еам, ипсум фабулас еи вел. Еи адхуц деленити нам, аугуе
демоцритум при ан. Вим мелиоре проприае ид, албуциус волуптуа цоррумпит дуо ан.
Латине иуварет пер ут, иус еа мунере ерипуит санцтус.
Модус тритани иус не, вим ут мелиоре мандамус, лабитур опортере дуо но. Ад нец
витае фацилис инцоррупте, цу сед толлит сцрипторем. Сит лудус инимицус
волуптариа не. Иисяуе антиопам сапиентем сед еу. Путент волуптуа сит ех, ат иус
ребум епицури, яуи моллис елигенди ех. Проприае нолуиссе цу сеа, путент поссит
адверсариум про не.
Ид яуо прима бонорум, дуо форенсибус яуаерендум еи, еум бруте мунере те. Еам
риденс граецо ех, аеяуе санцтус маиорум ан вел. Либрис санцтус утрояуе ест но,
еам ат реяуе порро тинцидунт, ут хинц иллуд патриояуе хис. Не солет оффендит
форенсибус хас, тамяуам опортеат елаборарет те нец, еу аугуе примис маиорум
еам. Аутем вениам импедит вис ин, прима елитр пхаедрум ест еу.)");
}
// Returns the benchmark's Japanese sample text (kanji, hiragana, katakana and
// digits) as a std::basic_string_view<CharT>.
//
// The text is a raw string literal, so the line breaks inside it are part of
// the returned data.
// NOTE(review): SV expands to MAKE_STRING_VIEW(CharT, S); presumably that
// selects the literal with the encoding prefix matching CharT - confirm in
// make_string.h.
template <class CharT>
std::basic_string_view<CharT> japanese_text() {
return SV(
R"(入ト年媛ろ舗学ラロ準募ケカ社金ス屋検れう策他セヲシ引口ぎ集7独ぱクふ出車ぽでぱ円輪ル受打わ。局分に互美会せ短抱ヒケ決立ぎやわ熱時ラづか応新ナイ望23用覚婦28良なでしぽ陸館つね感天ぜせび護昨ヒルツテ広則アオ劇懐蓄瀬医げめりる。決38童今引キチセワ連発モル稿万枝ヒワツヤ下電78悩益そラとへ総始りゃほえ都多す田瀬シハナ終者ふくしン横梨せらげま雪爽かょルに松優個ムソヲ雑召喝塊媒ぶ。
20802
8稿9230稿388
宿78姿11895
姿476589
74429345姿
)");
}
// Returns the sample text for the emoji benchmark as a
// std::basic_string_view<CharT>. The literal repeats the same three lines
// fourteen times and ends with one additional line.
//
// NOTE(review): the text sits inside a raw string literal (R"(...)"). Inside
// a raw string, \u and \U sequences are not translated into the code points
// they name, so as written the view appears to hold the backslash sequences as
// plain ASCII characters rather than emoji. Confirm whether that is intended
// before relying on this benchmark for emoji grapheme clusters; converting
// the literal would also change the size of the text.
// NOTE(review): SV expands to MAKE_STRING_VIEW(CharT, S); presumably that
// selects the literal with the encoding prefix matching CharT - confirm in
// make_string.h.
template <class CharT>
std::basic_string_view<CharT> emoji_text() {
return SV(
R"(
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F636\u200D\U0001F32B\uFE0F
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
\U0001F984
)");
}
template <class CharT>
void BM_text(benchmark::State& state, std::basic_string_view<CharT> input) {
CharT buffer[5'000];
if constexpr (std::same_as<CharT, char>) {
// Make sure the output buffer is large enough.
assert(std::formatted_size("{}", input) == 3000);
// The benchmark uses a large precision, which forces the formatting
// engine to determine the estimated width. (There's no direct way to call
// this function in portable code.)
for (auto _ : state)
benchmark::DoNotOptimize(std::__format_spec::__get_string_alignment(
begin(), end(), 1'000'000, 1'000'000));
state.SetItemsProcessed(state.iterations() * size());
benchmark::DoNotOptimize(std::format_to(buffer, "{:.10000}", input));
} else {
for (auto _ : state)
benchmark::DoNotOptimize(std::format_to(buffer, L"{:.10000}", input));
}
};
#define TEST(u8) \
if constexpr (std::same_as<CharT, char>) { \
constexpr auto p = tester{u8}; \
p.test(state); \
} else if constexpr (std::same_as<CharT, char16_t>) { \
constexpr auto p = tester{TEST_CONCAT(u, u8)}; \
p.test(state); \
} else { \
constexpr auto p = tester{TEST_CONCAT(U, u8)}; \
p.test(state); \
}
template <class CharT>
static void BM_EstimateLengthNoMultiByte(benchmark::State& state) {
TEST("The quick brown fox jumps over the lazy dog");
}
template <class CharT>
static void BM_EstimateLengthTwoByteDE(benchmark::State& state) {
static_assert(sizeof("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich") == 67);
// https://en.wikipedia.org/wiki/Pangram
TEST("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich");
void BM_ascii_text(benchmark::State& state) {
BM_text(state, ascii_text<CharT>());
}
template <class CharT>
static void BM_EstimateLengthTwoBytePL(benchmark::State& state) {
static_assert(sizeof("Stróż pchnął kość w quiz gędźb vel fax myjń") == 53);
// https://en.wikipedia.org/wiki/Pangram
TEST("Stróż pchnął kość w quiz gędźb vel fax myjń");
}
// All values below are less than 1100, which is the first multi column sequence.
template <class CharT>
static void BM_EstimateLengthThreeByteSingleColumnLow(benchmark::State& state) {
static_assert(sizeof("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
49);
TEST("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
void BM_unicode_text(benchmark::State& state) {
BM_text(state, unicode_text<CharT>());
}
template <class CharT>
static void
BM_EstimateLengthThreeByteSingleColumnHigh(benchmark::State& state) {
static_assert(sizeof("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f") ==
49);
TEST("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f");
void BM_cyrillic_text(benchmark::State& state) {
BM_text(state, cyrillic_text<CharT>());
}
template <class CharT>
static void BM_EstimateLengthThreeByteDoubleColumn(benchmark::State& state) {
static_assert(sizeof("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
"\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
49);
TEST("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
"\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
void BM_japanese_text(benchmark::State& state) {
BM_text(state, japanese_text<CharT>());
}
template <class CharT>
static void BM_EstimateLengthThreeByte(benchmark::State& state) {
static_assert(sizeof("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e") ==
49);
TEST("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e");
void BM_emoji_text(benchmark::State& state) {
BM_text(state, emoji_text<CharT>());
}
template <class CharT>
static void BM_EstimateLengthFourByteSingleColumn(benchmark::State& state) {
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
"\U00010004\U00010005\U00010006\U00010007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0001000c\U0001000d\U0001000e\U0001000f") == 65);
BENCHMARK_TEMPLATE(BM_ascii_text, char);
BENCHMARK_TEMPLATE(BM_unicode_text, char);
BENCHMARK_TEMPLATE(BM_cyrillic_text, char);
BENCHMARK_TEMPLATE(BM_japanese_text, char);
BENCHMARK_TEMPLATE(BM_emoji_text, char);
TEST("\U00010000\U00010001\U00010002\U00010003"
"\U00010004\U00010005\U00010006\U00010007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0001000c\U0001000d\U0001000e\U0001000f");
}
template <class CharT>
static void BM_EstimateLengthFourByteDoubleColumn(benchmark::State& state) {
static_assert(sizeof("\U00020000\U00020002\U00020002\U00020003"
"\U00020004\U00020005\U00020006\U00020007"
"\U00020008\U00020009\U0002000a\U0002000b"
"\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
TEST("\U00020000\U00020002\U00020002\U00020003"
"\U00020004\U00020005\U00020006\U00020007"
"\U00020008\U00020009\U0002000a\U0002000b"
"\U0002000c\U0002000d\U0002000e\U0002000f");
}
template <class CharT>
static void BM_EstimateLengthFourByte(benchmark::State& state) {
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
"\U00020004\U00020005\U00020006\U00020007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
TEST("\U00010000\U00010001\U00010002\U00010003"
"\U00020004\U00020005\U00020006\U00020007"
"\U00010008\U00010009\U0001000a\U0001000b"
"\U0002000c\U0002000d\U0002000e\U0002000f");
}
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char);
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char16_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char32_t);
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char32_t);
BENCHMARK_TEMPLATE(BM_ascii_text, wchar_t);
BENCHMARK_TEMPLATE(BM_unicode_text, wchar_t);
BENCHMARK_TEMPLATE(BM_cyrillic_text, wchar_t);
BENCHMARK_TEMPLATE(BM_japanese_text, wchar_t);
BENCHMARK_TEMPLATE(BM_emoji_text, wchar_t);
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);

View File

@ -275,6 +275,7 @@ set(files
__format/buffer.h
__format/concepts.h
__format/enable_insertable.h
__format/extended_grapheme_cluster_table.h
__format/format_arg.h
__format/format_arg_store.h
__format/format_args.h
@ -294,6 +295,7 @@ set(files
__format/formatter_pointer.h
__format/formatter_string.h
__format/parser_std_format_spec.h
__format/unicode.h
__functional/binary_function.h
__functional/binary_negate.h
__functional/bind.h

View File

@ -0,0 +1,332 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_table.py
// DO NOT MODIFY!
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.
#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
#include <__algorithm/upper_bound.h>
#include <__config>
#include <__iterator/access.h>
#include <cstddef>
#include <cstdint>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER > 17
namespace __extended_grapheme_custer_property_boundary {
// The properties used when determining extended grapheme cluster boundaries.
//
// The enumerators carry no explicit values, so their numeric values follow
// from the declaration order, starting at 0, and are stored in a uint8_t.
// NOTE(review): the table in this header is documented as keeping a property
// in bits [0, 3] of every entry, so the order of the generated enumerators
// presumably has to stay in sync with the generator script - do not reorder
// them by hand.
enum class __property : uint8_t {
// Values generated from the data files.
__CR,
__Control,
__Extend,
__Extended_Pictographic,
__L,
__LF,
__LV,
__LVT,
__Prepend,
__Regional_Indicator,
__SpacingMark,
__T,
__V,
__ZWJ,
// The properties below aren't stored in the "database".
// Text position properties.
// NOTE(review): presumably "start of text" and "end of text" - confirm
// against the code that uses them.
__sot,
__eot,
// The code unit has none of the above properties.
__none
};
/// The entries of the extended grapheme cluster boundary property table.
///
/// The data is generated from
/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
///
/// The data has 3 values
/// - bits [0, 3] The property. One of the values generated from the data files
/// of \ref __property
/// - bits [4, 10] The size of the range.
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
/// the range is lower bound + size.
///
/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
/// in the Unicode tables are larger. They are stored in multiple consecutive
/// ranges in the data table. An alternative would be to store the sizes in a
/// separate 16-bit value. The original MSVC STL code had such an approach, but
/// this approach uses less space for the data and is about 4% faster in the
/// following benchmark.
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
inline constexpr uint32_t __entries[1480] = {
0x00000091, 0x00005005, 0x00005811, 0x00006800, 0x00007111, 0x0003fa01, 0x00054803, 0x00056801, 0x00057003,
0x001806f2, 0x00241862, 0x002c8ac2, 0x002df802, 0x002e0812, 0x002e2012, 0x002e3802, 0x00300058, 0x003080a2,
0x0030e001, 0x00325942, 0x00338002, 0x0036b062, 0x0036e808, 0x0036f852, 0x00373812, 0x00375032, 0x00387808,
0x00388802, 0x003981a2, 0x003d30a2, 0x003f5882, 0x003fe802, 0x0040b032, 0x0040d882, 0x00412822, 0x00414842,
0x0042c822, 0x00448018, 0x0044c072, 0x00465172, 0x00471008, 0x004719f2, 0x0048180a, 0x0049d002, 0x0049d80a,
0x0049e002, 0x0049f02a, 0x004a0872, 0x004a483a, 0x004a6802, 0x004a701a, 0x004a8862, 0x004b1012, 0x004c0802,
0x004c101a, 0x004de002, 0x004df002, 0x004df81a, 0x004e0832, 0x004e381a, 0x004e581a, 0x004e6802, 0x004eb802,
0x004f1012, 0x004ff002, 0x00500812, 0x0050180a, 0x0051e002, 0x0051f02a, 0x00520812, 0x00523812, 0x00525822,
0x00528802, 0x00538012, 0x0053a802, 0x00540812, 0x0054180a, 0x0055e002, 0x0055f02a, 0x00560842, 0x00563812,
0x0056480a, 0x0056581a, 0x00566802, 0x00571012, 0x0057d052, 0x00580802, 0x0058101a, 0x0059e002, 0x0059f012,
0x005a000a, 0x005a0832, 0x005a381a, 0x005a581a, 0x005a6802, 0x005aa822, 0x005b1012, 0x005c1002, 0x005df002,
0x005df80a, 0x005e0002, 0x005e081a, 0x005e302a, 0x005e502a, 0x005e6802, 0x005eb802, 0x00600002, 0x0060082a,
0x00602002, 0x0061e002, 0x0061f022, 0x0062083a, 0x00623022, 0x00625032, 0x0062a812, 0x00631012, 0x00640802,
0x0064101a, 0x0065e002, 0x0065f00a, 0x0065f802, 0x0066001a, 0x00661002, 0x0066181a, 0x00663002, 0x0066381a,
0x0066501a, 0x00666012, 0x0066a812, 0x00671012, 0x00680012, 0x0068101a, 0x0069d812, 0x0069f002, 0x0069f81a,
0x006a0832, 0x006a302a, 0x006a502a, 0x006a6802, 0x006a7008, 0x006ab802, 0x006b1012, 0x006c0802, 0x006c101a,
0x006e5002, 0x006e7802, 0x006e801a, 0x006e9022, 0x006eb002, 0x006ec06a, 0x006ef802, 0x006f901a, 0x00718802,
0x0071980a, 0x0071a062, 0x00723872, 0x00758802, 0x0075980a, 0x0075a082, 0x00764052, 0x0078c012, 0x0079a802,
0x0079b802, 0x0079c802, 0x0079f01a, 0x007b88d2, 0x007bf80a, 0x007c0042, 0x007c3012, 0x007c68a2, 0x007cca32,
0x007e3002, 0x00816832, 0x0081880a, 0x00819052, 0x0081c812, 0x0081d81a, 0x0081e812, 0x0082b01a, 0x0082c012,
0x0082f022, 0x00838832, 0x00841002, 0x0084200a, 0x00842812, 0x00846802, 0x0084e802, 0x008805f4, 0x008b047c,
0x008d457b, 0x009ae822, 0x00b89022, 0x00b8a80a, 0x00b99012, 0x00b9a00a, 0x00ba9012, 0x00bb9012, 0x00bda012,
0x00bdb00a, 0x00bdb862, 0x00bdf07a, 0x00be3002, 0x00be381a, 0x00be48a2, 0x00bee802, 0x00c05822, 0x00c07001,
0x00c07802, 0x00c42812, 0x00c54802, 0x00c90022, 0x00c9183a, 0x00c93812, 0x00c9482a, 0x00c9801a, 0x00c99002,
0x00c9985a, 0x00c9c822, 0x00d0b812, 0x00d0c81a, 0x00d0d802, 0x00d2a80a, 0x00d2b002, 0x00d2b80a, 0x00d2c062,
0x00d30002, 0x00d31002, 0x00d32872, 0x00d3685a, 0x00d39892, 0x00d3f802, 0x00d581e2, 0x00d80032, 0x00d8200a,
0x00d9a062, 0x00d9d80a, 0x00d9e002, 0x00d9e84a, 0x00da1002, 0x00da181a, 0x00db5882, 0x00dc0012, 0x00dc100a,
0x00dd080a, 0x00dd1032, 0x00dd301a, 0x00dd4012, 0x00dd500a, 0x00dd5822, 0x00df3002, 0x00df380a, 0x00df4012,
0x00df502a, 0x00df6802, 0x00df700a, 0x00df7822, 0x00df901a, 0x00e1207a, 0x00e16072, 0x00e1a01a, 0x00e1b012,
0x00e68022, 0x00e6a0c2, 0x00e7080a, 0x00e71062, 0x00e76802, 0x00e7a002, 0x00e7b80a, 0x00e7c012, 0x00ee03f2,
0x01005801, 0x01006002, 0x0100680d, 0x01007011, 0x01014061, 0x0101e003, 0x01024803, 0x010300f1, 0x01068202,
0x01091003, 0x0109c803, 0x010ca053, 0x010d4813, 0x0118d013, 0x01194003, 0x011c4003, 0x011e7803, 0x011f48a3,
0x011fc023, 0x01261003, 0x012d5013, 0x012db003, 0x012e0003, 0x012fd833, 0x01300053, 0x013038b3, 0x0130a713,
0x01348753, 0x013840a3, 0x0138a003, 0x0138b003, 0x0138e803, 0x01390803, 0x01394003, 0x01399813, 0x013a2003,
0x013a3803, 0x013a6003, 0x013a7003, 0x013a9823, 0x013ab803, 0x013b1843, 0x013ca823, 0x013d0803, 0x013d8003,
0x013df803, 0x0149a013, 0x01582823, 0x0158d813, 0x015a8003, 0x015aa803, 0x01677822, 0x016bf802, 0x016f01f2,
0x01815052, 0x01818003, 0x0181e803, 0x0184c812, 0x0194b803, 0x0194c803, 0x05337832, 0x0533a092, 0x0534f012,
0x05378012, 0x05401002, 0x05403002, 0x05405802, 0x0541181a, 0x05412812, 0x0541380a, 0x05416002, 0x0544001a,
0x0545a0fa, 0x05462012, 0x05470112, 0x0547f802, 0x05493072, 0x054a38a2, 0x054a901a, 0x054b01c4, 0x054c0022,
0x054c180a, 0x054d9802, 0x054da01a, 0x054db032, 0x054dd01a, 0x054de012, 0x054df02a, 0x054f2802, 0x05514852,
0x0551781a, 0x05518812, 0x0551981a, 0x0551a812, 0x05521802, 0x05526002, 0x0552680a, 0x0553e002, 0x05558002,
0x05559022, 0x0555b812, 0x0555f012, 0x05560802, 0x0557580a, 0x05576012, 0x0557701a, 0x0557a80a, 0x0557b002,
0x055f181a, 0x055f2802, 0x055f301a, 0x055f4002, 0x055f481a, 0x055f600a, 0x055f6802, 0x05600006, 0x056009a7,
0x0560e006, 0x0560e9a7, 0x0561c006, 0x0561c9a7, 0x0562a006, 0x0562a9a7, 0x05638006, 0x056389a7, 0x05646006,
0x056469a7, 0x05654006, 0x056549a7, 0x05662006, 0x056629a7, 0x05670006, 0x056709a7, 0x0567e006, 0x0567e9a7,
0x0568c006, 0x0568c9a7, 0x0569a006, 0x0569a9a7, 0x056a8006, 0x056a89a7, 0x056b6006, 0x056b69a7, 0x056c4006,
0x056c49a7, 0x056d2006, 0x056d29a7, 0x056e0006, 0x056e09a7, 0x056ee006, 0x056ee9a7, 0x056fc006, 0x056fc9a7,
0x0570a006, 0x0570a9a7, 0x05718006, 0x057189a7, 0x05726006, 0x057269a7, 0x05734006, 0x057349a7, 0x05742006,
0x057429a7, 0x05750006, 0x057509a7, 0x0575e006, 0x0575e9a7, 0x0576c006, 0x0576c9a7, 0x0577a006, 0x0577a9a7,
0x05788006, 0x057889a7, 0x05796006, 0x057969a7, 0x057a4006, 0x057a49a7, 0x057b2006, 0x057b29a7, 0x057c0006,
0x057c09a7, 0x057ce006, 0x057ce9a7, 0x057dc006, 0x057dc9a7, 0x057ea006, 0x057ea9a7, 0x057f8006, 0x057f89a7,
0x05806006, 0x058069a7, 0x05814006, 0x058149a7, 0x05822006, 0x058229a7, 0x05830006, 0x058309a7, 0x0583e006,
0x0583e9a7, 0x0584c006, 0x0584c9a7, 0x0585a006, 0x0585a9a7, 0x05868006, 0x058689a7, 0x05876006, 0x058769a7,
0x05884006, 0x058849a7, 0x05892006, 0x058929a7, 0x058a0006, 0x058a09a7, 0x058ae006, 0x058ae9a7, 0x058bc006,
0x058bc9a7, 0x058ca006, 0x058ca9a7, 0x058d8006, 0x058d89a7, 0x058e6006, 0x058e69a7, 0x058f4006, 0x058f49a7,
0x05902006, 0x059029a7, 0x05910006, 0x059109a7, 0x0591e006, 0x0591e9a7, 0x0592c006, 0x0592c9a7, 0x0593a006,
0x0593a9a7, 0x05948006, 0x059489a7, 0x05956006, 0x059569a7, 0x05964006, 0x059649a7, 0x05972006, 0x059729a7,
0x05980006, 0x059809a7, 0x0598e006, 0x0598e9a7, 0x0599c006, 0x0599c9a7, 0x059aa006, 0x059aa9a7, 0x059b8006,
0x059b89a7, 0x059c6006, 0x059c69a7, 0x059d4006, 0x059d49a7, 0x059e2006, 0x059e29a7, 0x059f0006, 0x059f09a7,
0x059fe006, 0x059fe9a7, 0x05a0c006, 0x05a0c9a7, 0x05a1a006, 0x05a1a9a7, 0x05a28006, 0x05a289a7, 0x05a36006,
0x05a369a7, 0x05a44006, 0x05a449a7, 0x05a52006, 0x05a529a7, 0x05a60006, 0x05a609a7, 0x05a6e006, 0x05a6e9a7,
0x05a7c006, 0x05a7c9a7, 0x05a8a006, 0x05a8a9a7, 0x05a98006, 0x05a989a7, 0x05aa6006, 0x05aa69a7, 0x05ab4006,
0x05ab49a7, 0x05ac2006, 0x05ac29a7, 0x05ad0006, 0x05ad09a7, 0x05ade006, 0x05ade9a7, 0x05aec006, 0x05aec9a7,
0x05afa006, 0x05afa9a7, 0x05b08006, 0x05b089a7, 0x05b16006, 0x05b169a7, 0x05b24006, 0x05b249a7, 0x05b32006,
0x05b329a7, 0x05b40006, 0x05b409a7, 0x05b4e006, 0x05b4e9a7, 0x05b5c006, 0x05b5c9a7, 0x05b6a006, 0x05b6a9a7,
0x05b78006, 0x05b789a7, 0x05b86006, 0x05b869a7, 0x05b94006, 0x05b949a7, 0x05ba2006, 0x05ba29a7, 0x05bb0006,
0x05bb09a7, 0x05bbe006, 0x05bbe9a7, 0x05bcc006, 0x05bcc9a7, 0x05bda006, 0x05bda9a7, 0x05be8006, 0x05be89a7,
0x05bf6006, 0x05bf69a7, 0x05c04006, 0x05c049a7, 0x05c12006, 0x05c129a7, 0x05c20006, 0x05c209a7, 0x05c2e006,
0x05c2e9a7, 0x05c3c006, 0x05c3c9a7, 0x05c4a006, 0x05c4a9a7, 0x05c58006, 0x05c589a7, 0x05c66006, 0x05c669a7,
0x05c74006, 0x05c749a7, 0x05c82006, 0x05c829a7, 0x05c90006, 0x05c909a7, 0x05c9e006, 0x05c9e9a7, 0x05cac006,
0x05cac9a7, 0x05cba006, 0x05cba9a7, 0x05cc8006, 0x05cc89a7, 0x05cd6006, 0x05cd69a7, 0x05ce4006, 0x05ce49a7,
0x05cf2006, 0x05cf29a7, 0x05d00006, 0x05d009a7, 0x05d0e006, 0x05d0e9a7, 0x05d1c006, 0x05d1c9a7, 0x05d2a006,
0x05d2a9a7, 0x05d38006, 0x05d389a7, 0x05d46006, 0x05d469a7, 0x05d54006, 0x05d549a7, 0x05d62006, 0x05d629a7,
0x05d70006, 0x05d709a7, 0x05d7e006, 0x05d7e9a7, 0x05d8c006, 0x05d8c9a7, 0x05d9a006, 0x05d9a9a7, 0x05da8006,
0x05da89a7, 0x05db6006, 0x05db69a7, 0x05dc4006, 0x05dc49a7, 0x05dd2006, 0x05dd29a7, 0x05de0006, 0x05de09a7,
0x05dee006, 0x05dee9a7, 0x05dfc006, 0x05dfc9a7, 0x05e0a006, 0x05e0a9a7, 0x05e18006, 0x05e189a7, 0x05e26006,
0x05e269a7, 0x05e34006, 0x05e349a7, 0x05e42006, 0x05e429a7, 0x05e50006, 0x05e509a7, 0x05e5e006, 0x05e5e9a7,
0x05e6c006, 0x05e6c9a7, 0x05e7a006, 0x05e7a9a7, 0x05e88006, 0x05e889a7, 0x05e96006, 0x05e969a7, 0x05ea4006,
0x05ea49a7, 0x05eb2006, 0x05eb29a7, 0x05ec0006, 0x05ec09a7, 0x05ece006, 0x05ece9a7, 0x05edc006, 0x05edc9a7,
0x05eea006, 0x05eea9a7, 0x05ef8006, 0x05ef89a7, 0x05f06006, 0x05f069a7, 0x05f14006, 0x05f149a7, 0x05f22006,
0x05f229a7, 0x05f30006, 0x05f309a7, 0x05f3e006, 0x05f3e9a7, 0x05f4c006, 0x05f4c9a7, 0x05f5a006, 0x05f5a9a7,
0x05f68006, 0x05f689a7, 0x05f76006, 0x05f769a7, 0x05f84006, 0x05f849a7, 0x05f92006, 0x05f929a7, 0x05fa0006,
0x05fa09a7, 0x05fae006, 0x05fae9a7, 0x05fbc006, 0x05fbc9a7, 0x05fca006, 0x05fca9a7, 0x05fd8006, 0x05fd89a7,
0x05fe6006, 0x05fe69a7, 0x05ff4006, 0x05ff49a7, 0x06002006, 0x060029a7, 0x06010006, 0x060109a7, 0x0601e006,
0x0601e9a7, 0x0602c006, 0x0602c9a7, 0x0603a006, 0x0603a9a7, 0x06048006, 0x060489a7, 0x06056006, 0x060569a7,
0x06064006, 0x060649a7, 0x06072006, 0x060729a7, 0x06080006, 0x060809a7, 0x0608e006, 0x0608e9a7, 0x0609c006,
0x0609c9a7, 0x060aa006, 0x060aa9a7, 0x060b8006, 0x060b89a7, 0x060c6006, 0x060c69a7, 0x060d4006, 0x060d49a7,
0x060e2006, 0x060e29a7, 0x060f0006, 0x060f09a7, 0x060fe006, 0x060fe9a7, 0x0610c006, 0x0610c9a7, 0x0611a006,
0x0611a9a7, 0x06128006, 0x061289a7, 0x06136006, 0x061369a7, 0x06144006, 0x061449a7, 0x06152006, 0x061529a7,
0x06160006, 0x061609a7, 0x0616e006, 0x0616e9a7, 0x0617c006, 0x0617c9a7, 0x0618a006, 0x0618a9a7, 0x06198006,
0x061989a7, 0x061a6006, 0x061a69a7, 0x061b4006, 0x061b49a7, 0x061c2006, 0x061c29a7, 0x061d0006, 0x061d09a7,
0x061de006, 0x061de9a7, 0x061ec006, 0x061ec9a7, 0x061fa006, 0x061fa9a7, 0x06208006, 0x062089a7, 0x06216006,
0x062169a7, 0x06224006, 0x062249a7, 0x06232006, 0x062329a7, 0x06240006, 0x062409a7, 0x0624e006, 0x0624e9a7,
0x0625c006, 0x0625c9a7, 0x0626a006, 0x0626a9a7, 0x06278006, 0x062789a7, 0x06286006, 0x062869a7, 0x06294006,
0x062949a7, 0x062a2006, 0x062a29a7, 0x062b0006, 0x062b09a7, 0x062be006, 0x062be9a7, 0x062cc006, 0x062cc9a7,
0x062da006, 0x062da9a7, 0x062e8006, 0x062e89a7, 0x062f6006, 0x062f69a7, 0x06304006, 0x063049a7, 0x06312006,
0x063129a7, 0x06320006, 0x063209a7, 0x0632e006, 0x0632e9a7, 0x0633c006, 0x0633c9a7, 0x0634a006, 0x0634a9a7,
0x06358006, 0x063589a7, 0x06366006, 0x063669a7, 0x06374006, 0x063749a7, 0x06382006, 0x063829a7, 0x06390006,
0x063909a7, 0x0639e006, 0x0639e9a7, 0x063ac006, 0x063ac9a7, 0x063ba006, 0x063ba9a7, 0x063c8006, 0x063c89a7,
0x063d6006, 0x063d69a7, 0x063e4006, 0x063e49a7, 0x063f2006, 0x063f29a7, 0x06400006, 0x064009a7, 0x0640e006,
0x0640e9a7, 0x0641c006, 0x0641c9a7, 0x0642a006, 0x0642a9a7, 0x06438006, 0x064389a7, 0x06446006, 0x064469a7,
0x06454006, 0x064549a7, 0x06462006, 0x064629a7, 0x06470006, 0x064709a7, 0x0647e006, 0x0647e9a7, 0x0648c006,
0x0648c9a7, 0x0649a006, 0x0649a9a7, 0x064a8006, 0x064a89a7, 0x064b6006, 0x064b69a7, 0x064c4006, 0x064c49a7,
0x064d2006, 0x064d29a7, 0x064e0006, 0x064e09a7, 0x064ee006, 0x064ee9a7, 0x064fc006, 0x064fc9a7, 0x0650a006,
0x0650a9a7, 0x06518006, 0x065189a7, 0x06526006, 0x065269a7, 0x06534006, 0x065349a7, 0x06542006, 0x065429a7,
0x06550006, 0x065509a7, 0x0655e006, 0x0655e9a7, 0x0656c006, 0x0656c9a7, 0x0657a006, 0x0657a9a7, 0x06588006,
0x065889a7, 0x06596006, 0x065969a7, 0x065a4006, 0x065a49a7, 0x065b2006, 0x065b29a7, 0x065c0006, 0x065c09a7,
0x065ce006, 0x065ce9a7, 0x065dc006, 0x065dc9a7, 0x065ea006, 0x065ea9a7, 0x065f8006, 0x065f89a7, 0x06606006,
0x066069a7, 0x06614006, 0x066149a7, 0x06622006, 0x066229a7, 0x06630006, 0x066309a7, 0x0663e006, 0x0663e9a7,
0x0664c006, 0x0664c9a7, 0x0665a006, 0x0665a9a7, 0x06668006, 0x066689a7, 0x06676006, 0x066769a7, 0x06684006,
0x066849a7, 0x06692006, 0x066929a7, 0x066a0006, 0x066a09a7, 0x066ae006, 0x066ae9a7, 0x066bc006, 0x066bc9a7,
0x066ca006, 0x066ca9a7, 0x066d8006, 0x066d89a7, 0x066e6006, 0x066e69a7, 0x066f4006, 0x066f49a7, 0x06702006,
0x067029a7, 0x06710006, 0x067109a7, 0x0671e006, 0x0671e9a7, 0x0672c006, 0x0672c9a7, 0x0673a006, 0x0673a9a7,
0x06748006, 0x067489a7, 0x06756006, 0x067569a7, 0x06764006, 0x067649a7, 0x06772006, 0x067729a7, 0x06780006,
0x067809a7, 0x0678e006, 0x0678e9a7, 0x0679c006, 0x0679c9a7, 0x067aa006, 0x067aa9a7, 0x067b8006, 0x067b89a7,
0x067c6006, 0x067c69a7, 0x067d4006, 0x067d49a7, 0x067e2006, 0x067e29a7, 0x067f0006, 0x067f09a7, 0x067fe006,
0x067fe9a7, 0x0680c006, 0x0680c9a7, 0x0681a006, 0x0681a9a7, 0x06828006, 0x068289a7, 0x06836006, 0x068369a7,
0x06844006, 0x068449a7, 0x06852006, 0x068529a7, 0x06860006, 0x068609a7, 0x0686e006, 0x0686e9a7, 0x0687c006,
0x0687c9a7, 0x0688a006, 0x0688a9a7, 0x06898006, 0x068989a7, 0x068a6006, 0x068a69a7, 0x068b4006, 0x068b49a7,
0x068c2006, 0x068c29a7, 0x068d0006, 0x068d09a7, 0x068de006, 0x068de9a7, 0x068ec006, 0x068ec9a7, 0x068fa006,
0x068fa9a7, 0x06908006, 0x069089a7, 0x06916006, 0x069169a7, 0x06924006, 0x069249a7, 0x06932006, 0x069329a7,
0x06940006, 0x069409a7, 0x0694e006, 0x0694e9a7, 0x0695c006, 0x0695c9a7, 0x0696a006, 0x0696a9a7, 0x06978006,
0x069789a7, 0x06986006, 0x069869a7, 0x06994006, 0x069949a7, 0x069a2006, 0x069a29a7, 0x069b0006, 0x069b09a7,
0x069be006, 0x069be9a7, 0x069cc006, 0x069cc9a7, 0x069da006, 0x069da9a7, 0x069e8006, 0x069e89a7, 0x069f6006,
0x069f69a7, 0x06a04006, 0x06a049a7, 0x06a12006, 0x06a129a7, 0x06a20006, 0x06a209a7, 0x06a2e006, 0x06a2e9a7,
0x06a3c006, 0x06a3c9a7, 0x06a4a006, 0x06a4a9a7, 0x06a58006, 0x06a589a7, 0x06a66006, 0x06a669a7, 0x06a74006,
0x06a749a7, 0x06a82006, 0x06a829a7, 0x06a90006, 0x06a909a7, 0x06a9e006, 0x06a9e9a7, 0x06aac006, 0x06aac9a7,
0x06aba006, 0x06aba9a7, 0x06ac8006, 0x06ac89a7, 0x06ad6006, 0x06ad69a7, 0x06ae4006, 0x06ae49a7, 0x06af2006,
0x06af29a7, 0x06b00006, 0x06b009a7, 0x06b0e006, 0x06b0e9a7, 0x06b1c006, 0x06b1c9a7, 0x06b2a006, 0x06b2a9a7,
0x06b38006, 0x06b389a7, 0x06b46006, 0x06b469a7, 0x06b54006, 0x06b549a7, 0x06b62006, 0x06b629a7, 0x06b70006,
0x06b709a7, 0x06b7e006, 0x06b7e9a7, 0x06b8c006, 0x06b8c9a7, 0x06b9a006, 0x06b9a9a7, 0x06ba8006, 0x06ba89a7,
0x06bb6006, 0x06bb69a7, 0x06bc4006, 0x06bc49a7, 0x06bd816c, 0x06be5b0b, 0x07d8f002, 0x07f000f2, 0x07f100f2,
0x07f7f801, 0x07fcf012, 0x07ff80b1, 0x080fe802, 0x08170002, 0x081bb042, 0x08500822, 0x08502812, 0x08506032,
0x0851c022, 0x0851f802, 0x08572812, 0x08692032, 0x08755812, 0x087a30a2, 0x087c1032, 0x0880000a, 0x08800802,
0x0880100a, 0x0881c0e2, 0x08838002, 0x08839812, 0x0883f822, 0x0884100a, 0x0885802a, 0x08859832, 0x0885b81a,
0x0885c812, 0x0885e808, 0x08861002, 0x08866808, 0x08880022, 0x08893842, 0x0889600a, 0x08896872, 0x088a281a,
0x088b9802, 0x088c0012, 0x088c100a, 0x088d982a, 0x088db082, 0x088df81a, 0x088e1018, 0x088e4832, 0x088e700a,
0x088e7802, 0x0891602a, 0x08917822, 0x0891901a, 0x0891a002, 0x0891a80a, 0x0891b012, 0x0891f002, 0x0896f802,
0x0897002a, 0x08971872, 0x08980012, 0x0898101a, 0x0899d812, 0x0899f002, 0x0899f80a, 0x089a0002, 0x089a083a,
0x089a381a, 0x089a582a, 0x089ab802, 0x089b101a, 0x089b3062, 0x089b8042, 0x08a1a82a, 0x08a1c072, 0x08a2001a,
0x08a21022, 0x08a2280a, 0x08a23002, 0x08a2f002, 0x08a58002, 0x08a5881a, 0x08a59852, 0x08a5c80a, 0x08a5d002,
0x08a5d81a, 0x08a5e802, 0x08a5f00a, 0x08a5f812, 0x08a6080a, 0x08a61012, 0x08ad7802, 0x08ad801a, 0x08ad9032,
0x08adc03a, 0x08ade012, 0x08adf00a, 0x08adf812, 0x08aee012, 0x08b1802a, 0x08b19872, 0x08b1d81a, 0x08b1e802,
0x08b1f00a, 0x08b1f812, 0x08b55802, 0x08b5600a, 0x08b56802, 0x08b5701a, 0x08b58052, 0x08b5b00a, 0x08b5b802,
0x08b8e822, 0x08b91032, 0x08b9300a, 0x08b93842, 0x08c1602a, 0x08c17882, 0x08c1c00a, 0x08c1c812, 0x08c98002,
0x08c9884a, 0x08c9b81a, 0x08c9d812, 0x08c9e80a, 0x08c9f002, 0x08c9f808, 0x08ca000a, 0x08ca0808, 0x08ca100a,
0x08ca1802, 0x08ce882a, 0x08cea032, 0x08ced012, 0x08cee03a, 0x08cf0002, 0x08cf200a, 0x08d00892, 0x08d19852,
0x08d1c80a, 0x08d1d008, 0x08d1d832, 0x08d23802, 0x08d28852, 0x08d2b81a, 0x08d2c822, 0x08d42058, 0x08d450c2,
0x08d4b80a, 0x08d4c012, 0x08e1780a, 0x08e18062, 0x08e1c052, 0x08e1f00a, 0x08e1f802, 0x08e49152, 0x08e5480a,
0x08e55062, 0x08e5880a, 0x08e59012, 0x08e5a00a, 0x08e5a812, 0x08e98852, 0x08e9d002, 0x08e9e012, 0x08e9f862,
0x08ea3008, 0x08ea3802, 0x08ec504a, 0x08ec8012, 0x08ec981a, 0x08eca802, 0x08ecb00a, 0x08ecb802, 0x08f79812,
0x08f7a81a, 0x09a18081, 0x0b578042, 0x0b598062, 0x0b7a7802, 0x0b7a8b6a, 0x0b7c7832, 0x0b7f2002, 0x0b7f801a,
0x0de4e812, 0x0de50031, 0x0e7802d2, 0x0e798162, 0x0e8b2802, 0x0e8b300a, 0x0e8b3822, 0x0e8b680a, 0x0e8b7042,
0x0e8b9871, 0x0e8bd872, 0x0e8c2862, 0x0e8d5032, 0x0e921022, 0x0ed00362, 0x0ed1db12, 0x0ed3a802, 0x0ed42002,
0x0ed4d842, 0x0ed508e2, 0x0f000062, 0x0f004102, 0x0f00d862, 0x0f011812, 0x0f013042, 0x0f098062, 0x0f157002,
0x0f176032, 0x0f468062, 0x0f4a2062, 0x0f8007f3, 0x0f8407f3, 0x0f886823, 0x0f897803, 0x0f8b6053, 0x0f8bf013,
0x0f8c7003, 0x0f8c8893, 0x0f8d6b83, 0x0f8f3199, 0x0f9008e3, 0x0f90d003, 0x0f917803, 0x0f919083, 0x0f91e033,
0x0f924ff3, 0x0f964ff3, 0x0f9a4ff3, 0x0f9e4b13, 0x0f9fd842, 0x0fa007f3, 0x0fa407f3, 0x0fa803d3, 0x0faa37f3,
0x0fae37f3, 0x0fb23093, 0x0fb407f3, 0x0fbba0b3, 0x0fbeaaa3, 0x0fc06033, 0x0fc24073, 0x0fc2d053, 0x0fc44073,
0x0fc57513, 0x0fc862e3, 0x0fc9e093, 0x0fca3ff3, 0x0fce3ff3, 0x0fd23ff3, 0x0fd63b83, 0x0fe007f3, 0x0fe407f3,
0x0fe807f3, 0x0fec07f3, 0x0ff007f3, 0x0ff407f3, 0x0ff807f3, 0x0ffc07d3, 0x700001f1, 0x700105f2, 0x700407f1,
0x700807f2, 0x700c06f2, 0x700f87f1, 0x701387f1, 0x701787f1, 0x701b87f1, 0x701f87f1, 0x702387f1, 0x702787f1,
0x702b87f1, 0x702f87f1, 0x703387f1, 0x703787f1, 0x703b87f1, 0x703f87f1, 0x704387f1, 0x704787f1, 0x704b87f1,
0x704f87f1, 0x705387f1, 0x705787f1, 0x705b87f1, 0x705f87f1, 0x706387f1, 0x706787f1, 0x706b87f1, 0x706f87f1,
0x707387f1, 0x707787f1, 0x707b87f1, 0x707f80f1};
/// Returns the extended grapheme cluster boundary property of a code point.
///
/// \returns The property of the range in \c __entries that contains
///          \a __code_point, or \c __property::__none when no range does.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {
  // TODO FMT use std::ranges::upper_bound.

  // Every entry stores the lower bound of its range in bits [11, 31] and the
  // size and property in the bits below. The code point can be anywhere in a
  // range, so the search looks for the first entry past the code point and
  // then steps back one entry to reach the candidate range.
  //
  // Searching for just the shifted code point doesn't work. Suppose an entry
  // has property 0, size 1, and lower bound 3, which is stored as 0x1810.
  // Searching for code point 3 as 0x1800 finds 0x1810 as the upper bound and
  // stepping back skips the entry that should match. Setting all size and
  // property bits in the search key makes the key compare greater than or
  // equal to every entry with the same lower bound, so the upper bound is
  // the entry after 0x1810 and stepping back arrives at the correct entry.
  const auto __key      = (__code_point << 11) | 0x7ffu;
  const uint32_t* __pos = std::upper_bound(__entries, std::end(__entries), __key);

  // Every range starts after the code point.
  if (__pos == __entries)
    return __property::__none;

  // The candidate range starts at or before the code point; verify the code
  // point doesn't lie beyond the range's last element.
  const uint32_t __entry         = *--__pos;
  const uint32_t __last_in_range = (__entry >> 11) + ((__entry >> 4) & 0x7f);
  return __code_point <= __last_in_range ? static_cast<__property>(__entry & 0xf) : __property::__none;
}
} // namespace __extended_grapheme_custer_property_boundary
#endif //_LIBCPP_STD_VER > 17
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H

View File

@ -343,7 +343,7 @@ __format_bool(bool __value, auto& __ctx, __format_spec::__parsed_specifications<
if (__specs.__std_.__locale_specific_form_) {
const auto& __np = use_facet<numpunct<_CharT>>(__ctx.locale());
basic_string<_CharT> __str = __value ? __np.truename() : __np.falsename();
return __formatter::__write_unicode_no_precision(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
return __formatter::__write_string_no_precision(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
}
# endif
basic_string_view<_CharT> __str =

View File

@ -17,6 +17,7 @@
#include <__config>
#include <__format/formatter.h>
#include <__format/parser_std_format_spec.h>
#include <__format/unicode.h>
#include <__utility/move.h>
#include <__utility/unreachable.h>
#include <cstddef>
@ -59,8 +60,8 @@ struct _LIBCPP_TYPE_VIS __padding_size_result {
_LIBCPP_HIDE_FROM_ABI constexpr __padding_size_result
__padding_size(size_t __size, size_t __width, __format_spec::__alignment __align) {
_LIBCPP_ASSERT(__width > __size, "don't call this function when no padding is required");
_LIBCPP_ASSERT(__align != __format_spec::__alignment::__zero_padding,
"the caller should have handled the zero-padding");
_LIBCPP_ASSERT(
__align != __format_spec::__alignment::__zero_padding, "the caller should have handled the zero-padding");
size_t __fill = __width - __size;
switch (__align) {
@ -75,7 +76,7 @@ __padding_size(size_t __size, size_t __width, __format_spec::__alignment __align
// __before = floor(__fill, 2);
// __after = ceil(__fill, 2);
size_t __before = __fill / 2;
size_t __after = __fill - __before;
size_t __after = __fill - __before;
return {__before, __after};
}
case __format_spec::__alignment::__default:
@ -173,10 +174,12 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __write_using_decimal_separators(_OutIt __out_it, c
/// conversion, which means the [\a __first, \a __last) always contains elements
/// of the type \c char.
template <class _CharT, class _ParserCharT>
_LIBCPP_HIDE_FROM_ABI auto __write(const _CharT* __first, const _CharT* __last,
output_iterator<const _CharT&> auto __out_it,
__format_spec::__parsed_specifications<_ParserCharT> __specs, ptrdiff_t __size)
-> decltype(__out_it) {
_LIBCPP_HIDE_FROM_ABI auto __write(
const _CharT* __first,
const _CharT* __last,
output_iterator<const _CharT&> auto __out_it,
__format_spec::__parsed_specifications<_ParserCharT> __specs,
ptrdiff_t __size) -> decltype(__out_it) {
_LIBCPP_ASSERT(__first <= __last, "Not a valid range");
if (__size >= __specs.__width_)
@ -189,6 +192,7 @@ _LIBCPP_HIDE_FROM_ABI auto __write(const _CharT* __first, const _CharT* __last,
}
/// \overload
///
/// Calls the function above where \a __size = \a __last - \a __first.
template <class _CharT, class _ParserCharT>
_LIBCPP_HIDE_FROM_ABI auto __write(const _CharT* __first, const _CharT* __last,
@ -243,77 +247,56 @@ _LIBCPP_HIDE_FROM_ABI auto __write_using_trailing_zeros(
return _VSTD::fill_n(_VSTD::move(__out_it), __padding.__after_, __specs.__fill_);
}
# ifndef _LIBCPP_HAS_NO_UNICODE
/// Writes a string using format's width estimation algorithm.
///
/// \pre !__specs.__has_precision()
///
/// \note When \c _LIBCPP_HAS_NO_UNICODE is defined the function assumes the
/// input is ASCII.
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI auto __write_unicode_no_precision(basic_string_view<_CharT> __str,
output_iterator<const _CharT&> auto __out_it,
__format_spec::__parsed_specifications<_CharT> __specs)
-> decltype(__out_it) {
_LIBCPP_HIDE_FROM_ABI auto __write_string_no_precision(
basic_string_view<_CharT> __str,
output_iterator<const _CharT&> auto __out_it,
__format_spec::__parsed_specifications<_CharT> __specs) -> decltype(__out_it) {
_LIBCPP_ASSERT(!__specs.__has_precision(), "use __write_string");
_LIBCPP_ASSERT(!__specs.__has_precision(), "use __write_unicode");
// No padding -> copy the string
if (!__specs.__has_width())
return _VSTD::copy(__str.begin(), __str.end(), _VSTD::move(__out_it));
// Non Unicode part larger than width -> copy the string
auto __last = __format_spec::__detail::__estimate_column_width_fast(__str.begin(), __str.end());
ptrdiff_t __size = __last - __str.begin();
if (__size >= __specs.__width_)
return _VSTD::copy(__str.begin(), __str.end(), _VSTD::move(__out_it));
// Is there a non Unicode part?
if (__last != __str.end()) {
// Non Unicode and Unicode part larger than width -> copy the string
__format_spec::__detail::__column_width_result __column_width =
__format_spec::__detail::__estimate_column_width(__last, __str.end(), __specs.__width_);
__size += __column_width.__width; // Note this new size is used when __size < __specs.__width_
if (__size >= __specs.__width_)
return _VSTD::copy(__str.begin(), __str.end(), _VSTD::move(__out_it));
}
// Note when the estimated width is larger than size there's no padding. So
// there's no reason to get the real size when the estimate is larger than or
// equal to the minimum field width.
size_t __size =
__format_spec::__estimate_column_width(__str, __specs.__width_, __format_spec::__column_width_rounding::__up)
.__width_;
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __size);
}
# endif
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI auto __write_unicode(basic_string_view<_CharT> __str,
output_iterator<const _CharT&> auto __out_it,
__format_spec::__parsed_specifications<_CharT> __specs)
-> decltype(__out_it) {
# ifndef _LIBCPP_HAS_NO_UNICODE
_LIBCPP_HIDE_FROM_ABI int __truncate(basic_string_view<_CharT>& __str, int __precision_) {
__format_spec::__column_width_result<_CharT> __result =
__format_spec::__estimate_column_width(__str, __precision_, __format_spec::__column_width_rounding::__down);
__str = basic_string_view<_CharT>{__str.begin(), __result.__last_};
return __result.__width_;
}
/// Writes a string using format's width estimation algorithm.
///
/// \note When \c _LIBCPP_HAS_NO_UNICODE is defined the function assumes the
/// input is ASCII.
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI auto __write_string(
basic_string_view<_CharT> __str,
output_iterator<const _CharT&> auto __out_it,
__format_spec::__parsed_specifications<_CharT> __specs) -> decltype(__out_it) {
if (!__specs.__has_precision())
return __formatter::__write_unicode_no_precision(__str, _VSTD::move(__out_it), __specs);
return __formatter::__write_string_no_precision(__str, _VSTD::move(__out_it), __specs);
// Non unicode part larger than precision -> truncate the output and use the normal write operation.
auto __last = __format_spec::__detail::__estimate_column_width_fast(__str.begin(), __str.end());
ptrdiff_t __size = __last - __str.begin();
if (__size >= __specs.__precision_)
return __formatter::__write(__str.begin(), __str.begin() + __specs.__precision_, _VSTD::move(__out_it), __specs,
__specs.__precision_);
int __size = __formatter::__truncate(__str, __specs.__precision_);
// No non Unicode part, implies __size < __specs.__precision_ -> use normal write operation
if (__last == __str.end())
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __str.size());
__format_spec::__detail::__column_width_result __column_width =
__format_spec::__detail::__estimate_column_width(__last, __str.end(), __specs.__precision_ - __size);
__size += __column_width.__width;
// Truncate the output
if (__column_width.__ptr != __str.end())
__str.remove_suffix(__str.end() - __column_width.__ptr);
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __size);
# else
if (__specs.__has_precision()) {
ptrdiff_t __size = __str.size();
if (__size > __specs.__precision_)
return __formatter::__write(__str.begin(), __str.begin() + __specs.__precision_, _VSTD::move(__out_it), __specs,
__specs.__precision_);
}
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __str.size());
# endif
return __write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __size);
}
} // namespace __formatter

View File

@ -40,7 +40,7 @@ public:
}
_LIBCPP_HIDE_FROM_ABI auto format(basic_string_view<_CharT> __str, auto& __ctx) const -> decltype(__ctx.out()) {
return __formatter::__write_unicode(__str, __ctx.out(), __parser_.__get_parsed_std_specifications(__ctx));
return __formatter::__write_string(__str, __ctx.out(), __parser_.__get_parsed_std_specifications(__ctx));
}
__format_spec::__parser<_CharT> __parser_;
@ -69,7 +69,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<const _CharT*,
// TODO FMT Implement these improvements.
__format_spec::__parsed_specifications<_CharT> __specs = _Base::__parser_.__get_parsed_std_specifications(__ctx);
if (__specs.__has_width() || __specs.__has_precision())
return __formatter::__write_unicode(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
return __formatter::__write_string(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
// No formatting required, copy the string to the output.
auto __out_it = __ctx.out();

View File

@ -25,10 +25,12 @@
#include <__format/format_error.h>
#include <__format/format_parse_context.h>
#include <__format/format_string.h>
#include <__format/unicode.h>
#include <__variant/monostate.h>
#include <bit>
#include <concepts>
#include <cstdint>
#include <string_view>
#include <type_traits>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@ -91,462 +93,6 @@ __substitute_arg_id(basic_format_arg<_Context> __format_arg) {
__format_arg);
}
/**
 * Helper struct returned from @ref __get_string_alignment.
 *
 * It bundles the end of the text to write, the estimated number of columns
 * of that text, and whether the output needs to be aligned.
 */
template <class _CharT>
struct _LIBCPP_TEMPLATE_VIS __string_alignment {
/** Points beyond the last character to write to the output. */
const _CharT* __last;
/**
 * The estimated number of columns in the output or 0.
 *
 * Only when the output needs to be aligned it's required to know the exact
 * number of columns in the output. So if the formatted output has only a
 * minimum width the exact size isn't important. It's only important to know
 * the minimum has been reached. The minimum width is the width specified in
 * the format-spec.
 *
 * For example in this code @code std::format("{:10}", MyString); @endcode
 * the width estimation can stop once the algorithm has determined the output
 * width is 10 columns.
 *
 * So if:
 * * @ref __align == @c true the @ref __size is the estimated number of
 * columns required.
 * * @ref __align == @c false the @ref __size is the estimated number of
 * columns required or 0 when the estimation algorithm stopped prematurely.
 */
ptrdiff_t __size;
/**
 * Whether the output needs to be aligned.
 *
 * When alignment is needed the output algorithm needs to add the proper
 * padding. Else the output algorithm just needs to copy the input up to
 * @ref __last.
 */
bool __align;
};
#ifndef _LIBCPP_HAS_NO_UNICODE
namespace __detail {
/**
* Unicode column width estimates.
*
* Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32.
* Depending on format the relation between the number of code units stored and
* the number of output columns differs. The first relation is the number of
* code units forming a code point. (The text assumes the code units are
* unsigned.)
* - UTF-8: The number of code units is between one and four. The first 128
* Unicode code points match the ASCII character set. When the highest bit is
* set it means the code point has more than one code unit.
* - UTF-16: The number of code units is between 1 and 2. When the first
* code unit is in the surrogate range [0xd800, 0xdfff] it means the code
* point uses two code units.
* - UTF-32: The number of code units is always one.
*
* The code point to the number of columns isn't well defined. The code uses the
* estimations defined in [format.string.std]/11. This list might change in the
* future.
*
* The algorithm of @ref __get_string_alignment uses two different scanners:
* - The simple scanner @ref __estimate_column_width_fast. This scanner assumes
* 1 code unit is 1 column. This scanner stops when it can't be sure the
* assumption is valid:
* - UTF-8 when the code point is encoded in more than 1 code unit.
* - UTF-16 and UTF-32 when the first multi-column code point is encountered.
* (The code unit's value is lower than 0xd800 so the 2 code unit encoding
* is irrelevant for this scanner.)
* Due to these assumptions the scanner is faster than the full scanner. It
* can process all text only containing ASCII. For UTF-16/32 it can process
* most (all?) European languages. (Note the set it can process might be
* reduced in the future, due to updates in the scanning rules.)
* - The full scanner @ref __estimate_column_width. This scanner, if needed,
* converts multiple code units into one code point then converts the code
* point to a column width.
*
* See also:
* - [format.string.std]/11
* - https://en.wikipedia.org/wiki/UTF-8#Encoding
* - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
*/
/**
 * The first code point with an estimated width of two columns.
 *
 * This is the point where the fast UTF-16/32 scanner needs to stop processing.
 */
inline constexpr uint32_t __two_column_code_point = 0x1100;
/** Helper concept for a UTF-8 character type: @c char or @c char8_t. */
template <class _CharT>
concept __utf8_character = same_as<_CharT, char> || same_as<_CharT, char8_t>;
/**
 * Helper concept for a UTF-16 character type.
 *
 * @c wchar_t only satisfies the concept when @c sizeof(wchar_t) is 2.
 */
template <class _CharT>
concept __utf16_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) || same_as<_CharT, char16_t>;
/**
 * Helper concept for a UTF-32 character type.
 *
 * @c wchar_t only satisfies the concept when @c sizeof(wchar_t) is 4.
 */
template <class _CharT>
concept __utf32_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) || same_as<_CharT, char32_t>;
/** Helper concept for a UTF-16 or UTF-32 character type. */
template <class _CharT>
concept __utf16_or_32_character = __utf16_character<_CharT> || __utf32_character<_CharT>;
/**
 * Estimates the column width of a code point in the Basic Multilingual Plane.
 *
 * The code point is looked up in a list of "wide" ranges; a code point inside
 * one of the ranges is two columns wide, every other code point is one column
 * wide.
 *
 * @pre @a __c is less than 0x1'0000, i.e. it fits in a 3-byte UTF-8 sequence.
 *
 * @param __c The code point to estimate.
 *
 * @returns The estimated width, either 1 or 2.
 */
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_3(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c < 0x10000,
                 "Use __column_width_4 or __column_width for larger values");

  struct __range {
    uint32_t __lower;
    uint32_t __upper;
  };
  // The inclusive two column ranges, sorted in ascending order.
  constexpr __range __wide[] = {
      {0x1100, 0x115f},
      {0x2329, 0x232a},
      {0x2e80, 0x303e},
      {0x3040, 0xa4cf},
      {0xac00, 0xd7a3},
      {0xf900, 0xfaff},
      {0xfe10, 0xfe19},
      {0xfe30, 0xfe6f},
      {0xff00, 0xff60},
      {0xffe0, 0xffe6}};

  for (const __range& __r : __wide) {
    // The ranges are sorted, so a value before the current range is in the
    // gap between two ranges (or before the first range).
    if (__c < __r.__lower)
      return 1;
    if (__c <= __r.__upper)
      return 2;
  }
  return 1;
}
/**
 * @overload
 *
 * Estimates the column width of a code point outside the Basic Multilingual
 * Plane.
 *
 * @pre @a __c is greater than or equal to 0x1'0000, i.e. it requires a 4-byte
 * UTF-8 sequence.
 *
 * @param __c The code point to estimate.
 *
 * @returns The estimated width, either 1 or 2.
 */
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_4(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c >= 0x10000,
                 "Use __column_width_3 or __column_width for smaller values");

  // Walk the ascending list of inclusive two column ranges; a value that
  // falls before the start of a range is one column wide.
  if (__c < 0x1'f300)
    return 1;
  if (__c <= 0x1'f64f)
    return 2;
  if (__c < 0x1'f900)
    return 1;
  if (__c <= 0x1'f9ff)
    return 2;
  if (__c < 0x2'0000)
    return 1;
  if (__c <= 0x2'fffd)
    return 2;
  if (__c < 0x3'0000)
    return 1;
  return __c <= 0x3'fffd ? 2 : 1;
}
/**
 * @overload
 *
 * The general case, accepting all values. Dispatches to @ref __column_width_3
 * for values below 0x1'0000 and to @ref __column_width_4 otherwise.
 *
 * @param __c The code point to estimate.
 *
 * @returns The estimated width, either 1 or 2.
 */
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width(uint32_t __c) noexcept {
  return __c < 0x10000 ? __column_width_3(__c) : __column_width_4(__c);
}
/**
 * Estimate the column width for the UTF-8 sequence using the fast algorithm.
 *
 * Scans the input as long as every code unit is a one code unit code point,
 * that is a code unit whose highest bit isn't set.
 *
 * @param __first Points to the first element of the input range.
 * @param __last  Points beyond the last element of the input range.
 *
 * @returns A pointer to the first code unit with its highest bit set, or
 *          @a __last when there's no such code unit.
 */
template <__utf8_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
__estimate_column_width_fast(const _CharT* __first,
                             const _CharT* __last) noexcept {
  for (; __first != __last; ++__first) {
    // A set high bit marks a code unit of a multi code unit code point.
    if (static_cast<unsigned char>(*__first) & 0x80)
      break;
  }
  return __first;
}
/**
 * @overload
 *
 * The implementation for UTF-16/32.
 *
 * Scans the input as long as every code unit is below
 * @ref __two_column_code_point. Such a code unit is a complete code point
 * with an estimated width of one column.
 *
 * @param __first Points to the first element of the input range.
 * @param __last  Points beyond the last element of the input range.
 *
 * @returns A pointer to the first code unit whose value is at least
 *          @ref __two_column_code_point, or @a __last when there's no such
 *          code unit.
 */
template <__utf16_or_32_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
__estimate_column_width_fast(const _CharT* __first,
                             const _CharT* __last) noexcept {
  // Use the named constant instead of repeating its value; the constant
  // documents why the scanner needs to stop at this point.
  return _VSTD::find_if(__first, __last, [](uint32_t __c) {
    return __c >= __two_column_code_point;
  });
}
/** The result of a column width estimation of a range of code units. */
template <class _CharT>
struct _LIBCPP_TEMPLATE_VIS __column_width_result {
/** The number of output columns. */
size_t __width;
/**
* Points beyond the last code unit that was used in the estimation.
*
* This limits the original output to fit in the wanted number of columns.
*/
const _CharT* __ptr;
};
/**
 * Small helper to determine the width of malformed Unicode.
 *
 * Every remaining code unit is estimated as one output column. The number of
 * code units used is limited so the total width doesn't exceed @a __maximum.
 *
 * @note This function's only needed for UTF-8. During scanning UTF-8 there
 * are multiple places where it can be detected that the Unicode is malformed.
 * UTF-16 only requires 1 test and UTF-32 requires no testing.
 *
 * @param __first   Points to the first malformed code unit.
 * @param __last    Points beyond the last element of the input range.
 * @param __maximum The maximum number of output columns of the entire scan.
 * @param __result  The number of output columns already used before
 *                  @a __first.
 *
 * @returns The total width, including @a __result, and a pointer beyond the
 *          last code unit used.
 */
template <__utf8_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width_malformed(const _CharT* __first, const _CharT* __last,
                                  size_t __maximum, size_t __result) noexcept {
  size_t __size = __last - __first;
  // Only the columns not yet used by __result are available. Clamping against
  // __maximum itself would allow the total width to exceed the maximum.
  size_t __available = __result < __maximum ? __maximum - __result : 0;
  size_t __n = _VSTD::min(__size, __available);
  return {__result + __n, __first + __n};
}
/**
* Determines the number of output columns needed to render the input.
*
* The input is processed one code point at a time. The number of leading one
* bits in the first code unit selects the number of code units in the code
* point. One and two code unit code points are always estimated as one
* column; three and four code unit code points are decoded and their width is
* estimated with @ref __column_width_3 and @ref __column_width_4.
*
* @note When the scanner encounters malformed Unicode it acts as-if every code
* unit at the end of the input is one output column. It's expected the output
* terminal will replace these malformed code units with a one column
* replacement character.
*
* @note NOTE(review): only the number of remaining code units is tested; the
* values of the continuation code units aren't validated -- confirm this is
* intended.
*
* @param __first Points to the first element of the input range.
* @param __last Points beyond the last element of the input range.
* @param __maximum The maximum number of output columns. The returned number
* of estimated output columns will not exceed this value.
*
* @returns The estimated number of columns and a pointer beyond the last code
* unit used in the estimation.
*/
template <__utf8_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width(const _CharT* __first, const _CharT* __last,
size_t __maximum) noexcept {
size_t __result = 0;
while (__first != __last) {
// Based on the number of leading 1 bits the number of code units in the
// code point can be determined. See
// https://en.wikipedia.org/wiki/UTF-8#Encoding
switch (_VSTD::countl_one(static_cast<unsigned char>(*__first))) {
case 0: // 1-code unit encoding: all 1 column
++__result;
++__first;
break;
case 2: // 2-code unit encoding: all 1 column
// Malformed Unicode.
if (__last - __first < 2) [[unlikely]]
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
__first += 2;
++__result;
break;
case 3: // 3-code unit encoding: either 1 or 2 columns
// Malformed Unicode.
if (__last - __first < 3) [[unlikely]]
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
{
// The first code unit holds 4 bits of the code point, every following
// code unit holds 6 bits.
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x0f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__result += __column_width_3(__c);
// The code point doesn't fit; report the width and position before it.
// (Subtracting 2 assumes the code point was two columns wide.)
if (__result > __maximum)
return {__result - 2, __first - 3};
}
break;
case 4: // 4-code unit encoding: either 1 or 2 columns
// Malformed Unicode.
if (__last - __first < 4) [[unlikely]]
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
{
// The first code unit holds 3 bits of the code point, every following
// code unit holds 6 bits.
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x07;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__c <<= 6;
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
__result += __column_width_4(__c);
// The code point doesn't fit; report the width and position before it.
// (Subtracting 2 assumes the code point was two columns wide.)
if (__result > __maximum)
return {__result - 2, __first - 4};
}
break;
default:
// Malformed Unicode.
// One leading 1 bit or more than four leading 1 bits.
return __estimate_column_width_malformed(__first, __last, __maximum,
__result);
}
// The wanted number of columns has been reached, no need to look further.
if (__result >= __maximum)
return {__result, __first};
}
return {__result, __first};
}
/**
 * @overload
 *
 * The implementation for UTF-16.
 *
 * A high surrogate followed by a low surrogate is decoded to one code point.
 * All other code units are a code point on their own.
 *
 * @note Malformed Unicode, an unpaired surrogate, is estimated as one output
 * column per code unit. Decoding an unpaired surrogate as a pair would yield
 * a bogus code point, which may be smaller than 0x1'0000 and thus violates
 * the precondition of @ref __column_width_4.
 *
 * @param __first   Points to the first element of the input range.
 * @param __last    Points beyond the last element of the input range.
 * @param __maximum The maximum number of output columns. The returned number
 *                  of estimated output columns will not exceed this value.
 *
 * @returns The estimated number of columns and a pointer beyond the last code
 *          unit used in the estimation.
 */
template <__utf16_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width(const _CharT* __first, const _CharT* __last,
                        size_t __maximum) noexcept {
  size_t __result = 0;
  while (__first != __last) {
    uint32_t __c = *__first;
    // Is the code unit part of a surrogate pair? See
    // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
    if (__c >= 0xd800 && __c <= 0xdfff) {
      // A well-formed pair is a high surrogate [0xd800, 0xdbff] directly
      // followed by a low surrogate [0xdc00, 0xdfff].
      bool __is_pair = __c <= 0xdbff && __last - __first >= 2;
      uint32_t __low = 0;
      if (__is_pair) {
        __low = *(__first + 1);
        __is_pair = __low >= 0xdc00 && __low <= 0xdfff;
      }

      if (!__is_pair) [[unlikely]] {
        // Malformed Unicode; the unpaired surrogate uses one column.
        ++__result;
        ++__first;
      } else {
        __c -= 0xd800;
        __c <<= 10;
        __c += __low - 0xdc00;
        __c += 0x10000;

        __result += __column_width_4(__c);
        // The two column code point doesn't fit; stop before it.
        if (__result > __maximum)
          return {__result - 2, __first};
        __first += 2;
      }
    } else {
      __result += __column_width_3(__c);
      // The two column code point doesn't fit; stop before it.
      if (__result > __maximum)
        return {__result - 2, __first};
      ++__first;
    }

    if (__result >= __maximum)
      return {__result, __first};
  }
  return {__result, __first};
}
/**
 * @overload
 *
 * The implementation for UTF-32, where every code unit is one code point.
 *
 * @param __first   Points to the first element of the input range.
 * @param __last    Points beyond the last element of the input range.
 * @param __maximum The maximum number of output columns.
 *
 * @returns The estimated number of columns and a pointer beyond the last code
 *          unit used in the estimation.
 */
template <__utf32_character _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width(const _CharT* __first, const _CharT* __last,
                        size_t __maximum) noexcept {
  size_t __columns = 0;
  const _CharT* __it = __first;
  while (__it != __last) {
    const size_t __next =
        __columns + __column_width(static_cast<uint32_t>(*__it));
    // The code point doesn't fit; stop before it.
    if (__next > __maximum)
      return {__next - 2, __it};

    __columns = __next;
    ++__it;
    // The wanted number of columns has been reached.
    if (__columns >= __maximum)
      break;
  }
  return {__columns, __it};
}
} // namespace __detail
/**
* Determines which part of the input is written and whether the output needs
* to be aligned.
*
* @param __first Points to the first element of the input range.
* @param __last Points beyond the last element of the input range.
* @param __width The minimum number of output columns. The output requires
* alignment when its estimated size is smaller than this value.
* @param __precision The maximum number of output columns, or -1 when there's
* no maximum.
*
* @returns A @c __string_alignment initialized with, in order: a pointer
* beyond the last code unit to write, the estimated size, and whether the
* estimated size is smaller than @a __width. When the output is known to
* need no alignment the exact size isn't determined and 0 is stored instead.
* (The members of @c __string_alignment aren't visible here; their names
* should be verified against its definition.)
*/
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
__get_string_alignment(const _CharT* __first, const _CharT* __last,
ptrdiff_t __width, ptrdiff_t __precision) noexcept {
_LIBCPP_ASSERT(__width != 0 || __precision != -1,
"The function has no effect and shouldn't be used");
// TODO FMT There might be more optimizations possible:
// If __precision == __format::__number_max and the encoding is:
// * UTF-8 : 4 * (__last - __first) >= __width
// * UTF-16 : 2 * (__last - __first) >= __width
// * UTF-32 : (__last - __first) >= __width
// In these cases it's certain the output is at least the requested width.
// It's unknown how often this happens in practice. For now the improvement
// isn't implemented.
/*
* First assume there are no special Unicode code units in the input.
* - Apply the precision (this may reduce the size of the input). When
* __precision == -1 this step is omitted.
* - Scan for special code units in the input.
* If our assumption was correct the __pos will be at the end of the input.
*/
const ptrdiff_t __length = __last - __first;
const _CharT* __limit =
__first +
(__precision == -1 ? __length : _VSTD::min(__length, __precision));
ptrdiff_t __size = __limit - __first;
const _CharT* __pos =
__detail::__estimate_column_width_fast(__first, __limit);
if (__pos == __limit)
return {__limit, __size, __size < __width};
/*
* Our assumption was wrong, there are special Unicode code units.
* The range [__first, __pos) contains a set of code units with the
* following property:
* Every _CharT in the range will be rendered in 1 column.
*
* If there's no maximum width and the parsed size already reaches the
* minimum required width, the real size isn't important, so bail out.
*/
if (__precision == -1 && (__pos - __first) >= __width)
return {__last, 0, false};
/* If there's a __precision, truncate the output to that width. */
ptrdiff_t __prefix = __pos - __first;
if (__precision != -1) {
_LIBCPP_ASSERT(__precision > __prefix, "Logic error.");
// The prefix already uses __prefix columns, the remainder may use the rest.
auto __lengh_info = __detail::__estimate_column_width(
__pos, __last, __precision - __prefix);
__size = __lengh_info.__width + __prefix;
return {__lengh_info.__ptr, __size, __size < __width};
}
/* Else use __width to determine the number of required padding characters. */
_LIBCPP_ASSERT(__width > __prefix, "Logic error.");
/*
* The column width is always one or two columns. For the precision the wanted
* column width is the maximum, for the width it's the minimum. Using the
* width estimation with its truncating behavior will result in the wrong
* result in the following case:
* - The last code unit processed requires two columns and exceeds the
* maximum column width.
* Increasing the __maximum by one avoids this issue. (It means it may
* pass one code point more than required to determine the proper result;
* that however isn't a problem for the algorithm.)
*/
size_t __maximum = 1 + __width - __prefix;
auto __lengh_info =
__detail::__estimate_column_width(__pos, __last, __maximum);
if (__lengh_info.__ptr != __last) {
// Consumed the width number of code units. The exact size of the string
// is unknown. We only know we don't need to align the output.
_LIBCPP_ASSERT(static_cast<ptrdiff_t>(__lengh_info.__width + __prefix) >=
__width,
"Logic error");
return {__last, 0, false};
}
// The entire input has been scanned, so the exact size is known.
__size = __lengh_info.__width + __prefix;
return {__last, __size, __size < __width};
}
#else // _LIBCPP_HAS_NO_UNICODE
/**
 * Determines which part of the input is written and whether the output needs
 * to be aligned.
 *
 * Without Unicode support every code unit is treated as one output column, so
 * applying the precision is a plain truncation of the input.
 *
 * @param __first     Points to the first element of the input range.
 * @param __last      Points beyond the last element of the input range.
 * @param __width     The minimum number of output columns.
 * @param __precision The maximum number of output columns, or -1 when there's
 *                    no maximum.
 */
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
__get_string_alignment(const _CharT* __first, const _CharT* __last,
                       ptrdiff_t __width, ptrdiff_t __precision) noexcept {
  const ptrdiff_t __available = __last - __first;
  ptrdiff_t __size = __available;
  if (__precision != -1)
    __size = _VSTD::min(__available, __precision);

  return {__first + __size, __size, __size < __width};
}
#endif // _LIBCPP_HAS_NO_UNICODE
/// These fields are a filter for which elements to parse.
///
/// They default to false so when a new field is added it needs to be opted in
@ -1143,6 +689,212 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __process_display_type_pointer(__format_spe
}
}
/// The result of estimating the column width of a range of code units.
template <class _CharT>
struct __column_width_result {
/// The number of output columns.
size_t __width_;
/// One beyond the last code unit used in the estimation.
///
/// This limits the original output to fit in the wanted number of columns.
const _CharT* __last_;
};
/// Selects how a column width estimation handles a requested width that can't
/// be matched exactly.
///
/// Since a column width can be two it's possible that the requested column
/// width can't be achieved. Depending on the intended usage the policy can be
/// selected.
/// - When used as precision the maximum width may not be exceeded and the
/// result should be "rounded down" to the previous boundary.
/// - When used as a width we're done once the minimum is reached, but
/// exceeding is not an issue. Rounding down is an issue since that will
/// result in writing fill characters. Therefore the result needs to be
/// "rounded up".
enum class __column_width_rounding { __down, __up };
# ifndef _LIBCPP_HAS_NO_UNICODE
namespace __detail {
/// Estimates the column width of a code point in the Basic Multilingual Plane.
///
/// A code point inside one of the "wide" ranges is two columns wide, every
/// other code point is one column wide.
///
/// \pre \a __c is less than 0x1'0000, which is at most a 3-byte UTF-8
/// character.
///
/// \returns The estimated width, either 1 or 2.
_LIBCPP_HIDE_FROM_ABI constexpr int __column_width_3(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c < 0x10000, "Use __column_width_4 or __column_width for larger values");

  // The inclusive [lower, upper] two column ranges in ascending order.
  constexpr uint32_t __wide[][2] = {
      {0x1100, 0x115f},
      {0x2329, 0x232a},
      {0x2e80, 0x303e},
      {0x3040, 0xa4cf},
      {0xac00, 0xd7a3},
      {0xf900, 0xfaff},
      {0xfe10, 0xfe19},
      {0xfe30, 0xfe6f},
      {0xff00, 0xff60},
      {0xffe0, 0xffe6}};

  for (const auto& __range : __wide) {
    // Since the ranges are sorted a value before the current range can't be
    // in any of the following ranges.
    if (__c < __range[0])
      return 1;
    if (__c <= __range[1])
      return 2;
  }
  return 1;
}
/// @overload
///
/// Estimates the column width of a code point outside the Basic Multilingual
/// Plane.
///
/// \pre \a __c is greater than or equal to 0x1'0000, which is a 4-byte UTF-8
/// character.
///
/// \returns The estimated width, either 1 or 2.
_LIBCPP_HIDE_FROM_ABI constexpr int __column_width_4(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c >= 0x10000, "Use __column_width_3 or __column_width for smaller values");

  // Walk the ascending list of inclusive two column ranges; a value before
  // the start of a range is one column wide.
  if (__c < 0x1'f300)
    return 1;
  if (__c <= 0x1'f64f)
    return 2;
  if (__c < 0x1'f900)
    return 1;
  if (__c <= 0x1'f9ff)
    return 2;
  if (__c < 0x2'0000)
    return 1;
  if (__c <= 0x2'fffd)
    return 2;
  if (__c < 0x3'0000)
    return 1;
  return __c <= 0x3'fffd ? 2 : 1;
}
/// @overload
///
/// The general case, accepting all values. Values below 0x1'0000 are
/// forwarded to \ref __column_width_3, all other values to
/// \ref __column_width_4.
_LIBCPP_HIDE_FROM_ABI constexpr int __column_width(uint32_t __c) noexcept {
  return __c < 0x10000 ? __detail::__column_width_3(__c) : __detail::__column_width_4(__c);
}
/// Estimates the column width of the input one extended grapheme cluster at a
/// time.
///
/// The width of a cluster is the estimated width of its first code point.
///
/// \pre The range [\a __first, \a __last) isn't empty; the cluster view
/// consumes the first code point upon construction.
///
/// \param __first    Points to the first element of the input range.
/// \param __last     Points beyond the last element of the input range.
/// \param __maximum  The requested number of output columns.
/// \param __rounding Selects whether a cluster that would exceed
///                   \a __maximum is excluded (\c __down) or included
///                   (\c __up).
///
/// \returns The estimated width and a pointer one beyond the last code unit
/// of the last cluster included in the estimation.
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> __estimate_column_width_grapheme_clustering(
    const _CharT* __first, const _CharT* __last, size_t __maximum, __column_width_rounding __rounding) noexcept {
  using __view_type = __unicode::__extended_grapheme_cluster_view<_CharT>;

  __view_type __clusters{__first, __last};
  const bool __round_down = __rounding == __column_width_rounding::__down;

  size_t __columns   = 0;
  const _CharT* __end = __first;
  for (;;) {
    // Done when the input is exhausted or the maximum has been exceeded.
    if (__end == __last || __columns > __maximum)
      break;

    const typename __view_type::__cluster __next = __clusters.__consume();
    const size_t __candidate = __columns + __detail::__column_width(__next.__code_point_);

    // When the next entry would exceed the maximum width the previous width
    // might be returned. For example when a width of 100 is requested the
    // returned width might be 99, since the next code point has an estimated
    // column width of 2. This depends on the rounding flag.
    if (__round_down && __candidate > __maximum)
      break;

    __columns = __candidate;
    __end     = __next.__last_;
  }
  return {__columns, __end};
}
} // namespace __detail
// Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32.
// Depending on format the relation between the number of code units stored and
// the number of output columns differs. The first relation is the number of
// code units forming a code point. (The text assumes the code units are
// unsigned.)
// - UTF-8: The number of code units is between one and four. The first 128
// Unicode code points match the ASCII character set. When the highest bit is
// set it means the code point has more than one code unit.
// - UTF-16: The number of code units is between 1 and 2. When the first
// code unit is in the range [0xd800,0xdbff] (a high surrogate) it means the
// code point uses two code units.
// - UTF-32: The number of code units is always one.
//
// The code point to the number of columns is specified in
// [format.string.std]/11. This list might change in the future.
//
// Another thing to be taken into account is Grapheme clustering. This means
// that in some cases multiple code points are combined into one element in
// the output. For example:
// - an ASCII character with a combined diacritical mark
// - an emoji with a skin tone modifier
// - a group of combined people emoji to create a family
// - a combination of flag emoji
//
// See also:
// - [format.string.std]/11
// - https://en.wikipedia.org/wiki/UTF-8#Encoding
// - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
/// Returns whether \a __c is an ASCII code point, that is a value less than
/// 0x80.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_ascii(char32_t __c) { return __c < 0x80; }
/// Determines the number of output columns needed to render the input.
///
/// \note When the scanner encounters malformed Unicode it acts as-if every
/// code unit is a one column code point. Typically a terminal uses the same
/// strategy and replaces every malformed code unit with a one column
/// replacement character.
///
/// \param __str The input to estimate.
/// \param __maximum The requested number of output columns. The bound on the
/// returned width depends on \a __rounding.
/// \param __rounding Selects the rounding method.
/// \c __down result.__width_ <= __maximum
/// \c __up result.__width_ <= __maximum + 1
///
/// \returns The estimated width and a pointer one beyond the last code unit
/// used in the estimation.
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> __estimate_column_width(
basic_string_view<_CharT> __str, size_t __maximum, __column_width_rounding __rounding) noexcept {
// The width estimation is done in two steps:
// - Quickly process the ASCII part. ASCII has the following properties
// - One code unit is one code point
// - Every code point has an estimated width of one
// - When needed it will use a Unicode Grapheme clustering algorithm to find
// the proper place for truncation.
if (__str.empty() || __maximum == 0)
return {0, __str.begin()};
// ASCII has one caveat; when an ASCII character is followed by a non-ASCII
// character they might be part of an extended grapheme cluster. For example:
// an ASCII letter and a COMBINING ACUTE ACCENT
// The truncation should happen after the COMBINING ACUTE ACCENT. Therefore we
// need to scan one code unit beyond the requested precision. When this code
// unit is non-ASCII we omit the current code unit and let the Grapheme
// clustering algorithm do its work.
const _CharT* __it = __str.begin();
if (__is_ascii(*__it)) {
do {
--__maximum;
++__it;
// The entire input is ASCII and fits in the requested number of columns.
if (__it == __str.end())
return {__str.size(), __str.end()};
if (__maximum == 0) {
// The requested number of columns is used. When the next code unit is
// ASCII too, it can't extend the last processed code unit, so it's safe
// to stop here.
if (__is_ascii(*__it))
return {static_cast<size_t>(__it - __str.begin()), __it};
break;
}
} while (__is_ascii(*__it));
// The last processed ASCII code unit is followed by a non-ASCII code unit.
// Undo processing it, so the Grapheme clustering algorithm starts at this
// ASCII code unit.
--__it;
++__maximum;
}
ptrdiff_t __ascii_size = __it - __str.begin();
__column_width_result __result =
__detail::__estimate_column_width_grapheme_clustering(__it, __str.end(), __maximum, __rounding);
// The clustering result only covers the part after the ASCII prefix.
__result.__width_ += __ascii_size;
return __result;
}
# else // !defined(_LIBCPP_HAS_NO_UNICODE)
/// Determines the number of output columns needed to render the input.
///
/// When Unicode isn't supported assume ASCII and every code unit is one code
/// point. In ASCII the estimated column width is always one. Thus there's no
/// need for rounding and the rounding argument is ignored.
///
/// \param __str     The input to estimate.
/// \param __maximum The maximum number of output columns.
///
/// \returns The estimated width and a pointer one beyond the last code unit
/// used in the estimation.
template <class _CharT>
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
__estimate_column_width(basic_string_view<_CharT> __str, size_t __maximum, __column_width_rounding) noexcept {
  const size_t __columns = __maximum < __str.size() ? __maximum : __str.size();
  return {__columns, __str.begin() + __columns};
}
# endif // !defined(_LIBCPP_HAS_NO_UNICODE)
} // namespace __format_spec
#endif //_LIBCPP_STD_VER > 17

View File

@ -0,0 +1,339 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP___FORMAT_UNICODE_H
#define _LIBCPP___FORMAT_UNICODE_H
#include <__assert>
#include <__config>
#include <__format/extended_grapheme_cluster_table.h>
#include <__utility/unreachable.h>
#include <bit>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER > 17
# ifndef _LIBCPP_HAS_NO_UNICODE
/// Implements the grapheme cluster boundary rules
///
/// These rules are used to implement format's width estimation as stated in
/// [format.string.std]/11
///
/// The Standard refers to UAX \#29 for Unicode 12.0.0
/// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
///
/// The data tables used are
/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
/// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
namespace __unicode {
/// U+FFFD REPLACEMENT CHARACTER, returned by the code point views when the
/// input isn't decoded as a valid code point.
inline constexpr char32_t __replacement_character = U'\ufffd';
/// Tests whether a sequence of code units consists of UTF-8 continuation
/// code units.
///
/// A continuation code unit has the bit pattern 0b10xx'xxxx. Testing only the
/// highest bit isn't sufficient; that would accept the first code unit of a
/// multi code unit code point (0b11xx'xxxx) as a continuation code unit.
///
/// \pre \a __count is at least 1 and [\a __char, \a __char + \a __count) is
/// a valid range.
///
/// \param __char  Points to the first code unit to test.
/// \param __count The number of code units to test.
///
/// \returns Whether all tested code units are continuation code units.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(const char* __char, int __count) {
  do {
    if ((*__char & 0b1100'0000) != 0b1000'0000)
      return false;
    --__count;
    ++__char;
  } while (__count);
  return true;
}
/// Helper class to extract a code point from a Unicode character range.
///
/// The stored range is a view. There are multiple specializations for
/// different character types.
template <class _CharT>
class __code_point_view;
/// UTF-8 specialization.
///
/// The view doesn't own the range [\a __first, \a __last); it only stores the
/// current position and the end of the range.
template <>
class __code_point_view<char> {
public:
  _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const char* __first, const char* __last)
      : __first_(__first), __last_(__last) {}

  /// Whether all code units of the range have been consumed.
  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }

  /// The position of the next code unit to consume.
  _LIBCPP_HIDE_FROM_ABI constexpr const char* __position() const noexcept { return __first_; }

  /// Decodes the next code point and advances the position beyond it.
  ///
  /// When the code units at the current position can't be decoded one code
  /// unit is consumed and the replacement character is returned.
  ///
  /// \pre The view isn't at the end of its range.
  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");

    // Based on the number of leading 1 bits the number of code units in the
    // code point can be determined. See
    // https://en.wikipedia.org/wiki/UTF-8#Encoding
    const int __units = _VSTD::countl_one(static_cast<unsigned char>(*__first_));
    if (__units == 0)
      return *__first_++;

    // A multi code unit code point has two, three, or four code units, which
    // all need to be available, and all but the first need to be continuation
    // code units.
    const bool __decodable = __units >= 2 && __units <= 4 && __last_ - __first_ >= __units &&
                             __unicode::__is_continuation(__first_ + 1, __units - 1);
    if (!__decodable) [[unlikely]] {
      // An invalid number of leading ones can be garbage or a code unit in the
      // middle of a code point. By consuming one code unit the parser may get
      // "in sync" after a few code units.
      ++__first_;
      return __replacement_character;
    }

    // The first code unit stores its data in the bits after the leading ones
    // and the zero bit terminating them: 5, 4, or 3 bits (0x1f, 0x0f, 0x07).
    char32_t __value = static_cast<unsigned char>(*__first_++) & (0xff >> (__units + 1));
    // Every continuation code unit stores 6 bits.
    for (int __i = 1; __i != __units; ++__i) {
      __value <<= 6;
      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
    }
    return __value;
  }

private:
  const char* __first_;
  const char* __last_;
};
#    ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
/// This specialization depends on the size of wchar_t
/// - 2 UTF-16 (for example Windows and AIX)
/// - 4 UTF-32 (for example Linux)
///
/// The specialization is guarded by the library's configuration macro for
/// wide character support. (The macro with the TEST_ prefix belongs to the
/// test suite and is never defined when the header is used.)
template <>
class __code_point_view<wchar_t> {
public:
  _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const wchar_t* __first, const wchar_t* __last)
      : __first_(__first), __last_(__last) {}

  /// The position of the next code unit to consume.
  _LIBCPP_HIDE_FROM_ABI constexpr const wchar_t* __position() const noexcept { return __first_; }

  /// Whether all code units of the range have been consumed.
  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }

  /// Decodes the next code point and advances the position beyond it.
  ///
  /// Malformed input yields the replacement character. For UTF-16 this is an
  /// unpaired surrogate, for UTF-32 a value greater than 0x10FFFF. Only the
  /// malformed code unit itself is consumed.
  ///
  /// \pre The view isn't at the end of its range.
  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");

    if constexpr (sizeof(wchar_t) == 2) {
      char32_t __result = *__first_++;
      // Is the code unit part of a surrogate pair? See
      // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
      if (__result < 0xd800 || __result > 0xdfff)
        return __result;

      // Malformed Unicode: a low surrogate without a preceding high surrogate,
      // or a high surrogate at the end of the input.
      if (__result > 0xdbff || __first_ == __last_) [[unlikely]]
        return __replacement_character;

      // Malformed Unicode: a high surrogate that isn't followed by a low
      // surrogate. The following code unit isn't consumed, it's the start of
      // the next code point.
      const char32_t __low = *__first_;
      if (__low < 0xdc00 || __low > 0xdfff) [[unlikely]]
        return __replacement_character;
      ++__first_;

      __result -= 0xd800;
      __result <<= 10;
      __result += __low - 0xdc00;
      __result += 0x10000;
      return __result;

    } else if constexpr (sizeof(wchar_t) == 4) {
      char32_t __result = *__first_++;
      if (__result > 0x10FFFF) [[unlikely]]
        return __replacement_character;
      return __result;
    } else {
      // TODO FMT P2593R0 Use static_assert(false, "sizeof(wchar_t) has a not implemented value");
      _LIBCPP_ASSERT(sizeof(wchar_t) == 0, "sizeof(wchar_t) has a not implemented value");
      __libcpp_unreachable();
    }
  }

private:
  const wchar_t* __first_;
  const wchar_t* __last_;
};
#    endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
/// Determines whether there's an extended grapheme cluster break between two
/// adjacent code points.
///
/// The rules are tested in the order listed in UAX #29; the first matching
/// rule determines the result.
///
/// \param __RI_break_allowed In/out state for the regional indicator rules
/// (GB12 and GB13). The value is toggled every time both properties are
/// Regional_Indicator.
/// \param __has_extened_pictographic Whether an earlier code point in the
/// current cluster had the property Extended_Pictographic. The parameter is
/// passed by value, so the update in this function isn't visible to the
/// caller.
/// \param __prev The property of the code point before the possible break.
/// \param __next_ The property of the code point after the possible break.
///
/// \returns Whether there's a break between the two code points.
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break(
bool& __RI_break_allowed,
bool __has_extened_pictographic,
__extended_grapheme_custer_property_boundary::__property __prev,
__extended_grapheme_custer_property_boundary::__property __next_) {
using __extended_grapheme_custer_property_boundary::__property;
__has_extened_pictographic |= __prev == __property::__Extended_Pictographic;
// https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules
// *** Break at the start and end of text, unless the text is empty. ***
_LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1
_LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller"); // GB2
// *** Do not break between a CR and LF. Otherwise, break before and after controls. ***
if (__prev == __property::__CR && __next_ == __property::__LF) // GB3
return false;
if (__prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF) // GB4
return true;
if (__next_ == __property::__Control || __next_ == __property::__CR || __next_ == __property::__LF) // GB5
return true;
// *** Do not break Hangul syllable sequences. ***
if (__prev == __property::__L &&
(__next_ == __property::__L || __next_ == __property::__V || __next_ == __property::__LV ||
__next_ == __property::__LVT)) // GB6
return false;
if ((__prev == __property::__LV || __prev == __property::__V) &&
(__next_ == __property::__V || __next_ == __property::__T)) // GB7
return false;
if ((__prev == __property::__LVT || __prev == __property::__T) && __next_ == __property::__T) // GB8
return false;
// *** Do not break before extending characters or ZWJ. ***
if (__next_ == __property::__Extend || __next_ == __property::__ZWJ)
return false; // GB9
// *** Do not break before SpacingMarks, or after Prepend characters. ***
if (__next_ == __property::__SpacingMark) // GB9a
return false;
if (__prev == __property::__Prepend) // GB9b
return false;
// *** Do not break within emoji modifier sequences or emoji zwj sequences. ***
// GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
//
// Note that several parts of this rule are matched by GB9: Any x (Extend | ZWJ)
// - \p{Extended_Pictographic} x Extend
// - Extend x Extend
// - \p{Extended_Pictographic} x ZWJ
// - Extend x ZWJ
//
// So the only case left to test is
// - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic}
// where \p{Extended_Pictographic}' is stored in __has_extened_pictographic
if (__has_extened_pictographic && __prev == __property::__ZWJ && __next_ == __property::__Extended_Pictographic)
return false;
// *** Do not break within emoji flag sequences ***
// That is, do not break between regional indicator (RI) symbols if there
// is an odd number of RI characters before the break point.
if (__prev == __property::__Regional_Indicator && __next_ == __property::__Regional_Indicator) { // GB12 + GB13
// Toggling the state pairs the regional indicators: no break inside a
// pair, a break between two pairs.
__RI_break_allowed = !__RI_break_allowed;
if (__RI_break_allowed)
return true;
return false;
}
// *** Otherwise, break everywhere. ***
return true; // GB999
}
/// Helper class to extract an extended grapheme cluster from a Unicode character range.
///
/// This function is used to determine the column width of an extended grapheme
/// cluster. In order to do that only the first code point is evaluated.
/// Therefore only this code point is extracted.
template <class _CharT>
class __extended_grapheme_cluster_view {
public:
_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(const _CharT* __first, const _CharT* __last)
: __code_point_view_(__first, __last),
__next_code_point_(__code_point_view_.__consume()),
__next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
struct __cluster {
/// The first code point of the extended grapheme cluster.
///
/// The first code point is used to estimate the width of the extended
/// grapheme cluster.
char32_t __code_point_;
/// Points one beyond the last code unit in the extended grapheme cluster.
///
/// It's expected the caller has the start position and thus can determine
/// the code unit range of the extended grapheme cluster.
const _CharT* __last_;
};
_LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
_LIBCPP_ASSERT(
__next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
"can't move beyond the end of input");
char32_t __code_point = __next_code_point_;
if (!__code_point_view_.__at_end())
return {__code_point, __get_break()};
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
return {__code_point, __code_point_view_.__position()};
}
private:
__code_point_view<_CharT> __code_point_view_;
char32_t __next_code_point_;
__extended_grapheme_custer_property_boundary::__property __next_prop_;
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __get_break() {
bool __RI_break_allowed = true;
bool __has_extened_pictographic = false;
while (true) {
const _CharT* __result = __code_point_view_.__position();
__extended_grapheme_custer_property_boundary::__property __prev = __next_prop_;
if (__code_point_view_.__at_end()) {
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
return __result;
}
__next_code_point_ = __code_point_view_.__consume();
__next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
__has_extened_pictographic |=
__prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic;
if (__at_extended_grapheme_cluster_break(__RI_break_allowed, __has_extened_pictographic, __prev, __next_prop_))
return __result;
}
}
};
} // namespace __unicode
# endif // _LIBCPP_HAS_NO_UNICODE
#endif //_LIBCPP_STD_VER > 17
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___FORMAT_UNICODE_H

View File

@ -157,6 +157,7 @@ namespace std {
#include <__format/formatter_pointer.h>
#include <__format/formatter_string.h>
#include <__format/parser_std_format_spec.h>
#include <__format/unicode.h>
#include <__iterator/back_insert_iterator.h>
#include <__iterator/incrementable_traits.h>
#include <__variant/monostate.h>

View File

@ -616,32 +616,34 @@ module std [system] {
export *
// Submodules for the implementation headers of <format>. Every entry maps one
// __format/*.h header to its own submodule and marks it as a private header.
// NOTE(review): several submodules (buffer, concepts, format_error, ...) are
// listed twice in this block - this looks like the old and the new lines of a
// diff shown together; confirm only one set is kept in the real module map.
module __format {
module buffer { private header "__format/buffer.h" }
module concepts { private header "__format/concepts.h" }
module enable_insertable { private header "__format/enable_insertable.h" }
module format_arg { private header "__format/format_arg.h" }
module format_arg_store { private header "__format/format_arg_store.h" }
module format_args { private header "__format/format_args.h" }
module buffer { private header "__format/buffer.h" }
module concepts { private header "__format/concepts.h" }
module enable_insertable { private header "__format/enable_insertable.h" }
module extended_grapheme_cluster_table { private header "__format/extended_grapheme_cluster_table.h" }
module format_arg { private header "__format/format_arg.h" }
module format_arg_store { private header "__format/format_arg_store.h" }
module format_args { private header "__format/format_args.h" }
// format_context additionally re-exports the optional and locale modules.
module format_context {
private header "__format/format_context.h"
export optional
export locale
}
module format_error { private header "__format/format_error.h" }
module format_fwd { private header "__format/format_fwd.h" }
module format_parse_context { private header "__format/format_parse_context.h" }
module format_string { private header "__format/format_string.h" }
module format_to_n_result { private header "__format/format_to_n_result.h" }
module formatter { private header "__format/formatter.h" }
module formatter_bool { private header "__format/formatter_bool.h" }
module formatter_char { private header "__format/formatter_char.h" }
module formatter_floating_point { private header "__format/formatter_floating_point.h" }
module formatter_integer { private header "__format/formatter_integer.h" }
module formatter_integral { private header "__format/formatter_integral.h" }
module formatter_output { private header "__format/formatter_output.h" }
module formatter_pointer { private header "__format/formatter_pointer.h" }
module formatter_string { private header "__format/formatter_string.h" }
module parser_std_format_spec { private header "__format/parser_std_format_spec.h" }
module format_error { private header "__format/format_error.h" }
module format_fwd { private header "__format/format_fwd.h" }
module format_parse_context { private header "__format/format_parse_context.h" }
module format_string { private header "__format/format_string.h" }
module format_to_n_result { private header "__format/format_to_n_result.h" }
module formatter { private header "__format/formatter.h" }
module formatter_bool { private header "__format/formatter_bool.h" }
module formatter_char { private header "__format/formatter_char.h" }
module formatter_floating_point { private header "__format/formatter_floating_point.h" }
module formatter_integer { private header "__format/formatter_integer.h" }
module formatter_integral { private header "__format/formatter_integral.h" }
module formatter_output { private header "__format/formatter_output.h" }
module formatter_pointer { private header "__format/formatter_pointer.h" }
module formatter_string { private header "__format/formatter_string.h" }
module parser_std_format_spec { private header "__format/parser_std_format_spec.h" }
module unicode { private header "__format/unicode.h" }
}
}
module forward_list {

View File

@ -307,6 +307,7 @@ END-SCRIPT
#include <__format/buffer.h> // expected-error@*:* {{use of private header from outside its module: '__format/buffer.h'}}
#include <__format/concepts.h> // expected-error@*:* {{use of private header from outside its module: '__format/concepts.h'}}
#include <__format/enable_insertable.h> // expected-error@*:* {{use of private header from outside its module: '__format/enable_insertable.h'}}
#include <__format/extended_grapheme_cluster_table.h> // expected-error@*:* {{use of private header from outside its module: '__format/extended_grapheme_cluster_table.h'}}
#include <__format/format_arg.h> // expected-error@*:* {{use of private header from outside its module: '__format/format_arg.h'}}
#include <__format/format_arg_store.h> // expected-error@*:* {{use of private header from outside its module: '__format/format_arg_store.h'}}
#include <__format/format_args.h> // expected-error@*:* {{use of private header from outside its module: '__format/format_args.h'}}
@ -326,6 +327,7 @@ END-SCRIPT
#include <__format/formatter_pointer.h> // expected-error@*:* {{use of private header from outside its module: '__format/formatter_pointer.h'}}
#include <__format/formatter_string.h> // expected-error@*:* {{use of private header from outside its module: '__format/formatter_string.h'}}
#include <__format/parser_std_format_spec.h> // expected-error@*:* {{use of private header from outside its module: '__format/parser_std_format_spec.h'}}
#include <__format/unicode.h> // expected-error@*:* {{use of private header from outside its module: '__format/unicode.h'}}
#include <__functional/binary_function.h> // expected-error@*:* {{use of private header from outside its module: '__functional/binary_function.h'}}
#include <__functional/binary_negate.h> // expected-error@*:* {{use of private header from outside its module: '__functional/binary_negate.h'}}
#include <__functional/bind.h> // expected-error@*:* {{use of private header from outside its module: '__functional/bind.h'}}

View File

@ -0,0 +1,92 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-has-no-incomplete-format
// TODO FMT Fix this test using GCC, it currently times out.
// UNSUPPORTED: gcc-12
// <format>
// Tests the implementation of the extended grapheme cluster boundaries per
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
//
// The tests are based on the test data provided by Unicode
// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
#include <cassert>
#include <format>
#include <functional>
#include <numeric>
#include "extended_grapheme_cluster.h"
// Validates whether the number of code points in our "database" matches the
// number in the Unicode data. The assumption is that when the number of items
// per property matches, the code points themselves also match.
namespace {
namespace cluster = std::__extended_grapheme_custer_property_boundary;
// Returns the number of code points in cluster::__entries that carry the
// given property.
//
// Each entry is decoded as: bits [0, 3] hold the property, bits [4, 10] hold
// the number of additional code points covered by the entry, so one entry
// contributes "1 + that value" code points.
constexpr int count_entries(cluster::__property property) {
  int total = 0;
  for (const auto entry : cluster::__entries) {
    const auto entry_property = static_cast<cluster::__property>(entry & 0xf);
    if (entry_property == property)
      total += 1 + static_cast<int>((entry >> 4) & 0x7f);
  }
  return total;
}
// Compile-time checks of the number of code points per property, as computed
// by count_entries() from cluster::__entries.
// NOTE(review): the expected counts are hard-coded; presumably they were taken
// from the Unicode data files the table was generated from - confirm and
// update them whenever the table is regenerated.
static_assert(count_entries(cluster::__property::__Prepend) == 26);
static_assert(count_entries(cluster::__property::__CR) == 1);
static_assert(count_entries(cluster::__property::__LF) == 1);
static_assert(count_entries(cluster::__property::__Control) == 3886);
static_assert(count_entries(cluster::__property::__Extend) == 2095);
static_assert(count_entries(cluster::__property::__Regional_Indicator) == 26);
static_assert(count_entries(cluster::__property::__SpacingMark) == 388);
static_assert(count_entries(cluster::__property::__L) == 125);
static_assert(count_entries(cluster::__property::__V) == 95);
static_assert(count_entries(cluster::__property::__T) == 137);
static_assert(count_entries(cluster::__property::__LV) == 399);
static_assert(count_entries(cluster::__property::__LVT) == 10773);
static_assert(count_entries(cluster::__property::__ZWJ) == 1);
static_assert(count_entries(cluster::__property::__Extended_Pictographic) == 3537);
} // namespace
// Walks every test record in data and verifies the cluster view against it.
//
// For each record, code_points[i] is the code point the i-th __consume() call
// must return and breaks[i] is the offset (in code units from the start of
// input) where the i-th cluster must end.
template <class Data>
constexpr void test(const Data& data) {
  for (const auto& record : data) {
    // Both expectation lists describe the same clusters, one element each.
    assert(record.code_points.size() == record.breaks.size());

    auto first = record.input.data();
    std::__unicode::__extended_grapheme_cluster_view view{first, first + record.input.size()};

    const size_t cluster_count = record.breaks.size();
    for (size_t index = 0; index != cluster_count; ++index) {
      auto step = view.__consume();
      assert(step.__code_point_ == record.code_points[index]);
      assert(step.__last_ == first + record.breaks[index]);
    }
  }
}
// Runs the cluster tests on data_utf8 and, when wide characters are
// available, on the wide data set matching sizeof(wchar_t).
constexpr bool test() {
  test(data_utf8);

#ifndef TEST_HAS_NO_WIDE_CHARACTERS
  // A 2-byte wchar_t uses data_utf16, every other size uses data_utf32.
  if constexpr (sizeof(wchar_t) != 2)
    test(data_utf32);
  else
    test(data_utf16);
#endif

  return true;
}
// Test entry point; only the run-time evaluation of test() is active.
int main(int, char**) {
test();
// NOTE(review): the compile-time evaluation is disabled and the reason is not
// stated here - confirm whether it can be re-enabled.
// static_assert(test());
return 0;
}

View File

@ -1,109 +0,0 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-has-no-incomplete-format
// UTF-32 doesn't work properly
// XFAIL: windows
// <format>
// Tests the Unicode width support of the standard format specifiers.
// It tests [format.string.std]/8 - 11:
// - Properly determining the estimated width of a Unicode string.
// - Properly truncating to the wanted maximum width.
// This version runs the test when the platform doesn't have Unicode support.
// REQUIRES: libcpp-has-no-unicode
#include <format>
#include <cassert>
#include "test_macros.h"
#include "make_string.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
using namespace std::__format_spec;
// Member-wise equality for __string_alignment, used by the assertions below.
// Two values are equal when __last, __size and __align all match.
template <class CharT>
constexpr bool operator==(const __string_alignment<CharT>& lhs,
                          const __string_alignment<CharT>& rhs) noexcept {
  if (lhs.__last != rhs.__last)
    return false;
  if (lhs.__size != rhs.__size)
    return false;
  return lhs.__align == rhs.__align;
}
template <class CharT>
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
const CharT* str, size_t width,
size_t precision) {
std::basic_string_view<CharT> sv{str};
__string_alignment<CharT> expected{sv.begin() + offset, size, align};
__string_alignment<CharT> traits =
__get_string_alignment(sv.begin(), sv.end(), width, precision);
assert(traits == expected);
}
// Test cases for __get_string_alignment when Unicode support is disabled.
// The arguments of every call are: expected offset, expected size, expected
// align flag, input, width, precision. A precision of -1 converts to the
// largest size_t value.
template <class CharT>
constexpr void get_string_alignment() {
// Truncate the input.
get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);
// The 2-column character gets half accepted.
get_string_alignment(2, 2, false, CSTR("a\u115f"), 0, 2);
// No alignment since the number of characters fits.
get_string_alignment(2, 2, false, CSTR("a\u115f"), 2, 2);
// Same but for a 2-column 4-byte UTF-8 sequence
get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 0, 2);
get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 2, 2);
// No alignment required.
get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);
// The multi-byte cases expect 3 code units, or 5 when CharT is one byte wide.
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
3 + 2 * (sizeof(CharT) == 1), false, CSTR("ab\u1111"), 2,
-1);
// The expected size equals the expected offset, i.e. the trailing 'c' is
// included in the size.
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
3 + 2 * (sizeof(CharT) == 1), false,
CSTR("a\u115fc") /* 2-column character */, 3, -1);
// Extend width
get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
3 + 2 * (sizeof(CharT) == 1), true,
CSTR("a\u1160c") /* 1-column character */, 6, -1);
}
// Runs all test groups of this file for one character type.
template <class CharT>
constexpr void test() {
get_string_alignment<CharT>();
}
// Runs the tests for every character type that is available in this
// configuration. Returns true so the function can be used in a static_assert.
constexpr bool test() {
test<char>();
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
test<wchar_t>();
#endif
#ifndef _LIBCPP_HAS_NO_CHAR8_T
test<char8_t>();
#endif
test<char16_t>();
test<char32_t>();
return true;
}
// Test entry point: evaluates test() once at run time and once at compile time.
int main(int, char**) {
test();
static_assert(test());
return 0;
}

View File

@ -1,266 +0,0 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-has-no-incomplete-format
// <format>
// Tests the Unicode width support of the standard format specifiers.
// It tests [format.string.std]/8 - 11:
// - Properly determining the estimated width of a Unicode string.
// - Properly truncating to the wanted maximum width.
// This version runs the test when the platform has Unicode support.
// UNSUPPORTED: libcpp-has-no-unicode
#include <format>
#include <cassert>
#include "test_macros.h"
#include "make_string.h"
#define CSTR(S) MAKE_CSTRING(CharT, S)
using namespace std::__format_spec;
// Member-wise equality for __string_alignment so results can be compared in
// a single assertion: __last, __size and __align must all be equal.
template <class CharT>
constexpr bool operator==(const __string_alignment<CharT>& lhs,
                          const __string_alignment<CharT>& rhs) noexcept {
  if (lhs.__last != rhs.__last)
    return false;
  if (lhs.__size != rhs.__size)
    return false;
  return lhs.__align == rhs.__align;
}
template <class CharT>
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
const CharT* str, size_t width,
size_t precision) {
std::basic_string_view<CharT> sv{str};
__string_alignment<CharT> expected{sv.begin() + offset, size, align};
__string_alignment<CharT> traits =
__get_string_alignment(sv.begin(), sv.end(), width, precision);
assert(traits == expected);
}
// Runs __estimate_column_width_fast over the whole of str and asserts that
// the returned pointer lies `expected` code units past the start of str.
template <class CharT>
constexpr void estimate_column_width_fast(size_t expected, const CharT* str) {
  std::basic_string_view<CharT> input{str};
  const CharT* stop = __detail::__estimate_column_width_fast(input.begin(), input.end());
  assert(stop == input.begin() + expected);
}
// Test cases for __estimate_column_width_fast. The first argument of every
// call is the expected offset (in code units) of the pointer the function
// returns; the second is the input.
template <class CharT>
constexpr void estimate_column_width_fast() {
// No unicode
estimate_column_width_fast(3, CSTR("abc"));
estimate_column_width_fast(3, CSTR("a\u007fc"));
if constexpr (sizeof(CharT) == 1) {
// UTF-8 stop at the first multi-byte character.
estimate_column_width_fast(0, CSTR("\u0080bc"));
estimate_column_width_fast(1, CSTR("a\u0080c"));
estimate_column_width_fast(2, CSTR("ab\u0080"));
estimate_column_width_fast(1, CSTR("aßc"));
estimate_column_width_fast(1, CSTR("a\u07ffc"));
estimate_column_width_fast(1, CSTR("a\u0800c"));
estimate_column_width_fast(1, CSTR("a\u10ffc"));
} else {
// UTF-16/32 stop at the first multi-column character.
estimate_column_width_fast(3, CSTR("\u0080bc"));
estimate_column_width_fast(3, CSTR("a\u0080c"));
estimate_column_width_fast(3, CSTR("ab\u0080"));
estimate_column_width_fast(3, CSTR("aßc"));
estimate_column_width_fast(3, CSTR("a\u07ffc"));
estimate_column_width_fast(3, CSTR("a\u0800c"));
estimate_column_width_fast(3, CSTR("a\u10ffc"));
}
// First 2-column character
// These expectations are the same for every character type.
estimate_column_width_fast(1, CSTR("a\u1100c"));
estimate_column_width_fast(1, CSTR("a\U0000ffffc"));
estimate_column_width_fast(1, CSTR("a\U00010000c"));
estimate_column_width_fast(1, CSTR("a\U0010FFFFc"));
}
template <class CharT>
constexpr void estimate_column_width(size_t expected, const CharT* str) {
std::basic_string_view<CharT> sv{str};
std::__format_spec::__detail::__column_width_result<CharT> column_info =
__detail::__estimate_column_width(sv.begin(), sv.end(), -1);
assert(column_info.__width == expected);
}
// Test cases for __estimate_column_width. The first argument of every call is
// the expected column width of the single code point passed as input. The
// "N-byte" section headers group the code points by what appears to be their
// UTF-8 encoded length; the same inputs are used for every character type.
template <class CharT>
constexpr void estimate_column_width() {
//*** 1-byte code points ***
estimate_column_width(1, CSTR(" "));
estimate_column_width(1, CSTR("~"));
//*** 2-byte code points ***
estimate_column_width(1, CSTR("\u00a1")); // INVERTED EXCLAMATION MARK
estimate_column_width(1, CSTR("\u07ff")); // NKO TAMAN SIGN
//*** 3-byte code points ***
estimate_column_width(1, CSTR("\u0800")); // SAMARITAN LETTER ALAF
estimate_column_width(1, CSTR("\ufffd")); // REPLACEMENT CHARACTER
// 2 column ranges
estimate_column_width(2, CSTR("\u1100")); // HANGUL CHOSEONG KIYEOK
estimate_column_width(2, CSTR("\u115f")); // HANGUL CHOSEONG FILLER
estimate_column_width(2, CSTR("\u2329")); // LEFT-POINTING ANGLE BRACKET
estimate_column_width(2, CSTR("\u232a")); // RIGHT-POINTING ANGLE BRACKET
estimate_column_width(2, CSTR("\u2e80")); // CJK RADICAL REPEAT
estimate_column_width(2, CSTR("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
estimate_column_width(2, CSTR("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
estimate_column_width(2, CSTR("\ua4cf")); // U+A4D0 LISU LETTER BA
estimate_column_width(2, CSTR("\uac00")); // <Hangul Syllable, First>
estimate_column_width(2, CSTR("\ud7a3")); // Hangul Syllable Hih
estimate_column_width(2, CSTR("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
estimate_column_width(2, CSTR("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
estimate_column_width(2,
CSTR("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
estimate_column_width(
2, CSTR("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
estimate_column_width(
2, CSTR("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
estimate_column_width(2,
CSTR("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
estimate_column_width(2, CSTR("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
estimate_column_width(2, CSTR("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
estimate_column_width(2, CSTR("\uffe0")); // FULLWIDTH CENT SIGN
estimate_column_width(2, CSTR("\uffe6")); // FULLWIDTH WON SIGN
//*** 4-byte code points ***
estimate_column_width(1, CSTR("\U00010000")); // LINEAR B SYLLABLE B008 A
estimate_column_width(1, CSTR("\U0010FFFF")); // Undefined Character
// 2 column ranges
estimate_column_width(2, CSTR("\U0001f300")); // CYCLONE
estimate_column_width(2, CSTR("\U0001f64f")); // PERSON WITH FOLDED HANDS
estimate_column_width(
2, CSTR("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
estimate_column_width(2, CSTR("\U0001f9ff")); // NAZAR AMULET
estimate_column_width(
2, CSTR("\U00020000")); // <CJK Ideograph Extension B, First>
estimate_column_width(2, CSTR("\U0002fffd")); // Undefined Character
estimate_column_width(
2, CSTR("\U00030000")); // <CJK Ideograph Extension G, First>
estimate_column_width(2, CSTR("\U0003fffd")); // Undefined Character
}
// Test cases for __get_string_alignment when Unicode support is enabled.
// The arguments of every call are: expected offset, expected size, expected
// align flag, input, width, precision. A precision of -1 converts to the
// largest size_t value. Offsets written as "N + 2 * (sizeof(CharT) == 1)" are
// two code units larger when CharT is one byte wide.
template <class CharT>
constexpr void get_string_alignment() {
// Truncate the input.
get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);
// The 2-column character gets entirely rejected.
get_string_alignment(1, 1, false, CSTR("a\u115f"), 0, 2);
// Due to the requested width extra alignment is required.
get_string_alignment(1, 1, true, CSTR("a\u115f"), 2, 2);
// Same but for a 2-column 4-byte UTF-8 sequence
get_string_alignment(1, 1, false, CSTR("a\U0001f300"), 0, 2);
get_string_alignment(1, 1, true, CSTR("a\U0001f300"), 2, 2);
// No alignment required.
get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);
// Special case, we have a special character already parsed and have enough
// width to satisfy the minimum required width.
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 0, false, CSTR("ab\u1111"),
2, -1);
// Evaluates all so size ->4
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 4, false,
CSTR("a\u115fc") /* 2-column character */, 3, -1);
// Evaluates all so size ->4
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 4, false,
CSTR("a\u115fc") /* 2-column character */, 4, -1);
// Evaluates all so size ->5
get_string_alignment(4 + 2 * (sizeof(CharT) == 1), 5, false,
CSTR("a\u115fcd") /* 2-column character */, 4, -1);
// Evaluates all so size ->5
get_string_alignment(4 + 2 * (sizeof(CharT) == 1), 5, false,
CSTR("a\u115fcd") /* 2-column character */, 5, -1);
// Extend width
get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 3, true,
CSTR("a\u1160c") /* 1-column character */, 4, -1);
// In this case the threshold where the width is still determined.
get_string_alignment(2 + 2 * (sizeof(CharT) == 1), 3, false, CSTR("i\u1110"),
2, -1);
// The width is no longer exactly determined.
get_string_alignment(2 + 2 * (sizeof(CharT) == 1), 0, false, CSTR("i\u1110"),
1, -1);
// Extend width and truncate input.
get_string_alignment(1, 1, true, CSTR("abc"), 3, 1);
if constexpr (sizeof(CharT) == 1) {
// Corrupt UTF-8 sequence.
get_string_alignment(2, 2, false, CSTR("a\xc0"), 0, 3);
get_string_alignment(2, 2, false, CSTR("a\xe0"), 0, 3);
get_string_alignment(2, 2, false, CSTR("a\xf0"), 0, 3);
} else if constexpr (sizeof(CharT) == 2) {
// Corrupt UTF-16 sequence.
if constexpr (std::same_as<CharT, char16_t>)
get_string_alignment(2, 2, false, u"a\xdddd", 0, 3);
else
// Corrupt UTF-16 wchar_t sequence.
get_string_alignment(2, 2, false, L"a\xdddd", 0, 3);
}
// UTF-32 doesn't combine characters, thus no corruption tests.
}
// Runs all three test groups of this file for one character type.
template <class CharT>
constexpr void test() {
estimate_column_width_fast<CharT>();
estimate_column_width<CharT>();
get_string_alignment<CharT>();
}
// Runs the tests for every character type that is available in this
// configuration. Returns true so the function can be used in a static_assert.
constexpr bool test() {
test<char>();
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
test<wchar_t>();
#endif
#ifndef _LIBCPP_HAS_NO_CHAR8_T
test<char8_t>();
#endif
test<char16_t>();
test<char32_t>();
return true;
}
// Test entry point: evaluates test() once at run time and once at compile time.
int main(int, char**) {
test();
static_assert(test());
return 0;
}

View File

@ -0,0 +1,158 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-has-no-incomplete-format
// Force unicode to be disabled.
// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_HAS_NO_UNICODE
// <format>
// Tests Unicode is ignored and handled as ASCII.
#include <format>
#include <cassert>
#include <vector>
#include "test_macros.h"
// Note the comment annotations are from the Unicode test
// Formats single code points, centered with '*' as fill character, using
// char strings. In every assertion the requested width is the number of bytes
// of the input plus two and the expected output has exactly one fill
// character on each side, regardless of the "2 column" annotations.
static void test_char() {
//*** 1-byte code points ***
assert(std::format("{:*^3}", " ") == "* *");
assert(std::format("{:*^3}", "~") == "*~*");
//*** 2-byte code points ***
assert(std::format("{:*^4}", "\u00a1") == "*\u00a1*"); // INVERTED EXCLAMATION MARK
assert(std::format("{:*^4}", "\u07ff") == "*\u07ff*"); // NKO TAMAN SIGN
//*** 3-byte code points ***
assert(std::format("{:*^5}", "\u0800") == "*\u0800*"); // SAMARITAN LETTER ALAF
assert(std::format("{:*^5}", "\ufffd") == "*\ufffd*"); // REPLACEMENT CHARACTER
// 2 column ranges
assert(std::format("{:*^5}", "\u1100") == "*\u1100*"); // HANGUL CHOSEONG KIYEOK
assert(std::format("{:*^5}", "\u115f") == "*\u115f*"); // HANGUL CHOSEONG FILLER
assert(std::format("{:*^5}", "\u2329") == "*\u2329*"); // LEFT-POINTING ANGLE BRACKET
assert(std::format("{:*^5}", "\u232a") == "*\u232a*"); // RIGHT-POINTING ANGLE BRACKET
assert(std::format("{:*^5}", "\u2e80") == "*\u2e80*"); // CJK RADICAL REPEAT
assert(std::format("{:*^5}", "\u303e") == "*\u303e*"); // IDEOGRAPHIC VARIATION INDICATOR
assert(std::format("{:*^5}", "\u3040") == "*\u3040*"); // U+3041 HIRAGANA LETTER SMALL A
assert(std::format("{:*^5}", "\ua4cf") == "*\ua4cf*"); // U+A4D0 LISU LETTER BA
assert(std::format("{:*^5}", "\uac00") == "*\uac00*"); // <Hangul Syllable, First>
assert(std::format("{:*^5}", "\ud7a3") == "*\ud7a3*"); // Hangul Syllable Hih
assert(std::format("{:*^5}", "\uf900") == "*\uf900*"); // CJK COMPATIBILITY IDEOGRAPH-F900
assert(std::format("{:*^5}", "\ufaff") == "*\ufaff*"); // U+FB00 LATIN SMALL LIGATURE FF
assert(std::format("{:*^5}", "\ufe10") == "*\ufe10*"); // PRESENTATION FORM FOR VERTICAL COMMA
assert(std::format("{:*^5}", "\ufe19") == "*\ufe19*"); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
assert(std::format("{:*^5}", "\ufe30") == "*\ufe30*"); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
assert(std::format("{:*^5}", "\ufe6f") == "*\ufe6f*"); // U+FE70 ARABIC FATHATAN ISOLATED FORM
assert(std::format("{:*^5}", "\uff00") == "*\uff00*"); // U+FF01 FULLWIDTH EXCLAMATION MARK
assert(std::format("{:*^5}", "\uff60") == "*\uff60*"); // FULLWIDTH RIGHT WHITE PARENTHESIS
assert(std::format("{:*^5}", "\uffe0") == "*\uffe0*"); // FULLWIDTH CENT SIGN
assert(std::format("{:*^5}", "\uffe6") == "*\uffe6*"); // FULLWIDTH WON SIGN
//*** 4-byte code points ***
assert(std::format("{:*^6}", "\U00010000") == "*\U00010000*"); // LINEAR B SYLLABLE B008 A
assert(std::format("{:*^6}", "\U0010FFFF") == "*\U0010FFFF*"); // Undefined Character
// 2 column ranges
assert(std::format("{:*^6}", "\U0001f300") == "*\U0001f300*"); // CYCLONE
assert(std::format("{:*^6}", "\U0001f64f") == "*\U0001f64f*"); // PERSON WITH FOLDED HANDS
assert(std::format("{:*^6}", "\U0001f900") == "*\U0001f900*"); // CIRCLED CROSS FORMEE WITH FOUR DOTS
assert(std::format("{:*^6}", "\U0001f9ff") == "*\U0001f9ff*"); // NAZAR AMULET
assert(std::format("{:*^6}", "\U00020000") == "*\U00020000*"); // <CJK Ideograph Extension B, First>
assert(std::format("{:*^6}", "\U0002fffd") == "*\U0002fffd*"); // Undefined Character
assert(std::format("{:*^6}", "\U00030000") == "*\U00030000*"); // <CJK Ideograph Extension G, First>
assert(std::format("{:*^6}", "\U0003fffd") == "*\U0003fffd*"); // Undefined Character
}
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
// Note the comment annotations are from the Unicode test
// Wide-character counterpart of the char test: formats single code points,
// centered with '*' as fill character and a requested width of 3. Every
// assertion expects exactly one fill character on each side.
// Nothing is tested when wchar_t is two bytes wide (see the TODO below).
// NOTE(review): the fixed width of 3 presumably relies on every literal being
// a single wchar_t when wchar_t is wider than two bytes - confirm.
static void test_wchar_t() {
if constexpr (sizeof(wchar_t) == 2) {
// TODO FMT Add these tests.
} else {
//*** 1-byte code points ***
assert(std::format(L"{:*^3}", L" ") == L"* *");
assert(std::format(L"{:*^3}", L"~") == L"*~*");
//*** 2-byte code points ***
assert(std::format(L"{:*^3}", L"\u00a1") == L"*\u00a1*"); // INVERTED EXCLAMATION MARK
assert(std::format(L"{:*^3}", L"\u07ff") == L"*\u07ff*"); // NKO TAMAN SIGN
//*** 3-byte code points ***
assert(std::format(L"{:*^3}", L"\u0800") == L"*\u0800*"); // SAMARITAN LETTER ALAF
assert(std::format(L"{:*^3}", L"\ufffd") == L"*\ufffd*"); // REPLACEMENT CHARACTER
// 2 column ranges
assert(std::format(L"{:*^3}", L"\u1100") == L"*\u1100*"); // HANGUL CHOSEONG KIYEOK
assert(std::format(L"{:*^3}", L"\u115f") == L"*\u115f*"); // HANGUL CHOSEONG FILLER
assert(std::format(L"{:*^3}", L"\u2329") == L"*\u2329*"); // LEFT-POINTING ANGLE BRACKET
assert(std::format(L"{:*^3}", L"\u232a") == L"*\u232a*"); // RIGHT-POINTING ANGLE BRACKET
assert(std::format(L"{:*^3}", L"\u2e80") == L"*\u2e80*"); // CJK RADICAL REPEAT
assert(std::format(L"{:*^3}", L"\u303e") == L"*\u303e*"); // IDEOGRAPHIC VARIATION INDICATOR
assert(std::format(L"{:*^3}", L"\u3040") == L"*\u3040*"); // U+3041 HIRAGANA LETTER SMALL A
assert(std::format(L"{:*^3}", L"\ua4cf") == L"*\ua4cf*"); // U+A4D0 LISU LETTER BA
assert(std::format(L"{:*^3}", L"\uac00") == L"*\uac00*"); // <Hangul Syllable, First>
assert(std::format(L"{:*^3}", L"\ud7a3") == L"*\ud7a3*"); // Hangul Syllable Hih
assert(std::format(L"{:*^3}", L"\uf900") == L"*\uf900*"); // CJK COMPATIBILITY IDEOGRAPH-F900
assert(std::format(L"{:*^3}", L"\ufaff") == L"*\ufaff*"); // U+FB00 LATIN SMALL LIGATURE FF
assert(std::format(L"{:*^3}", L"\ufe10") == L"*\ufe10*"); // PRESENTATION FORM FOR VERTICAL COMMA
assert(std::format(L"{:*^3}", L"\ufe19") == L"*\ufe19*"); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
assert(std::format(L"{:*^3}", L"\ufe30") == L"*\ufe30*"); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
assert(std::format(L"{:*^3}", L"\ufe6f") == L"*\ufe6f*"); // U+FE70 ARABIC FATHATAN ISOLATED FORM
assert(std::format(L"{:*^3}", L"\uff00") == L"*\uff00*"); // U+FF01 FULLWIDTH EXCLAMATION MARK
assert(std::format(L"{:*^3}", L"\uff60") == L"*\uff60*"); // FULLWIDTH RIGHT WHITE PARENTHESIS
assert(std::format(L"{:*^3}", L"\uffe0") == L"*\uffe0*"); // FULLWIDTH CENT SIGN
assert(std::format(L"{:*^3}", L"\uffe6") == L"*\uffe6*"); // FULLWIDTH WON SIGN
//*** 4-byte code points ***
assert(std::format(L"{:*^3}", L"\U00010000") == L"*\U00010000*"); // LINEAR B SYLLABLE B008 A
assert(std::format(L"{:*^3}", L"\U0010FFFF") == L"*\U0010FFFF*"); // Undefined Character
// 2 column ranges
assert(std::format(L"{:*^3}", L"\U0001f300") == L"*\U0001f300*"); // CYCLONE
assert(std::format(L"{:*^3}", L"\U0001f64f") == L"*\U0001f64f*"); // PERSON WITH FOLDED HANDS
assert(std::format(L"{:*^3}", L"\U0001f900") == L"*\U0001f900*"); // CIRCLED CROSS FORMEE WITH FOUR DOTS
assert(std::format(L"{:*^3}", L"\U0001f9ff") == L"*\U0001f9ff*"); // NAZAR AMULET
assert(std::format(L"{:*^3}", L"\U00020000") == L"*\U00020000*"); // <CJK Ideograph Extension B, First>
assert(std::format(L"{:*^3}", L"\U0002fffd") == L"*\U0002fffd*"); // Undefined Character
assert(std::format(L"{:*^3}", L"\U00030000") == L"*\U00030000*"); // <CJK Ideograph Extension G, First>
assert(std::format(L"{:*^3}", L"\U0003fffd") == L"*\U0003fffd*"); // Undefined Character
}
}
#endif
// Test entry point: runs the char tests and, when wide characters are
// available, the wchar_t tests. Everything is evaluated at run time only.
int main(int, char**) {
test_char();
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
test_wchar_t();
#endif
return 0;
}

View File

@ -293,8 +293,8 @@ void format_test_string(const W& world, const U& universe, TestFunction check, E
}
template <class CharT, class TestFunction>
void format_test_string_unicode(TestFunction check) {
(void)check;
void format_test_string_unicode([[maybe_unused]] TestFunction check) {
// unicode.pass.cpp and ascii.pass.cpp have additional tests.
#ifndef TEST_HAS_NO_UNICODE
// Make sure all possible types are tested. For clarity don't use macros.
if constexpr (std::same_as<CharT, char>) {
@ -332,7 +332,7 @@ void format_test_string_unicode(TestFunction check) {
check.template operator()<"{:*^5}">(SV("*aßc*"), std::wstring_view(L"aßc"));
check.template operator()<"{:*^4.2}">(SV("*aß*"), std::wstring_view(L"aßc"));
}
# endif
# endif // TEST_HAS_NO_WIDE_CHARACTERS
// ß requires one column
check.template operator()<"{}">(SV("aßc"), STR("aßc"));
@ -365,6 +365,25 @@ void format_test_string_unicode(TestFunction check) {
check.template operator()<"{:-<7}">(SV("a\u1110c---"), STR("a\u1110c"));
check.template operator()<"{:-^7}">(SV("-a\u1110c--"), STR("a\u1110c"));
check.template operator()<"{:->7}">(SV("---a\u1110c"), STR("a\u1110c"));
// Examples used in P1868R2
check.template operator()<"{:*^3}">(SV("*\u0041*"), STR("\u0041")); // { LATIN CAPITAL LETTER A }
check.template operator()<"{:*^3}">(SV("*\u00c1*"), STR("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
check.template operator()<"{:*^3}">(
SV("*\u0041\u0301*"),
STR("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
check.template operator()<"{:*^3}">(SV("*\u0132*"), STR("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
check.template operator()<"{:*^3}">(SV("*\u0394*"), STR("\u0394")); // { GREEK CAPITAL LETTER DELTA }
check.template operator()<"{:*^3}">(SV("*\u0429*"), STR("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
check.template operator()<"{:*^3}">(SV("*\u05d0*"), STR("\u05d0")); // { HEBREW LETTER ALEF }
check.template operator()<"{:*^3}">(SV("*\u0634*"), STR("\u0634")); // { ARABIC LETTER SHEEN }
check.template operator()<"{:*^4}">(SV("*\u3009*"), STR("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
check.template operator()<"{:*^4}">(SV("*\u754c*"), STR("\u754c")); // { CJK Unified Ideograph-754C }
check.template operator()<"{:*^4}">(SV("*\U0001f921*"), STR("\U0001f921")); // { CLOWN FACE }
check.template operator()<"{:*^4}">(
SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
STR("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
#endif // TEST_HAS_NO_UNICODE
}
@ -2580,9 +2599,8 @@ void format_tests(TestFunction check, ExceptionTest check_exception) {
// *** Test char format argument ***
// The `char` to `wchar_t` formatting is tested separately.
check.template operator()<"hello {}{}{}{}{}{}{}">(SV("hello 09azAZ!"), CharT('0'), CharT('9'), CharT('a'), CharT('z'),
CharT('A'), CharT('Z'), CharT('!'));
check.template operator()<"hello {}{}{}{}{}{}{}">(
SV("hello 09azAZ!"), CharT('0'), CharT('9'), CharT('a'), CharT('z'), CharT('A'), CharT('Z'), CharT('!'));
format_test_char<CharT>(check, check_exception);
format_test_char_as_integer<CharT>(check, check_exception);

View File

@ -0,0 +1,301 @@
//===----------------------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
// UNSUPPORTED: libcpp-has-no-incomplete-format
// This version runs the test when the platform has Unicode support.
// UNSUPPORTED: libcpp-has-no-unicode
// <format>
// Tests the Unicode width support of the standard format specifiers.
// It tests [format.string.std]/8 - 11:
// - Properly determining the estimated width of a Unicode string.
// - Properly truncating to the wanted maximum width.
// More specific extended grapheme cluster boundary rules are tested in
// test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
// this test is based on test data provided by the Unicode Consortium.
#include <format>
#include <cassert>
#include <vector>
#include "make_string.h"
#include "test_macros.h"
#include "string_literal.h"
#ifndef TEST_HAS_NO_LOCALIZATION
# include <iostream>
# include <type_traits>
#endif
#define SV(S) MAKE_STRING_VIEW(CharT, S)
// Formats `args` using the compile-time format string `fmt` and asserts that
// the result is identical to `expected`.
//
// When localization is available and the character type is `char`, a mismatch
// is first reported on std::cerr so the failing format string is visible
// before the assertion fires.
auto check = []<string_literal fmt, class CharT, class... Args>(
                 std::basic_string_view<CharT> expected, const Args&... args) constexpr {
  const std::basic_string<CharT> actual = std::format(fmt.template sv<CharT>(), args...);
#ifndef TEST_HAS_NO_LOCALIZATION
  if constexpr (std::same_as<CharT, char>) {
    if (actual != expected) {
      std::cerr << "\nFormat string " << fmt.template sv<char>();
      std::cerr << "\nExpected output " << expected;
      std::cerr << "\nActual output " << actual << '\n';
    }
  }
#endif
  assert(actual == expected);
};
// Tests the estimated width of a single code point by centering it in a field
// that uses '*' as the fill character.
//
// Code points expected to occupy one column are formatted with a width of 3,
// code points expected to occupy two columns with a width of 4. Every expected
// output therefore has exactly one fill character on each side of the input.
//
// NOTE(review): several trailing comments name the code point directly next
// to the one tested (for example \u3040 is annotated with U+3041). Presumably
// the tested values are the bounds of the two-column ranges and the bound
// itself has no name -- confirm against [format.string.std].
template <class CharT>
static void test_single_code_point_fill() {
//*** 1-byte code points ***
check.template operator()<"{:*^3}">(SV("* *"), SV(" "));
check.template operator()<"{:*^3}">(SV("*~*"), SV("~"));
//*** 2-byte code points ***
check.template operator()<"{:*^3}">(SV("*\u00a1*"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
check.template operator()<"{:*^3}">(SV("*\u07ff*"), SV("\u07ff")); // NKO TAMAN SIGN
//*** 3-byte code points ***
check.template operator()<"{:*^3}">(SV("*\u0800*"), SV("\u0800")); // SAMARITAN LETTER ALAF
check.template operator()<"{:*^3}">(SV("*\ufffd*"), SV("\ufffd")); // REPLACEMENT CHARACTER
// 2 column ranges
check.template operator()<"{:*^4}">(SV("*\u1100*"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
check.template operator()<"{:*^4}">(SV("*\u115f*"), SV("\u115f")); // HANGUL CHOSEONG FILLER
check.template operator()<"{:*^4}">(SV("*\u2329*"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
check.template operator()<"{:*^4}">(SV("*\u232a*"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET
check.template operator()<"{:*^4}">(SV("*\u2e80*"), SV("\u2e80")); // CJK RADICAL REPEAT
check.template operator()<"{:*^4}">(SV("*\u303e*"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
check.template operator()<"{:*^4}">(SV("*\u3040*"), SV("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
check.template operator()<"{:*^4}">(SV("*\ua4cf*"), SV("\ua4cf")); // U+A4D0 LISU LETTER BA
check.template operator()<"{:*^4}">(SV("*\uac00*"), SV("\uac00")); // <Hangul Syllable, First>
check.template operator()<"{:*^4}">(SV("*\ud7a3*"), SV("\ud7a3")); // Hangul Syllable Hih
check.template operator()<"{:*^4}">(SV("*\uf900*"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
check.template operator()<"{:*^4}">(SV("*\ufaff*"), SV("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
check.template operator()<"{:*^4}">(SV("*\ufe10*"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
check.template
operator()<"{:*^4}">(SV("*\ufe19*"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
check.template operator()<"{:*^4}">(SV("*\ufe30*"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
check.template operator()<"{:*^4}">(SV("*\ufe6f*"), SV("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
check.template operator()<"{:*^4}">(SV("*\uff00*"), SV("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
check.template operator()<"{:*^4}">(SV("*\uff60*"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
check.template operator()<"{:*^4}">(SV("*\uffe0*"), SV("\uffe0")); // FULLWIDTH CENT SIGN
check.template operator()<"{:*^4}">(SV("*\uffe6*"), SV("\uffe6")); // FULLWIDTH WON SIGN
//*** 4-byte code points ***
check.template operator()<"{:*^3}">(SV("*\U00010000*"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
check.template operator()<"{:*^3}">(SV("*\U0010FFFF*"), SV("\U0010FFFF")); // Undefined Character
// 2 column ranges
check.template operator()<"{:*^4}">(SV("*\U0001f300*"), SV("\U0001f300")); // CYCLONE
check.template operator()<"{:*^4}">(SV("*\U0001f64f*"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
check.template operator()<"{:*^4}">(SV("*\U0001f900*"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
check.template operator()<"{:*^4}">(SV("*\U0001f9ff*"), SV("\U0001f9ff")); // NAZAR AMULET
check.template operator()<"{:*^4}">(SV("*\U00020000*"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
check.template operator()<"{:*^4}">(SV("*\U0002fffd*"), SV("\U0002fffd")); // Undefined Character
check.template operator()<"{:*^4}">(SV("*\U00030000*"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
check.template operator()<"{:*^4}">(SV("*\U0003fffd*"), SV("\U0003fffd")); // Undefined Character
}
// One column output is unaffected.
// Two column output is removed, thus the result is only the fill character.
// Tests truncation of a single code point. Every check uses a field width of
// 3, a precision of 1, and '*' as the fill character:
// - a code point expected to take one column fits in the precision and is
//   written between two fill characters;
// - a code point expected to take two columns does not fit, so nothing of it
//   is written and the output is three fill characters.
template <class CharT>
static void test_single_code_point_truncate() {
//*** 1-byte code points ***
check.template operator()<"{:*^3.1}">(SV("* *"), SV(" "));
check.template operator()<"{:*^3.1}">(SV("*~*"), SV("~"));
//*** 2-byte code points ***
check.template operator()<"{:*^3.1}">(SV("*\u00a1*"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
check.template operator()<"{:*^3.1}">(SV("*\u07ff*"), SV("\u07ff")); // NKO TAMAN SIGN
//*** 3-byte code points ***
check.template operator()<"{:*^3.1}">(SV("*\u0800*"), SV("\u0800")); // SAMARITAN LETTER ALAF
check.template operator()<"{:*^3.1}">(SV("*\ufffd*"), SV("\ufffd")); // REPLACEMENT CHARACTER
// 2 column ranges
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u115f")); // HANGUL CHOSEONG FILLER
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u2e80")); // CJK RADICAL REPEAT
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ua4cf")); // U+A4D0 LISU LETTER BA
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uac00")); // <Hangul Syllable, First>
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ud7a3")); // Hangul Syllable Hih
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uffe0")); // FULLWIDTH CENT SIGN
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uffe6")); // FULLWIDTH WON SIGN
//*** 4-byte code points ***
check.template operator()<"{:*^3.1}">(SV("*\U00010000*"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
check.template operator()<"{:*^3.1}">(SV("*\U0010FFFF*"), SV("\U0010FFFF")); // Undefined Character
// 2 column ranges
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f300")); // CYCLONE
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f9ff")); // NAZAR AMULET
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0002fffd")); // Undefined Character
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0003fffd")); // Undefined Character
}
// Tests the examples used in paper P1868 (width: clarifying units of width
// and precision in std::format).
//
// Each example is centered with '*' as the fill character in three modes:
// - no precision: the width is 3 for one-column and 4 for two-column examples;
// - precision 1: one-column examples are kept, two-column examples are dropped
//   entirely, leaving only fill characters;
// - precision 2: every example fits and is written in full.
// Examples consisting of several code points (A + combining acute accent, the
// family emoji sequence) are expected to be kept or dropped as one unit.
template <class CharT>
static void test_P1868() {
// Fill
check.template operator()<"{:*^3}">(SV("*\u0041*"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
check.template operator()<"{:*^3}">(SV("*\u00c1*"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
check.template operator()<"{:*^3}">(
SV("*\u0041\u0301*"),
SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
check.template operator()<"{:*^3}">(SV("*\u0132*"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
check.template operator()<"{:*^3}">(SV("*\u0394*"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
check.template operator()<"{:*^3}">(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
check.template operator()<"{:*^3}">(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
check.template operator()<"{:*^3}">(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
check.template operator()<"{:*^4}">(SV("*\u3009*"), SV("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
check.template operator()<"{:*^4}">(SV("*\u754c*"), SV("\u754c")); // { CJK Unified Ideograph-754C }
check.template operator()<"{:*^4}">(SV("*\U0001f921*"), SV("\U0001f921")); // { UNICORN FACE }
check.template operator()<"{:*^4}">(
SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
// Truncate to 1 column: 1 column grapheme clusters are kept together.
check.template operator()<"{:*^3.1}">(SV("*\u0041*"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
check.template operator()<"{:*^3.1}">(SV("*\u00c1*"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
check.template operator()<"{:*^3.1}">(
SV("*\u0041\u0301*"),
SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
check.template operator()<"{:*^3.1}">(SV("*\u0132*"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
check.template operator()<"{:*^3.1}">(SV("*\u0394*"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
check.template operator()<"{:*^3.1}">(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
check.template operator()<"{:*^3.1}">(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
check.template operator()<"{:*^3.1}">(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u754c")); // { CJK Unified Ideograph-754C }
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f921")); // { UNICORN FACE }
check.template operator()<"{:*^3.1}">(
SV("***"),
SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
// Truncate to 2 column: 2 column grapheme clusters are kept together.
check.template operator()<"{:*^3.2}">(SV("*\u0041*"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
check.template operator()<"{:*^3.2}">(SV("*\u00c1*"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
check.template operator()<"{:*^3.2}">(
SV("*\u0041\u0301*"),
SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
check.template operator()<"{:*^3.2}">(SV("*\u0132*"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
check.template operator()<"{:*^3.2}">(SV("*\u0394*"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
check.template operator()<"{:*^3.2}">(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
check.template operator()<"{:*^3.2}">(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
check.template operator()<"{:*^3.2}">(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
check.template operator()<"{:*^4.2}">(SV("*\u3009*"), SV("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
check.template operator()<"{:*^4.2}">(SV("*\u754c*"), SV("\u754c")); // { CJK Unified Ideograph-754C }
check.template operator()<"{:*^4.2}">(SV("*\U0001f921*"), SV("\U0001f921")); // { UNICORN FACE }
check.template operator()<"{:*^4.2}">(
SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
}
#ifdef _LIBCPP_VERSION
// Tests the libc++ specific behaviour for malformed UTF-sequences. The
// Standard doesn't specify how to handle this.
//
// In every check the requested width equals the number of code units in the
// input plus two, and the expected output has exactly one '*' on each side.
// In other words: every code unit of a malformed sequence is expected to
// contribute one column to the estimated width and is copied to the output
// unchanged.
//
// Only character types with a size of 1 are exercised; the branch for
// character types with a size of 2 is still a TODO.
template <class CharT>
static void test_malformed_code_point() {
if constexpr (sizeof(CharT) == 1) {
// Malformed at end.
check.template operator()<"{:*^7}">(SV("*ZZZZ\x8f*"), SV("ZZZZ\x8f"));
check.template operator()<"{:*^7}">(SV("*ZZZZ\xcf*"), SV("ZZZZ\xcf"));
check.template operator()<"{:*^7}">(SV("*ZZZZ\xef*"), SV("ZZZZ\xef"));
check.template operator()<"{:*^7}">(SV("*ZZZZ\xff*"), SV("ZZZZ\xff"));
// Malformed in middle, no continuation
check.template operator()<"{:*^8}">(SV("*ZZZZ\x8fZ*"), SV("ZZZZ\x8fZ"));
check.template operator()<"{:*^8}">(SV("*ZZZZ\xcfZ*"), SV("ZZZZ\xcfZ"));
check.template operator()<"{:*^8}">(SV("*ZZZZ\xefZ*"), SV("ZZZZ\xefZ"));
check.template operator()<"{:*^8}">(SV("*ZZZZ\xffZ*"), SV("ZZZZ\xffZ"));
check.template operator()<"{:*^9}">(SV("*ZZZZ\x8fZZ*"), SV("ZZZZ\x8fZZ"));
check.template operator()<"{:*^9}">(SV("*ZZZZ\xcfZZ*"), SV("ZZZZ\xcfZZ"));
check.template operator()<"{:*^9}">(SV("*ZZZZ\xefZZ*"), SV("ZZZZ\xefZZ"));
check.template operator()<"{:*^9}">(SV("*ZZZZ\xffZZ*"), SV("ZZZZ\xffZZ"));
check.template operator()<"{:*^10}">(SV("*ZZZZ\x8fZZZ*"), SV("ZZZZ\x8fZZZ"));
check.template operator()<"{:*^10}">(SV("*ZZZZ\xcfZZZ*"), SV("ZZZZ\xcfZZZ"));
check.template operator()<"{:*^10}">(SV("*ZZZZ\xefZZZ*"), SV("ZZZZ\xefZZZ"));
check.template operator()<"{:*^10}">(SV("*ZZZZ\xffZZZ*"), SV("ZZZZ\xffZZZ"));
check.template operator()<"{:*^11}">(SV("*ZZZZ\x8fZZZZ*"), SV("ZZZZ\x8fZZZZ"));
check.template operator()<"{:*^11}">(SV("*ZZZZ\xcfZZZZ*"), SV("ZZZZ\xcfZZZZ"));
check.template operator()<"{:*^11}">(SV("*ZZZZ\xefZZZZ*"), SV("ZZZZ\xefZZZZ"));
check.template operator()<"{:*^11}">(SV("*ZZZZ\xffZZZZ*"), SV("ZZZZ\xffZZZZ"));
// Premature end.
// NOTE(review): presumably a lead byte followed by bytes that are not valid
// continuation bytes -- confirm.
check.template operator()<"{:*^8}">(SV("*ZZZZ\xef\xf5*"), SV("ZZZZ\xef\xf5"));
check.template operator()<"{:*^12}">(SV("*ZZZZ\xef\xf5ZZZZ*"), SV("ZZZZ\xef\xf5ZZZZ"));
check.template operator()<"{:*^9}">(SV("*ZZZZ\xff\xf5\xf5*"), SV("ZZZZ\xff\xf5\xf5"));
check.template operator()<"{:*^13}">(SV("*ZZZZ\xff\xf5\xf5ZZZZ*"), SV("ZZZZ\xff\xf5\xf5ZZZZ"));
} else if constexpr (sizeof(CharT) == 2) {
// TODO FMT Add these tests.
}
// UTF-32 doesn't combine characters, thus no corruption tests.
}
#endif
// Runs all width tests for the character type CharT.
//
// The malformed code point test checks behaviour the Standard leaves
// unspecified, so it only runs when testing libc++ (_LIBCPP_VERSION defined).
template <class CharT>
static void test() {
test_single_code_point_fill<CharT>();
test_single_code_point_truncate<CharT>();
test_P1868<CharT>();
#ifdef _LIBCPP_VERSION
test_malformed_code_point<CharT>();
#endif
}
// Entry point: runs the tests for char and, unless the platform has no wide
// character support (TEST_HAS_NO_WIDE_CHARACTERS), for wchar_t as well.
int main(int, char**) {
test<char>();
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
test<wchar_t>();
#endif
return 0;
}

View File

@ -0,0 +1,326 @@
#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from io import StringIO
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional
import re
@dataclass
class PropertyRange:
    """An inclusive range of code points that share one property value."""

    # First code point of the range; -1 until the range has been parsed.
    lower: int = -1
    # Last code point of the range (inclusive); -1 until parsed.
    upper: int = -1
    # Property name as written in the data file; None until parsed.
    prop: Optional[str] = None
@dataclass
class Entry:
    """One row of the generated table, before it is packed into 32 bits."""

    # First code point covered by the entry.
    lower: int = -1
    # Distance from `lower` to the last code point covered by the entry.
    offset: int = -1
    # Index of the property in the sorted list of property names.
    prop: int = -1
# Matches a data line of a Unicode property file:
#   <lower>[..<upper>] ; <property>
# Code points are written with 4 to 6 upper-case hexadecimal digits (the
# largest code point is 10FFFF), so 6-digit values must be accepted too;
# otherwise such lines would be skipped silently as if they were comments.
LINE_REGEX = re.compile(
    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
)
def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
    """
    Parses one line of a Unicode property data file.

    Returns the code point range and property described by the line, or
    None when the line does not match LINE_REGEX (for example comment and
    empty lines). A line with a single code point yields a range whose
    lower and upper bound are equal.
    """
    match = LINE_REGEX.match(inputLine)
    if not match:
        return None

    first, last, name = match.group("lower", "upper", "prop")
    parsed = PropertyRange()
    parsed.prop = name
    parsed.lower = int(first, base=16)
    parsed.upper = parsed.lower if last is None else int(last, base=16)
    return parsed
def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
    """
    Merges consecutive ranges with the same property into one range.

    Two neighbouring ranges are merged when they carry the same property
    and the second starts directly after the first ends. Fewer ranges mean
    a smaller output table, which reduces the binary size and improves the
    lookup performance.

    The first range of every merged run is extended in place, so the
    elements of `input` may be modified.
    """
    merged = []
    for current in input:
        previous = merged[-1] if merged else None
        if (
            previous is not None
            and previous.prop == current.prop
            and previous.upper + 1 == current.lower
        ):
            # Directly adjacent with the same property: grow the last range.
            previous.upper = current.upper
        else:
            merged.append(current)
    return merged
# Format string that turns a property name into a C++ enumerator name.
PROP_VALUE_ENUMERATOR_TEMPLATE = "__{}"

# Format string for the generated C++ `__property` enumeration. The
# `enumerators` field receives the comma separated enumerators generated from
# the data files; the remaining enumerators are fixed.
PROP_VALUE_ENUM_TEMPLATE = """
enum class __property : uint8_t {{
// Values generated from the data files.
{enumerators},
// The properties below aren't stored in the "database".
// Text position properties.
__sot,
__eot,
// The code unit has none of above properties.
__none
}};
"""
# Format string for the generated C++ data table and its lookup function.
# The `size` field receives the number of table entries and the `entries`
# field the comma separated, already packed 32-bit values.
DATA_ARRAY_TEMPLATE = """
/// The entries of the extended grapheme cluster boundary property table.
///
/// The data is generated from
/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
///
/// The data has 3 values
/// - bits [0, 3] The property. One of the values generated from the data files
/// of \\ref __property
/// - bits [4, 10] The size of the range.
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
/// the range is lower bound + size.
///
/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
/// in the Unicode tables are larger. They are stored in multiple consecutive
/// ranges in the data table. An alternative would be to store the sizes in a
/// separate 16-bit value. The original MSVC STL code had such an approach, but
/// this approach uses less space for the data and is about 4% faster in the
/// following benchmark.
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
inline constexpr uint32_t __entries[{size}] = {{{entries}}};
/// Returns the extended grapheme cluster boundary property of a code point.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
// TODO FMT use std::ranges::upper_bound.
// The algorithm searches for the upper bound of the range and, when found,
// steps back one entry. This algorithm is used since the code point can be
// anywhere in the range. After a lower bound is found the next step is to
// compare whether the code unit is indeed in the range.
//
// Since the entry contains a code unit, size, and property the code point
// being sought needs to be adjusted. Just shifting the code point to the
// proper position doesn't work; suppose an entry has property 0, size 1,
// and lower bound 3. This results in the entry 0x1810.
// When searching for code point 3 it will search for 0x1800, find 0x1810
// and moves to the previous entry. Thus the lower bound value will never
// be found.
// The simple solution is to set the bits belonging to the property and
// size. Then the upper bound for code point 3 will return the entry after
// 0x1810. After moving to the previous entry the algorithm arrives at the
// correct entry.
ptrdiff_t __i = std::upper_bound(__entries, std::end(__entries), (__code_point << 11) | 0x7ffu) - __entries;
if (__i == 0)
return __property::__none;
--__i;
uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
if (__code_point <= __upper_bound)
return static_cast<__property>(__entries[__i] & 0xf);
return __property::__none;
}}
"""
# Format string for the complete generated header: license texts, include
# guard, includes and the namespace that wraps the generated `content`.
MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_table.py
// DO NOT MODIFY!
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.
#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
#include <__algorithm/upper_bound.h>
#include <__config>
#include <__iterator/access.h>
#include <cstddef>
#include <cstdint>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER > 17
namespace __extended_grapheme_custer_property_boundary {{
{content}
}} // __extended_grapheme_custer_property_boundary
#endif //_LIBCPP_STD_VER > 17
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
"""
def property_ranges_to_table(
    ranges: list[PropertyRange], props: list[str]
) -> list[Entry]:
    """
    Converts the property ranges to the entries of the output table.

    The ranges are processed in ascending order of their lower bound and
    must not overlap. The offset of an entry is limited to 127, so a range
    covering more than 128 code points is split into multiple consecutive
    entries. The property of an entry is the index of the range's property
    name in `props`; it has to fit in 4 bits.

    The elements of `ranges` are not modified.
    """
    assert len(props) < 16
    result = list[Entry]()
    high = -1
    for item in sorted(ranges, key=lambda x: x.lower):
        # Validate overlapping ranges
        assert item.lower > high
        high = item.upper

        prop = props.index(item.prop)
        # Work on a copy of the lower bound so the caller's range stays intact.
        lower = item.lower
        while item.upper - lower > 127:
            result.append(Entry(lower, 127, prop))
            lower += 128
        result.append(Entry(lower, item.upper - lower, prop))
    return result
# Format string for one packed table entry: a zero-padded 32-bit hex literal.
cpp_entrytemplate = "0x{:08x}"


def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
    """
    Generates the C++ code for the property enumeration and the data table.

    The enumerators are the sorted, distinct property names found in
    `ranges`. Every table entry is packed in one value: the lower bound is
    shifted left by 11 bits, the offset by 4 bits, and the low bits hold the
    property index.
    """
    prop_values = sorted({x.prop for x in ranges})
    table = property_ranges_to_table(ranges, prop_values)

    enumerators = ",".join(
        PROP_VALUE_ENUMERATOR_TEMPLATE.format(name) for name in prop_values
    )
    packed_entries = ",".join(
        cpp_entrytemplate.format(entry.lower << 11 | entry.offset << 4 | entry.prop)
        for entry in table
    )

    enum_code = PROP_VALUE_ENUM_TEMPLATE.format(enumerators=enumerators)
    array_code = DATA_ARRAY_TEMPLATE.format(
        prop_name=prop_name,
        size=len(table),
        entries=packed_entries,
    )
    return enum_code + array_code
def generate_data_tables() -> str:
    """
    Generate Unicode data for inclusion into <format> from
    GraphemeBreakProperty.txt and emoji-data.txt.

    GraphemeBreakProperty.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

    emoji-data.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt

    Both files are expected to be in the same directory as this script.

    From emoji-data.txt only the ranges with the Extended_Pictographic
    property are used.
    """
    script_path = Path(__file__).absolute()
    gbp_data_path = script_path.with_name("GraphemeBreakProperty.txt")
    emoji_data_path = script_path.with_name("emoji-data.txt")

    with gbp_data_path.open(encoding="utf-8") as f:
        gbp_ranges = compactPropertyRanges(
            [x for line in f if (x := parsePropertyLine(line))]
        )
    with emoji_data_path.open(encoding="utf-8") as f:
        emoji_ranges = compactPropertyRanges(
            [x for line in f if (x := parsePropertyLine(line))]
        )

    gbp_ranges.extend(x for x in emoji_ranges if x.prop == "Extended_Pictographic")
    return generate_cpp_data("Grapheme_Break", gbp_ranges)
if __name__ == "__main__":
    # The template starts with a newline for readability of this script;
    # strip it so the generated header starts with its first comment line.
    header_template = MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip()
    print(header_template.format(content=generate_data_tables()))

View File

@ -0,0 +1,245 @@
#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, TextIO
from array import array
@dataclass
class BreakTestItem:
    """The data parsed from one line of the grapheme break test file."""

    # The first code point of every extended grapheme cluster in the line.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line, each written as a \UXXXXXXXX escape.
    encoded: str = ""
    # Number of code units before every break, per encoding.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
class CommentLine:
    """Marker type without members; not referenced in the visible part of this script."""


class EOF:
    """Marker type without members; not referenced in the visible part of this script."""
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """
    Parses one line of the grapheme break test file.

    A line consists of hexadecimal code points separated by a DIVISION SIGN
    (a break) or a MULTIPLICATION SIGN (no break), optionally followed by a
    comment introduced by "#".

    Returns the parsed item when the end of the line or a comment is
    reached; a line containing only a comment yields an empty item. Returns
    None when the end of the input is reached.

    The reads from `input` are done outside the assert statements, so the
    parser keeps consuming the separators when assertions are disabled
    (python -O).
    """
    result = BreakTestItem()
    code_point = -1
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # A break: store the cluster that ends here. The line starts with
            # a division sign; at that point there is no cluster to store.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)

            separator = input.read(1)
            assert separator.isspace()
            continue

        if c == "\N{MULTIPLICATION SIGN}":
            # No break: the next code point belongs to the current cluster.
            separator = input.read(1)
            assert separator.isspace()
            continue

        if c.isalnum():
            # Collect the remaining digits of the hexadecimal code point.
            while following := input.read(1):
                if following.isalnum():
                    c += following
                else:
                    assert following.isspace()
                    break

            i = int(c, base=16)
            # Only the first code point of a cluster is recorded.
            if code_point == -1:
                code_point = i

            result.encoded += f"\\U{i:08x}"
            c = chr(i)
            utf8 += len(c.encode())
            # Since we only care about the number of code units the byte order
            # doesn't matter. The byte order is specified to avoid the BOM
            utf16 += len(c.encode("utf-16-le")) // 2
            utf32 += len(c.encode("utf-32-le")) // 4
            continue

        if c == "#":
            # Skip the comment; it runs to the end of the line.
            input.readline()
            return result
        if c == "\n":
            return result
        if c == "":
            return None

        # Raised explicitly so unexpected input is also rejected under -O.
        raise AssertionError(f"unexpected character {c!r}")
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// WARNING, this entire header is generated by
// utiles/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.
#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#include <array>
#include <string_view>
#include <vector>
template <class CharT>
struct data {{
/// The input to parse.
std::basic_string_view<CharT> input;
/// The first code point of all extended grapheme clusters in the input.
std::vector<char32_t> code_points;
/// The offset of the last code units of the extended grapheme clusters in the input.
///
/// The vector has the same number of entries as \\ref code_points.
std::vector<size_t> breaks;
}};
/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{ {1} }}}};
/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf16 = {{{{ {2} }}}};
/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{ {3} }}}};
#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
"""
# Layout of one C++ aggregate initializer for a ``data<CharT>`` entry:
#   { <string literal>, { <code points> }, { <break offsets> } }
# Doubled braces are literal braces for ``str.format``; the three ``{}``
# placeholders are filled positionally.
cpp_test_data_line_template = "{{ {}, {{ {} }}, {{ {} }} }}"


def _lineToCppDataLine(literal_prefix: str, line: BreakTestItem, breaks) -> str:
    """Render one test item as a C++ initializer using the shared template.

    literal_prefix is placed directly in front of the opening quote of the
    string literal ("" for a narrow literal, "L" for a wide one). breaks is
    the list of break offsets to emit for this encoding.
    """
    return cpp_test_data_line_template.format(
        f'{literal_prefix}"{line.encoded}"',
        ",".join(map(str, line.code_points)),
        ",".join(map(str, breaks)),
    )


def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Return the initializer for a ``data<char>`` entry (narrow literal, UTF-8 breaks)."""
    return _lineToCppDataLine("", line, line.breaks_utf8)


def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Return the initializer for a ``data<wchar_t>`` entry (wide literal, UTF-16 breaks)."""
    return _lineToCppDataLine("L", line, line.breaks_utf16)


def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Return the initializer for a ``data<wchar_t>`` entry (wide literal, UTF-32 breaks)."""
    return _lineToCppDataLine("L", line, line.breaks_utf32)
"""
Generate test data from "GraphemeBreakTest.txt"
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the same directory as this script.
"""
def generate_all() -> str:
    """Build the full C++ header text from GraphemeBreakTest.txt.

    The data file is opened from the directory that contains this script.
    Items are read with parseBreakTestLine until it returns a falsy value;
    items whose encoded text is empty are skipped. The result is
    cpp_template filled with the item count and the comma-joined UTF-8,
    UTF-16 and UTF-32 data lines.
    """
    data_file = Path(__file__).absolute().with_name("GraphemeBreakTest.txt")

    items = []
    with open(data_file, mode="rt", encoding="utf-8") as stream:
        while item := parseBreakTestLine(stream):
            if len(item.encoded) == 0:
                continue
            items.append(item)

    utf8_rows = ",".join(lineToCppDataLineUtf8(entry) for entry in items)
    utf16_rows = ",".join(lineToCppDataLineUtf16(entry) for entry in items)
    utf32_rows = ",".join(lineToCppDataLineUtf32(entry) for entry in items)
    return cpp_template.format(len(items), utf8_rows, utf16_rows, utf32_rows)
# Script entry point: when run directly (not imported), print the generated
# header text returned by generate_all() to standard output.
if __name__ == "__main__":
    print(generate_all())