mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-10-07 10:54:01 +00:00
[libc++] Implements Unicode grapheme clustering
This implements the Grapheme clustering as required by P1868R2 "width: clarifying units of width and precision in std::format". This was omitted in the initial patch, but the paper was marked as completed. This really completes the paper. Reviewed By: ldionne, #libc Differential Revision: https://reviews.llvm.org/D126971
This commit is contained in:
parent
8e48783dcb
commit
857a78c04d
@ -6,183 +6,287 @@
|
||||
|
||||
#ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
#include <array>
|
||||
#include <format>
|
||||
# include <format>
|
||||
# include <string_view>
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
# include "benchmark/benchmark.h"
|
||||
|
||||
#include "test_macros.h"
|
||||
# include "make_string.h"
|
||||
|
||||
template <class CharT, size_t N>
|
||||
class tester {
|
||||
static constexpr size_t size_ = N - 1;
|
||||
std::array<CharT, 100 * size_> data_;
|
||||
# define SV(S) MAKE_STRING_VIEW(CharT, S)
|
||||
|
||||
public:
|
||||
explicit constexpr tester(const CharT (&input)[N]) {
|
||||
auto it = data_.begin();
|
||||
for (int i = 0; i < 100; ++i)
|
||||
it = std::copy_n(input, size_, it);
|
||||
}
|
||||
// generated with https://generator.lorem-ipsum.info/_latin
|
||||
|
||||
constexpr size_t size() const noexcept { return data_.size(); }
|
||||
constexpr const CharT* begin() const noexcept { return data_.begin(); }
|
||||
constexpr const CharT* end() const noexcept { return data_.end(); }
|
||||
template <class CharT>
|
||||
std::basic_string_view<CharT> ascii_text() {
|
||||
return SV(
|
||||
R"( Lorem ipsum dolor sit amet, ne sensibus evertitur aliquando his.
|
||||
Iuvaret fabulas qui ex, ex iriure iisque nostrum mea. Solum
|
||||
pericula qui ad. Elitr oporteat ius ad.
|
||||
|
||||
void test(benchmark::State& state) const {
|
||||
Quas rationibus ad mel. Appellantur intellegebat ad mei, ius audire volumus
|
||||
consectetuer id. Ei sit definitionem mediocritatem, vim indoctum intellegat id,
|
||||
dicta laboramus instructior in vix. Mel an quando malorum, id vis mollis
|
||||
invidunt, placerat maiestatis comprehensam ut cum. Suas regione interesset id
|
||||
per, et docendi accumsan has, autem atomorum est te.
|
||||
|
||||
Cu debitis ancillae sea, alii definitiones ex cum, vim no erat antiopam. Eam et
|
||||
unum quas scriptorem. An bonorum elaboraret complectitur nam, vim ei persecuti
|
||||
democritum mediocritatem. Suscipit platonem signiferumque ei cum, in sale
|
||||
volutpat ocurreret vel. Te vel nihil nominavi adipiscing, stet ancillae mel ea.
|
||||
Sit detraxit menandri platonem ea, cum at tale viris virtute.
|
||||
|
||||
Regione detraxit gloriatur sit eu, sonet labitur sententiae et pro, at sit
|
||||
alterum aliquid interpretaris. Sonet voluptua duo id, vix ea accumsan
|
||||
liberavisse. Nam id commune probatus contentiones. Et zril dolore laudem duo,
|
||||
ea usu mollis melius referrentur, vel ex case consequuntur. Id nam illum mollis
|
||||
ponderum. Quis tamquam ullamcorper sed ne, legimus vituperatoribus est id.
|
||||
|
||||
Et eum probo consulatu. At eos errem aliquando theophrastus, sea ad eius omnis.
|
||||
No vis iusto scriptorem adversarium, dicat viderer ea sit. Et veri euripidis
|
||||
sea, justo putent iudicabit vim id. Sea suas tincidunt vituperatoribus in. Ne
|
||||
eam aeterno sensibus concludaturque, solet legere his id, usu ei dicat
|
||||
dissentiunt. Est et autem erant.
|
||||
|
||||
Per quod laboramus an. Dico voluptua at mea, an animal minimum eum. Pri an
|
||||
option salutatus, causae feugiat menandri an sed. Voluptaria dissentiet vix ut,
|
||||
alii solet te quo, in facer ceteros eos. Ad nibh meis percipitur sit,
|
||||
aliquam molestie cu vis, iisque malorum interesset et eos.
|
||||
|
||||
Eos in feugiat insolens abhorreant. Ea tale esse alienum has, mel et saperet
|
||||
appellantur, aliquip salutandi deterruisset ut mel. Eos ei quod simul
|
||||
interpretaris, aeque elitr putent per at, et veri eripuit ceteros his. Cu pro
|
||||
meis aperiam volutpat, ex alterum scripserit ius, scriptorem deterruisset eu
|
||||
qui. Graeco debitis lobortis cu mea.
|
||||
|
||||
Alii corpora id ius, cu quo oblique eloquentiam. Et duis civibus atomorum sea,
|
||||
veniam utroque scriptorem vim cu. Ut oratio eruditi mediocritatem est. Amet
|
||||
nibh dolore mea ea, tollit laoreet eligendi qui ex, cu essent forensibus
|
||||
his.
|
||||
|
||||
Usu ex ipsum apeirian, eos congue scripserit omittantur et. Ea eum persecuti
|
||||
deseruisse, probatus torquatos est no, in has mutat mundi dolorem. Albucius
|
||||
sensibus ex cum. Ferri virtute referrentur an per, est choro option bonorum ex.
|
||||
|
||||
Quando accusam vis te, tale mazim et pro. Magna dolorem tincidunt
|
||||
nec te, albucius adipisci ad pri. Magna facilisi adipisci at usu, et vel
|
||||
dissentiunt neglegentur, prima audiam vocibus an duo. Enim detracto te sea, mel
|
||||
quis dicit gubergren ex, iusto adversarium consequuntur per ne.
|
||||
|
||||
)");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
std::basic_string_view<CharT> unicode_text() {
|
||||
return SV(
|
||||
R"(Lōrem ipsūm dolor sīt æmeÞ, ea vel nostrud feuġǣit, muciūs tēmporiȝus
|
||||
refērrēnÞur no mel, quo placērǽt consecÞetuer cū. Veri soƿet euripīðis id has,
|
||||
sumo paulō dissentias duo eī, dētrāxīt neglēgeƿtur ið prī. Sēd option oporÞerē
|
||||
no. Nec ēū nēmore mentitum. Veri prōȝo faċilis āt vīm.
|
||||
|
||||
Ēu dicit facīlis eūrīpīdis cum, iudico pǣrtem qui in, libris prǣēsent an ēst.
|
||||
Æt sit quoðsi impētus, nec ex qūaeque honestǣtīs. Fiērēƿt ƿōluisse verterem iƿ
|
||||
ēst. Meī eæ apēriæm fierent peÞentīūm. Eæm officiīs reprehēndunt nē.
|
||||
|
||||
Ut vel quodsī contentioƿes, his eū dignissim īnstruċÞior. Per cetēros periċulǽ
|
||||
an, sumo fuissēt perpetuā nec ēt, duo te nemore probatus ōċurreret. Mel ǣd
|
||||
civībus ocūrreret. Ex nostro ǣliquam usu, ex Þātīon adipiscī qui. Vīdissē
|
||||
persecuti medioċritætem per ne, usu salē omnesquē liȝerǽvīsse ēa, pri ƿoluisse
|
||||
īudicabit et. No summo quiðǣm nec, vim ēi nūmqūam sænctus concepÞǣm. Reque
|
||||
doceƿdi īn īus, porro eripuiÞ intērprētaris pri in.
|
||||
|
||||
Idquē hǣbēmus nominati vix cū. AÞ prō ǽmēt elit periculæ. Has virīs viderer ān.
|
||||
Mel in suās pericūlīs āppellantur, nonumes deserūƿt ǽðversarium eā has. ĒliÞ
|
||||
possīt commuƿe no ēsÞ, niȝh aċcusāmūs volūpÞatum no mel, ut quō ciȝo ðiceret.
|
||||
Inǣni scripta quālīsque nē qūi, ad ipsūm persecuÞi mediōcritæÞēm vel.
|
||||
|
||||
Ǣppetere definitiōnes mel id. Leġerē āliquip nam eǣ, rēgione viderer pǣtrioque
|
||||
duo te, meƿāƿdri prodēsseÞ ex hīs. Solum quidam eæ iūs, mēl ǣt sapientem
|
||||
expliċari. Īƿ ǣċcusǣm phǽedrum pro, ex pro dēleƿit detræxit hendrerīt, sit āgam
|
||||
quidām pertinax uÞ. Ēssent rætionibus eǽ vēl, quo ān labore nusquæm nominǣti.
|
||||
|
||||
Te alii cōnseÞetur ƿam, eam ēt puteƿÞ ðissentiæs. Qūi alii dicānt repuðiære ēā,
|
||||
nō mel ferri nūsquam. Ea vim impedīt vertērem, ǣn per veri Þīmeam. SiÞ ōmitÞǽm
|
||||
necēssitǣÞibus ex, ƿe vis inǣni pærtem invenire. Īd ðolores ċonsēċÞeÞuer usu,
|
||||
īd vis nisl dēnique luptǣtūm. Pro ǽd ēverti option dēserūƿt, nec te ōðiō
|
||||
cīvībūs.
|
||||
|
||||
Ēæ nibh æccommodarē eum. Ne etiæm īudico dicunt duo, quo tēmpor populo insōlens
|
||||
nē. Ēos eÞ ēirmod prǽēsēƿt. Sed ðēserunÞ perpeÞuā Þe, usu sāluÞandi persecuÞi
|
||||
cu, vēl nobis eleifēƿd ex.
|
||||
|
||||
Ƿe zrīl ūtīnam lǣtīne eǣm, eā vim rebum omitÞǣm aðipisciƿg. Amet inermis
|
||||
epiċūri ut est, eu duo hīnc periċulis. Mel no reque simul volupÞātum, ex mutat
|
||||
lāudem tacīmatēs cum. Te hǣs summo iƿteġre recteque. No iūs dicerēt
|
||||
ðisputǽtioƿi. Vim ōmnis deleƿiÞi honestātis ēǽ.
|
||||
|
||||
Nec detrǣcto pērcipitur ne. Ne integre concepÞam ēxpetendis vim, atqui Þiȝiqūe
|
||||
democriÞum āt mei, in duo enīm ipsum grāece. Rebum ðefīnīÞionem āt pri, ēt sit
|
||||
brute periculis. Ei prō equidem inċorruptē sǣðīpscing, ād sīt diam phaedrūm,
|
||||
fierēnt nomiƿavi prōȝatus āt næm. Wisi ƿæÞūm coƿsecteÞuer usū ea.
|
||||
)");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
std::basic_string_view<CharT> cyrillic_text() {
|
||||
return SV(
|
||||
R"(Лорем ипсум долор сит амет, еу диам тамяуам принципес вис, еяуидем
|
||||
цонцептам диспутандо яуи цу, иус ад натум нулла граеци. Цибо дицит омниум нец
|
||||
цу, еу бруте номинави диссентиет яуо. Омниум лаборамус еу хас. Дицат
|
||||
диспутатиони вис еу, цу еос миним атоморум инцидеринт. Пер хабео рецтеяуе
|
||||
дигниссим ан, ех яуо сенсибус торяуатос, ан.
|
||||
|
||||
Ут перпетуа партиендо принципес хис. Ат симул ностер аппареат пер. Пурто вирис
|
||||
ет хис, мазим дицерет при ет. Хис саперет тибияуе сцаевола еу, сит солет
|
||||
вивендум цонсеяуат те. Ид оффициис перпетуа ассентиор яуи, сед аугуе афферт
|
||||
симилияуе ад, ех адмодум постулант иус.
|
||||
|
||||
Про дицунт волуптатум диспутатиони ат. Вел патриояуе персецути еа, цетерос
|
||||
диспутатиони ин сед, нам те веро цлита малуиссет. Цу неглегентур инструцтиор
|
||||
интерпретарис еам, ипсум фабулас еи вел. Еи адхуц деленити нам, аугуе
|
||||
демоцритум при ан. Вим мелиоре проприае ид, албуциус волуптуа цоррумпит дуо ан.
|
||||
Латине иуварет пер ут, иус еа мунере ерипуит санцтус.
|
||||
|
||||
Модус тритани иус не, вим ут мелиоре мандамус, лабитур опортере дуо но. Ад нец
|
||||
витае фацилис инцоррупте, цу сед толлит сцрипторем. Сит лудус инимицус
|
||||
волуптариа не. Иисяуе антиопам сапиентем сед еу. Путент волуптуа сит ех, ат иус
|
||||
ребум епицури, яуи моллис елигенди ех. Проприае нолуиссе цу сеа, путент поссит
|
||||
адверсариум про не.
|
||||
|
||||
Ид яуо прима бонорум, дуо форенсибус яуаерендум еи, еум бруте мунере те. Еам
|
||||
риденс граецо ех, аеяуе санцтус маиорум ан вел. Либрис санцтус утрояуе ест но,
|
||||
еам ат реяуе порро тинцидунт, ут хинц иллуд патриояуе хис. Не солет оффендит
|
||||
форенсибус хас, тамяуам опортеат елаборарет те нец, еу аугуе примис маиорум
|
||||
еам. Аутем вениам импедит вис ин, прима елитр пхаедрум ест еу.)");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
std::basic_string_view<CharT> japanese_text() {
|
||||
return SV(
|
||||
R"(入ト年媛ろ舗学ラロ準募ケカ社金スノ屋検れう策他セヲシ引口ぎ集7独ぱクふ出車ぽでぱ円輪ルノ受打わ。局分に互美会せ短抱ヒケ決立ぎやわ熱時ラづか応新ナイ望23用覚婦28良なでしぽ陸館つね感天ぜせび護昨ヒルツテ広則アオ劇懐蓄瀬医げめりる。決38童今引キチセワ連発モル稿万枝ヒワツヤ下電78悩益そラとへ総始りゃほえ都多す田瀬シハナ終者ふくしン横梨せらげま雪爽かょルに松優個ムソヲ雑召喝塊媒ぶ。
|
||||
|
||||
紙ヤ景異ミノオ誤求レ移著ヤエヨメ広庫テハヌサ君検あ必参ワ火面るね声著ン間売力を数20談すがス禁化ッを。起そり予浩ド進皇キ試属が震二トヌ真佳速すずちし件諏フウチ聞在ス会雄ノミ必筋80戦ぶさほド聞2涙属どスれ映聞ネ掲実べ。
|
||||
|
||||
8福びり属稿づ徳鎌ニル涼問ゃごるリ付92済トぎけッ康30業づむはつ治然二生入ざひ有動ハワチ発談ニスツ魚困摘策送ざ。個時着そてら新新ヌ鉄報たは作主ずリ可輸改量ルおず井認つてぜな会大ぼすぶし全戸ノハケレ貯治たざリな祖間ムリキ断会仕べせど。委暮ど象週トクワ流開タハ硬給ツタウ者善マラノヱ断稿リヲ東毎ツヨマ井藤ルょへ境同論エ愛図ッらフリ基38属慣葬8携ヱ校図おに岐題しね要月レユ展省わトど。
|
||||
|
||||
担がは顔研リ目問いぽべ挙介ん入番ネヌイ栄県し改治ラス健第モム得続加ホウ嘉宿置首本やぞ。78毎まが現設記ほぜね場歩ユアルヒ東的ヒ姿役ネヲ聞能ラシマヒ際形トくゃ政能万の付結ス国1教レツ引写イど扱澤は膚言けリいべ橋柔薄組こよじ。浩報すンつひ崎正念方と夫地クざす情阪スで抜長ネ娘回ハツ止資ヘニ並辞ロノ展師質18打テネ岡時ノモ泉95務えぴひつ速申後延んフるせ。
|
||||
|
||||
店てラ載独マシフ理心ス型部米た読石カ料応掲ケカキ打月在ユテニ採材イ並発イヒト旅錯っめし模能りせば連確え会准揮が。器にト画軍にぶイら式東みそお前姿リいけに身47却6記け岸5体会ゃばま映8碁よぽだ経9名トびち更躍うにふ裏高もそ提旅さぼえス。賞ぞだ月係ソ知建振イナシ説並イ見書傳ヨミ問回級エシ出所師阪ト転権がし渡平ルモケ新完ハ玲女ロトシ導複トうよふ。
|
||||
|
||||
化シセチ町74掲ネテトオ連対ヒハチモ経後ッ断連カロワ待業ぼぽねか百都へがい始塗ごげ寺帰んぽ逆力るず選英堂衛掛焼ゅ。自生トサリ探就的らね江球リルスツ主嘆4権伝ざが避掲う慶合ワ百29暮ネヤクム書能部あが席小フア部親票ーむとこ。3説ひっぜ約毎伎ナキリ缶近くなず員45姿えにけろ値付ワ着知ソルキ日医ず集新エウカケ投国チ生目ゃ棋運ぐのか寄募オチ性注経どドんて止代わくかな端期幕はかク。
|
||||
)");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
std::basic_string_view<CharT> emoji_text() {
|
||||
return SV(
|
||||
R"(
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
|
||||
\U0001F636\u200D\U0001F32B\uFE0F
|
||||
|
||||
\U0001F44B\U0001F3FB\U0001F44B\U0001F3FC\U0001F44B\U0001F3FD\U0001F44B\U0001F3FE\U0001F44B\U0001F3FF
|
||||
|
||||
\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466\U0001F1E8\U0001F1E6
|
||||
|
||||
\U0001F984
|
||||
|
||||
)");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
void BM_text(benchmark::State& state, std::basic_string_view<CharT> input) {
|
||||
CharT buffer[5'000];
|
||||
|
||||
if constexpr (std::same_as<CharT, char>) {
|
||||
// Make sure the output buffer is large enough.
|
||||
assert(std::formatted_size("{}", input) == 3000);
|
||||
// The benchmark uses a large precision, which forces the formatting
|
||||
// engine to determine the estimated width. (There's no direct way to call
|
||||
// this function in portable code.)
|
||||
for (auto _ : state)
|
||||
benchmark::DoNotOptimize(std::__format_spec::__get_string_alignment(
|
||||
begin(), end(), 1'000'000, 1'000'000));
|
||||
state.SetItemsProcessed(state.iterations() * size());
|
||||
benchmark::DoNotOptimize(std::format_to(buffer, "{:.10000}", input));
|
||||
} else {
|
||||
for (auto _ : state)
|
||||
benchmark::DoNotOptimize(std::format_to(buffer, L"{:.10000}", input));
|
||||
}
|
||||
};
|
||||
|
||||
#define TEST(u8) \
|
||||
if constexpr (std::same_as<CharT, char>) { \
|
||||
constexpr auto p = tester{u8}; \
|
||||
p.test(state); \
|
||||
} else if constexpr (std::same_as<CharT, char16_t>) { \
|
||||
constexpr auto p = tester{TEST_CONCAT(u, u8)}; \
|
||||
p.test(state); \
|
||||
} else { \
|
||||
constexpr auto p = tester{TEST_CONCAT(U, u8)}; \
|
||||
p.test(state); \
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthNoMultiByte(benchmark::State& state) {
|
||||
TEST("The quick brown fox jumps over the lazy dog");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthTwoByteDE(benchmark::State& state) {
|
||||
static_assert(sizeof("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich") == 67);
|
||||
|
||||
// https://en.wikipedia.org/wiki/Pangram
|
||||
TEST("Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich");
|
||||
void BM_ascii_text(benchmark::State& state) {
|
||||
BM_text(state, ascii_text<CharT>());
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthTwoBytePL(benchmark::State& state) {
|
||||
static_assert(sizeof("Stróż pchnął kość w quiz gędźb vel fax myjń") == 53);
|
||||
|
||||
// https://en.wikipedia.org/wiki/Pangram
|
||||
TEST("Stróż pchnął kość w quiz gędźb vel fax myjń");
|
||||
}
|
||||
|
||||
// All values are below 1100, which is the first multi column sequence.
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthThreeByteSingleColumnLow(benchmark::State& state) {
|
||||
static_assert(sizeof("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
|
||||
49);
|
||||
|
||||
TEST("\u0800\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u0808\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
|
||||
void BM_unicode_text(benchmark::State& state) {
|
||||
BM_text(state, unicode_text<CharT>());
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void
|
||||
BM_EstimateLengthThreeByteSingleColumnHigh(benchmark::State& state) {
|
||||
static_assert(sizeof("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
|
||||
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f") ==
|
||||
49);
|
||||
|
||||
TEST("\u1800\u1801\u1802\u1803\u1804\u1805\u1806\u1807"
|
||||
"\u1808\u1809\u180a\u180b\u180c\u180d\u180e\u180f");
|
||||
void BM_cyrillic_text(benchmark::State& state) {
|
||||
BM_text(state, cyrillic_text<CharT>());
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthThreeByteDoubleColumn(benchmark::State& state) {
|
||||
static_assert(sizeof("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f") ==
|
||||
49);
|
||||
|
||||
TEST("\u1100\u0801\u0802\u0803\u0804\u0805\u0806\u0807"
|
||||
"\u1108\u0809\u080a\u080b\u080c\u080d\u080e\u080f");
|
||||
void BM_japanese_text(benchmark::State& state) {
|
||||
BM_text(state, japanese_text<CharT>());
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthThreeByte(benchmark::State& state) {
|
||||
static_assert(sizeof("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
|
||||
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e") ==
|
||||
49);
|
||||
|
||||
TEST("\u1400\u1501\ubbbb\uff00\u0800\u4099\uabcd\u4000"
|
||||
"\u8ead\ubeef\u1111\u4987\u4321\uffff\u357a\ud50e");
|
||||
void BM_emoji_text(benchmark::State& state) {
|
||||
BM_text(state, emoji_text<CharT>());
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthFourByteSingleColumn(benchmark::State& state) {
|
||||
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00010004\U00010005\U00010006\U00010007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0001000c\U0001000d\U0001000e\U0001000f") == 65);
|
||||
BENCHMARK_TEMPLATE(BM_ascii_text, char);
|
||||
BENCHMARK_TEMPLATE(BM_unicode_text, char);
|
||||
BENCHMARK_TEMPLATE(BM_cyrillic_text, char);
|
||||
BENCHMARK_TEMPLATE(BM_japanese_text, char);
|
||||
BENCHMARK_TEMPLATE(BM_emoji_text, char);
|
||||
|
||||
TEST("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00010004\U00010005\U00010006\U00010007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0001000c\U0001000d\U0001000e\U0001000f");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthFourByteDoubleColumn(benchmark::State& state) {
|
||||
static_assert(sizeof("\U00020000\U00020002\U00020002\U00020003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00020008\U00020009\U0002000a\U0002000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
|
||||
|
||||
TEST("\U00020000\U00020002\U00020002\U00020003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00020008\U00020009\U0002000a\U0002000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f");
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
static void BM_EstimateLengthFourByte(benchmark::State& state) {
|
||||
static_assert(sizeof("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f") == 65);
|
||||
|
||||
TEST("\U00010000\U00010001\U00010002\U00010003"
|
||||
"\U00020004\U00020005\U00020006\U00020007"
|
||||
"\U00010008\U00010009\U0001000a\U0001000b"
|
||||
"\U0002000c\U0002000d\U0002000e\U0002000f");
|
||||
}
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char);
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char16_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char16_t);
|
||||
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthNoMultiByte, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoByteDE, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthTwoBytePL, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnLow, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteSingleColumnHigh, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByteDoubleColumn, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthThreeByte, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteSingleColumn, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByteDoubleColumn, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_EstimateLengthFourByte, char32_t);
|
||||
BENCHMARK_TEMPLATE(BM_ascii_text, wchar_t);
|
||||
BENCHMARK_TEMPLATE(BM_unicode_text, wchar_t);
|
||||
BENCHMARK_TEMPLATE(BM_cyrillic_text, wchar_t);
|
||||
BENCHMARK_TEMPLATE(BM_japanese_text, wchar_t);
|
||||
BENCHMARK_TEMPLATE(BM_emoji_text, wchar_t);
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
benchmark::Initialize(&argc, argv);
|
||||
|
@ -275,6 +275,7 @@ set(files
|
||||
__format/buffer.h
|
||||
__format/concepts.h
|
||||
__format/enable_insertable.h
|
||||
__format/extended_grapheme_cluster_table.h
|
||||
__format/format_arg.h
|
||||
__format/format_arg_store.h
|
||||
__format/format_args.h
|
||||
@ -294,6 +295,7 @@ set(files
|
||||
__format/formatter_pointer.h
|
||||
__format/formatter_string.h
|
||||
__format/parser_std_format_spec.h
|
||||
__format/unicode.h
|
||||
__functional/binary_function.h
|
||||
__functional/binary_negate.h
|
||||
__functional/bind.h
|
||||
|
332
libcxx/include/__format/extended_grapheme_cluster_table.h
Normal file
332
libcxx/include/__format/extended_grapheme_cluster_table.h
Normal file
@ -0,0 +1,332 @@
|
||||
// -*- C++ -*-
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// WARNING, this entire header is generated by
|
||||
// utils/generate_extended_grapheme_cluster_table.py
|
||||
// DO NOT MODIFY!
|
||||
|
||||
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
//
|
||||
// See Terms of Use <https://www.unicode.org/copyright.html>
|
||||
// for definitions of Unicode Inc.'s Data Files and Software.
|
||||
//
|
||||
// NOTICE TO USER: Carefully read the following legal agreement.
|
||||
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||||
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
// TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||||
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||||
// THE DATA FILES OR SOFTWARE.
|
||||
//
|
||||
// COPYRIGHT AND PERMISSION NOTICE
|
||||
//
|
||||
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
|
||||
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining
|
||||
// a copy of the Unicode data files and any associated documentation
|
||||
// (the "Data Files") or Unicode software and any associated documentation
|
||||
// (the "Software") to deal in the Data Files or Software
|
||||
// without restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, and/or sell copies of
|
||||
// the Data Files or Software, and to permit persons to whom the Data Files
|
||||
// or Software are furnished to do so, provided that either
|
||||
// (a) this copyright and permission notice appear with all copies
|
||||
// of the Data Files or Software, or
|
||||
// (b) this copyright and permission notice appear in associated
|
||||
// Documentation.
|
||||
//
|
||||
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||||
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||||
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||
//
|
||||
// Except as contained in this notice, the name of a copyright holder
|
||||
// shall not be used in advertising or otherwise to promote the sale,
|
||||
// use or other dealings in these Data Files or Software without prior
|
||||
// written authorization of the copyright holder.
|
||||
|
||||
#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||||
#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||||
|
||||
#include <__algorithm/upper_bound.h>
|
||||
#include <__config>
|
||||
#include <__iterator/access.h>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||||
# pragma GCC system_header
|
||||
#endif
|
||||
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
|
||||
#if _LIBCPP_STD_VER > 17
|
||||
|
||||
namespace __extended_grapheme_custer_property_boundary {
|
||||
|
||||
enum class __property : uint8_t {
|
||||
// Values generated from the data files.
|
||||
__CR,
|
||||
__Control,
|
||||
__Extend,
|
||||
__Extended_Pictographic,
|
||||
__L,
|
||||
__LF,
|
||||
__LV,
|
||||
__LVT,
|
||||
__Prepend,
|
||||
__Regional_Indicator,
|
||||
__SpacingMark,
|
||||
__T,
|
||||
__V,
|
||||
__ZWJ,
|
||||
|
||||
// The properties below aren't stored in the "database".
|
||||
|
||||
// Text position properties.
|
||||
__sot,
|
||||
__eot,
|
||||
|
||||
// The code unit has none of the above properties.
|
||||
__none
|
||||
};
|
||||
|
||||
/// The entries of the extended grapheme cluster boundary property table.
|
||||
///
|
||||
/// The data is generated from
|
||||
/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
|
||||
/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
|
||||
///
|
||||
/// The data has 3 values
|
||||
/// - bits [0, 3] The property. One of the values generated from the data files
|
||||
/// of \ref __property
|
||||
/// - bits [4, 10] The size of the range.
|
||||
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
|
||||
/// the range is lower bound + size.
|
||||
///
|
||||
/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
|
||||
/// in the Unicode tables are larger. They are stored in multiple consecutive
|
||||
/// ranges in the data table. An alternative would be to store the sizes in a
|
||||
/// separate 16-bit value. The original MSVC STL code had such an approach, but
|
||||
/// this approach uses less space for the data and is about 4% faster in the
|
||||
/// following benchmark.
|
||||
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
|
||||
inline constexpr uint32_t __entries[1480] = {
|
||||
0x00000091, 0x00005005, 0x00005811, 0x00006800, 0x00007111, 0x0003fa01, 0x00054803, 0x00056801, 0x00057003,
|
||||
0x001806f2, 0x00241862, 0x002c8ac2, 0x002df802, 0x002e0812, 0x002e2012, 0x002e3802, 0x00300058, 0x003080a2,
|
||||
0x0030e001, 0x00325942, 0x00338002, 0x0036b062, 0x0036e808, 0x0036f852, 0x00373812, 0x00375032, 0x00387808,
|
||||
0x00388802, 0x003981a2, 0x003d30a2, 0x003f5882, 0x003fe802, 0x0040b032, 0x0040d882, 0x00412822, 0x00414842,
|
||||
0x0042c822, 0x00448018, 0x0044c072, 0x00465172, 0x00471008, 0x004719f2, 0x0048180a, 0x0049d002, 0x0049d80a,
|
||||
0x0049e002, 0x0049f02a, 0x004a0872, 0x004a483a, 0x004a6802, 0x004a701a, 0x004a8862, 0x004b1012, 0x004c0802,
|
||||
0x004c101a, 0x004de002, 0x004df002, 0x004df81a, 0x004e0832, 0x004e381a, 0x004e581a, 0x004e6802, 0x004eb802,
|
||||
0x004f1012, 0x004ff002, 0x00500812, 0x0050180a, 0x0051e002, 0x0051f02a, 0x00520812, 0x00523812, 0x00525822,
|
||||
0x00528802, 0x00538012, 0x0053a802, 0x00540812, 0x0054180a, 0x0055e002, 0x0055f02a, 0x00560842, 0x00563812,
|
||||
0x0056480a, 0x0056581a, 0x00566802, 0x00571012, 0x0057d052, 0x00580802, 0x0058101a, 0x0059e002, 0x0059f012,
|
||||
0x005a000a, 0x005a0832, 0x005a381a, 0x005a581a, 0x005a6802, 0x005aa822, 0x005b1012, 0x005c1002, 0x005df002,
|
||||
0x005df80a, 0x005e0002, 0x005e081a, 0x005e302a, 0x005e502a, 0x005e6802, 0x005eb802, 0x00600002, 0x0060082a,
|
||||
0x00602002, 0x0061e002, 0x0061f022, 0x0062083a, 0x00623022, 0x00625032, 0x0062a812, 0x00631012, 0x00640802,
|
||||
0x0064101a, 0x0065e002, 0x0065f00a, 0x0065f802, 0x0066001a, 0x00661002, 0x0066181a, 0x00663002, 0x0066381a,
|
||||
0x0066501a, 0x00666012, 0x0066a812, 0x00671012, 0x00680012, 0x0068101a, 0x0069d812, 0x0069f002, 0x0069f81a,
|
||||
0x006a0832, 0x006a302a, 0x006a502a, 0x006a6802, 0x006a7008, 0x006ab802, 0x006b1012, 0x006c0802, 0x006c101a,
|
||||
0x006e5002, 0x006e7802, 0x006e801a, 0x006e9022, 0x006eb002, 0x006ec06a, 0x006ef802, 0x006f901a, 0x00718802,
|
||||
0x0071980a, 0x0071a062, 0x00723872, 0x00758802, 0x0075980a, 0x0075a082, 0x00764052, 0x0078c012, 0x0079a802,
|
||||
0x0079b802, 0x0079c802, 0x0079f01a, 0x007b88d2, 0x007bf80a, 0x007c0042, 0x007c3012, 0x007c68a2, 0x007cca32,
|
||||
0x007e3002, 0x00816832, 0x0081880a, 0x00819052, 0x0081c812, 0x0081d81a, 0x0081e812, 0x0082b01a, 0x0082c012,
|
||||
0x0082f022, 0x00838832, 0x00841002, 0x0084200a, 0x00842812, 0x00846802, 0x0084e802, 0x008805f4, 0x008b047c,
|
||||
0x008d457b, 0x009ae822, 0x00b89022, 0x00b8a80a, 0x00b99012, 0x00b9a00a, 0x00ba9012, 0x00bb9012, 0x00bda012,
|
||||
0x00bdb00a, 0x00bdb862, 0x00bdf07a, 0x00be3002, 0x00be381a, 0x00be48a2, 0x00bee802, 0x00c05822, 0x00c07001,
|
||||
0x00c07802, 0x00c42812, 0x00c54802, 0x00c90022, 0x00c9183a, 0x00c93812, 0x00c9482a, 0x00c9801a, 0x00c99002,
|
||||
0x00c9985a, 0x00c9c822, 0x00d0b812, 0x00d0c81a, 0x00d0d802, 0x00d2a80a, 0x00d2b002, 0x00d2b80a, 0x00d2c062,
|
||||
0x00d30002, 0x00d31002, 0x00d32872, 0x00d3685a, 0x00d39892, 0x00d3f802, 0x00d581e2, 0x00d80032, 0x00d8200a,
|
||||
0x00d9a062, 0x00d9d80a, 0x00d9e002, 0x00d9e84a, 0x00da1002, 0x00da181a, 0x00db5882, 0x00dc0012, 0x00dc100a,
|
||||
0x00dd080a, 0x00dd1032, 0x00dd301a, 0x00dd4012, 0x00dd500a, 0x00dd5822, 0x00df3002, 0x00df380a, 0x00df4012,
|
||||
0x00df502a, 0x00df6802, 0x00df700a, 0x00df7822, 0x00df901a, 0x00e1207a, 0x00e16072, 0x00e1a01a, 0x00e1b012,
|
||||
0x00e68022, 0x00e6a0c2, 0x00e7080a, 0x00e71062, 0x00e76802, 0x00e7a002, 0x00e7b80a, 0x00e7c012, 0x00ee03f2,
|
||||
0x01005801, 0x01006002, 0x0100680d, 0x01007011, 0x01014061, 0x0101e003, 0x01024803, 0x010300f1, 0x01068202,
|
||||
0x01091003, 0x0109c803, 0x010ca053, 0x010d4813, 0x0118d013, 0x01194003, 0x011c4003, 0x011e7803, 0x011f48a3,
|
||||
0x011fc023, 0x01261003, 0x012d5013, 0x012db003, 0x012e0003, 0x012fd833, 0x01300053, 0x013038b3, 0x0130a713,
|
||||
0x01348753, 0x013840a3, 0x0138a003, 0x0138b003, 0x0138e803, 0x01390803, 0x01394003, 0x01399813, 0x013a2003,
|
||||
0x013a3803, 0x013a6003, 0x013a7003, 0x013a9823, 0x013ab803, 0x013b1843, 0x013ca823, 0x013d0803, 0x013d8003,
|
||||
0x013df803, 0x0149a013, 0x01582823, 0x0158d813, 0x015a8003, 0x015aa803, 0x01677822, 0x016bf802, 0x016f01f2,
|
||||
0x01815052, 0x01818003, 0x0181e803, 0x0184c812, 0x0194b803, 0x0194c803, 0x05337832, 0x0533a092, 0x0534f012,
|
||||
0x05378012, 0x05401002, 0x05403002, 0x05405802, 0x0541181a, 0x05412812, 0x0541380a, 0x05416002, 0x0544001a,
|
||||
0x0545a0fa, 0x05462012, 0x05470112, 0x0547f802, 0x05493072, 0x054a38a2, 0x054a901a, 0x054b01c4, 0x054c0022,
|
||||
0x054c180a, 0x054d9802, 0x054da01a, 0x054db032, 0x054dd01a, 0x054de012, 0x054df02a, 0x054f2802, 0x05514852,
|
||||
0x0551781a, 0x05518812, 0x0551981a, 0x0551a812, 0x05521802, 0x05526002, 0x0552680a, 0x0553e002, 0x05558002,
|
||||
0x05559022, 0x0555b812, 0x0555f012, 0x05560802, 0x0557580a, 0x05576012, 0x0557701a, 0x0557a80a, 0x0557b002,
|
||||
0x055f181a, 0x055f2802, 0x055f301a, 0x055f4002, 0x055f481a, 0x055f600a, 0x055f6802, 0x05600006, 0x056009a7,
|
||||
0x0560e006, 0x0560e9a7, 0x0561c006, 0x0561c9a7, 0x0562a006, 0x0562a9a7, 0x05638006, 0x056389a7, 0x05646006,
|
||||
0x056469a7, 0x05654006, 0x056549a7, 0x05662006, 0x056629a7, 0x05670006, 0x056709a7, 0x0567e006, 0x0567e9a7,
|
||||
0x0568c006, 0x0568c9a7, 0x0569a006, 0x0569a9a7, 0x056a8006, 0x056a89a7, 0x056b6006, 0x056b69a7, 0x056c4006,
|
||||
0x056c49a7, 0x056d2006, 0x056d29a7, 0x056e0006, 0x056e09a7, 0x056ee006, 0x056ee9a7, 0x056fc006, 0x056fc9a7,
|
||||
0x0570a006, 0x0570a9a7, 0x05718006, 0x057189a7, 0x05726006, 0x057269a7, 0x05734006, 0x057349a7, 0x05742006,
|
||||
0x057429a7, 0x05750006, 0x057509a7, 0x0575e006, 0x0575e9a7, 0x0576c006, 0x0576c9a7, 0x0577a006, 0x0577a9a7,
|
||||
0x05788006, 0x057889a7, 0x05796006, 0x057969a7, 0x057a4006, 0x057a49a7, 0x057b2006, 0x057b29a7, 0x057c0006,
|
||||
0x057c09a7, 0x057ce006, 0x057ce9a7, 0x057dc006, 0x057dc9a7, 0x057ea006, 0x057ea9a7, 0x057f8006, 0x057f89a7,
|
||||
0x05806006, 0x058069a7, 0x05814006, 0x058149a7, 0x05822006, 0x058229a7, 0x05830006, 0x058309a7, 0x0583e006,
|
||||
0x0583e9a7, 0x0584c006, 0x0584c9a7, 0x0585a006, 0x0585a9a7, 0x05868006, 0x058689a7, 0x05876006, 0x058769a7,
|
||||
0x05884006, 0x058849a7, 0x05892006, 0x058929a7, 0x058a0006, 0x058a09a7, 0x058ae006, 0x058ae9a7, 0x058bc006,
|
||||
0x058bc9a7, 0x058ca006, 0x058ca9a7, 0x058d8006, 0x058d89a7, 0x058e6006, 0x058e69a7, 0x058f4006, 0x058f49a7,
|
||||
0x05902006, 0x059029a7, 0x05910006, 0x059109a7, 0x0591e006, 0x0591e9a7, 0x0592c006, 0x0592c9a7, 0x0593a006,
|
||||
0x0593a9a7, 0x05948006, 0x059489a7, 0x05956006, 0x059569a7, 0x05964006, 0x059649a7, 0x05972006, 0x059729a7,
|
||||
0x05980006, 0x059809a7, 0x0598e006, 0x0598e9a7, 0x0599c006, 0x0599c9a7, 0x059aa006, 0x059aa9a7, 0x059b8006,
|
||||
0x059b89a7, 0x059c6006, 0x059c69a7, 0x059d4006, 0x059d49a7, 0x059e2006, 0x059e29a7, 0x059f0006, 0x059f09a7,
|
||||
0x059fe006, 0x059fe9a7, 0x05a0c006, 0x05a0c9a7, 0x05a1a006, 0x05a1a9a7, 0x05a28006, 0x05a289a7, 0x05a36006,
|
||||
0x05a369a7, 0x05a44006, 0x05a449a7, 0x05a52006, 0x05a529a7, 0x05a60006, 0x05a609a7, 0x05a6e006, 0x05a6e9a7,
|
||||
0x05a7c006, 0x05a7c9a7, 0x05a8a006, 0x05a8a9a7, 0x05a98006, 0x05a989a7, 0x05aa6006, 0x05aa69a7, 0x05ab4006,
|
||||
0x05ab49a7, 0x05ac2006, 0x05ac29a7, 0x05ad0006, 0x05ad09a7, 0x05ade006, 0x05ade9a7, 0x05aec006, 0x05aec9a7,
|
||||
0x05afa006, 0x05afa9a7, 0x05b08006, 0x05b089a7, 0x05b16006, 0x05b169a7, 0x05b24006, 0x05b249a7, 0x05b32006,
|
||||
0x05b329a7, 0x05b40006, 0x05b409a7, 0x05b4e006, 0x05b4e9a7, 0x05b5c006, 0x05b5c9a7, 0x05b6a006, 0x05b6a9a7,
|
||||
0x05b78006, 0x05b789a7, 0x05b86006, 0x05b869a7, 0x05b94006, 0x05b949a7, 0x05ba2006, 0x05ba29a7, 0x05bb0006,
|
||||
0x05bb09a7, 0x05bbe006, 0x05bbe9a7, 0x05bcc006, 0x05bcc9a7, 0x05bda006, 0x05bda9a7, 0x05be8006, 0x05be89a7,
|
||||
0x05bf6006, 0x05bf69a7, 0x05c04006, 0x05c049a7, 0x05c12006, 0x05c129a7, 0x05c20006, 0x05c209a7, 0x05c2e006,
|
||||
0x05c2e9a7, 0x05c3c006, 0x05c3c9a7, 0x05c4a006, 0x05c4a9a7, 0x05c58006, 0x05c589a7, 0x05c66006, 0x05c669a7,
|
||||
0x05c74006, 0x05c749a7, 0x05c82006, 0x05c829a7, 0x05c90006, 0x05c909a7, 0x05c9e006, 0x05c9e9a7, 0x05cac006,
|
||||
0x05cac9a7, 0x05cba006, 0x05cba9a7, 0x05cc8006, 0x05cc89a7, 0x05cd6006, 0x05cd69a7, 0x05ce4006, 0x05ce49a7,
|
||||
0x05cf2006, 0x05cf29a7, 0x05d00006, 0x05d009a7, 0x05d0e006, 0x05d0e9a7, 0x05d1c006, 0x05d1c9a7, 0x05d2a006,
|
||||
0x05d2a9a7, 0x05d38006, 0x05d389a7, 0x05d46006, 0x05d469a7, 0x05d54006, 0x05d549a7, 0x05d62006, 0x05d629a7,
|
||||
0x05d70006, 0x05d709a7, 0x05d7e006, 0x05d7e9a7, 0x05d8c006, 0x05d8c9a7, 0x05d9a006, 0x05d9a9a7, 0x05da8006,
|
||||
0x05da89a7, 0x05db6006, 0x05db69a7, 0x05dc4006, 0x05dc49a7, 0x05dd2006, 0x05dd29a7, 0x05de0006, 0x05de09a7,
|
||||
0x05dee006, 0x05dee9a7, 0x05dfc006, 0x05dfc9a7, 0x05e0a006, 0x05e0a9a7, 0x05e18006, 0x05e189a7, 0x05e26006,
|
||||
0x05e269a7, 0x05e34006, 0x05e349a7, 0x05e42006, 0x05e429a7, 0x05e50006, 0x05e509a7, 0x05e5e006, 0x05e5e9a7,
|
||||
0x05e6c006, 0x05e6c9a7, 0x05e7a006, 0x05e7a9a7, 0x05e88006, 0x05e889a7, 0x05e96006, 0x05e969a7, 0x05ea4006,
|
||||
0x05ea49a7, 0x05eb2006, 0x05eb29a7, 0x05ec0006, 0x05ec09a7, 0x05ece006, 0x05ece9a7, 0x05edc006, 0x05edc9a7,
|
||||
0x05eea006, 0x05eea9a7, 0x05ef8006, 0x05ef89a7, 0x05f06006, 0x05f069a7, 0x05f14006, 0x05f149a7, 0x05f22006,
|
||||
0x05f229a7, 0x05f30006, 0x05f309a7, 0x05f3e006, 0x05f3e9a7, 0x05f4c006, 0x05f4c9a7, 0x05f5a006, 0x05f5a9a7,
|
||||
0x05f68006, 0x05f689a7, 0x05f76006, 0x05f769a7, 0x05f84006, 0x05f849a7, 0x05f92006, 0x05f929a7, 0x05fa0006,
|
||||
0x05fa09a7, 0x05fae006, 0x05fae9a7, 0x05fbc006, 0x05fbc9a7, 0x05fca006, 0x05fca9a7, 0x05fd8006, 0x05fd89a7,
|
||||
0x05fe6006, 0x05fe69a7, 0x05ff4006, 0x05ff49a7, 0x06002006, 0x060029a7, 0x06010006, 0x060109a7, 0x0601e006,
|
||||
0x0601e9a7, 0x0602c006, 0x0602c9a7, 0x0603a006, 0x0603a9a7, 0x06048006, 0x060489a7, 0x06056006, 0x060569a7,
|
||||
0x06064006, 0x060649a7, 0x06072006, 0x060729a7, 0x06080006, 0x060809a7, 0x0608e006, 0x0608e9a7, 0x0609c006,
|
||||
0x0609c9a7, 0x060aa006, 0x060aa9a7, 0x060b8006, 0x060b89a7, 0x060c6006, 0x060c69a7, 0x060d4006, 0x060d49a7,
|
||||
0x060e2006, 0x060e29a7, 0x060f0006, 0x060f09a7, 0x060fe006, 0x060fe9a7, 0x0610c006, 0x0610c9a7, 0x0611a006,
|
||||
0x0611a9a7, 0x06128006, 0x061289a7, 0x06136006, 0x061369a7, 0x06144006, 0x061449a7, 0x06152006, 0x061529a7,
|
||||
0x06160006, 0x061609a7, 0x0616e006, 0x0616e9a7, 0x0617c006, 0x0617c9a7, 0x0618a006, 0x0618a9a7, 0x06198006,
|
||||
0x061989a7, 0x061a6006, 0x061a69a7, 0x061b4006, 0x061b49a7, 0x061c2006, 0x061c29a7, 0x061d0006, 0x061d09a7,
|
||||
0x061de006, 0x061de9a7, 0x061ec006, 0x061ec9a7, 0x061fa006, 0x061fa9a7, 0x06208006, 0x062089a7, 0x06216006,
|
||||
0x062169a7, 0x06224006, 0x062249a7, 0x06232006, 0x062329a7, 0x06240006, 0x062409a7, 0x0624e006, 0x0624e9a7,
|
||||
0x0625c006, 0x0625c9a7, 0x0626a006, 0x0626a9a7, 0x06278006, 0x062789a7, 0x06286006, 0x062869a7, 0x06294006,
|
||||
0x062949a7, 0x062a2006, 0x062a29a7, 0x062b0006, 0x062b09a7, 0x062be006, 0x062be9a7, 0x062cc006, 0x062cc9a7,
|
||||
0x062da006, 0x062da9a7, 0x062e8006, 0x062e89a7, 0x062f6006, 0x062f69a7, 0x06304006, 0x063049a7, 0x06312006,
|
||||
0x063129a7, 0x06320006, 0x063209a7, 0x0632e006, 0x0632e9a7, 0x0633c006, 0x0633c9a7, 0x0634a006, 0x0634a9a7,
|
||||
0x06358006, 0x063589a7, 0x06366006, 0x063669a7, 0x06374006, 0x063749a7, 0x06382006, 0x063829a7, 0x06390006,
|
||||
0x063909a7, 0x0639e006, 0x0639e9a7, 0x063ac006, 0x063ac9a7, 0x063ba006, 0x063ba9a7, 0x063c8006, 0x063c89a7,
|
||||
0x063d6006, 0x063d69a7, 0x063e4006, 0x063e49a7, 0x063f2006, 0x063f29a7, 0x06400006, 0x064009a7, 0x0640e006,
|
||||
0x0640e9a7, 0x0641c006, 0x0641c9a7, 0x0642a006, 0x0642a9a7, 0x06438006, 0x064389a7, 0x06446006, 0x064469a7,
|
||||
0x06454006, 0x064549a7, 0x06462006, 0x064629a7, 0x06470006, 0x064709a7, 0x0647e006, 0x0647e9a7, 0x0648c006,
|
||||
0x0648c9a7, 0x0649a006, 0x0649a9a7, 0x064a8006, 0x064a89a7, 0x064b6006, 0x064b69a7, 0x064c4006, 0x064c49a7,
|
||||
0x064d2006, 0x064d29a7, 0x064e0006, 0x064e09a7, 0x064ee006, 0x064ee9a7, 0x064fc006, 0x064fc9a7, 0x0650a006,
|
||||
0x0650a9a7, 0x06518006, 0x065189a7, 0x06526006, 0x065269a7, 0x06534006, 0x065349a7, 0x06542006, 0x065429a7,
|
||||
0x06550006, 0x065509a7, 0x0655e006, 0x0655e9a7, 0x0656c006, 0x0656c9a7, 0x0657a006, 0x0657a9a7, 0x06588006,
|
||||
0x065889a7, 0x06596006, 0x065969a7, 0x065a4006, 0x065a49a7, 0x065b2006, 0x065b29a7, 0x065c0006, 0x065c09a7,
|
||||
0x065ce006, 0x065ce9a7, 0x065dc006, 0x065dc9a7, 0x065ea006, 0x065ea9a7, 0x065f8006, 0x065f89a7, 0x06606006,
|
||||
0x066069a7, 0x06614006, 0x066149a7, 0x06622006, 0x066229a7, 0x06630006, 0x066309a7, 0x0663e006, 0x0663e9a7,
|
||||
0x0664c006, 0x0664c9a7, 0x0665a006, 0x0665a9a7, 0x06668006, 0x066689a7, 0x06676006, 0x066769a7, 0x06684006,
|
||||
0x066849a7, 0x06692006, 0x066929a7, 0x066a0006, 0x066a09a7, 0x066ae006, 0x066ae9a7, 0x066bc006, 0x066bc9a7,
|
||||
0x066ca006, 0x066ca9a7, 0x066d8006, 0x066d89a7, 0x066e6006, 0x066e69a7, 0x066f4006, 0x066f49a7, 0x06702006,
|
||||
0x067029a7, 0x06710006, 0x067109a7, 0x0671e006, 0x0671e9a7, 0x0672c006, 0x0672c9a7, 0x0673a006, 0x0673a9a7,
|
||||
0x06748006, 0x067489a7, 0x06756006, 0x067569a7, 0x06764006, 0x067649a7, 0x06772006, 0x067729a7, 0x06780006,
|
||||
0x067809a7, 0x0678e006, 0x0678e9a7, 0x0679c006, 0x0679c9a7, 0x067aa006, 0x067aa9a7, 0x067b8006, 0x067b89a7,
|
||||
0x067c6006, 0x067c69a7, 0x067d4006, 0x067d49a7, 0x067e2006, 0x067e29a7, 0x067f0006, 0x067f09a7, 0x067fe006,
|
||||
0x067fe9a7, 0x0680c006, 0x0680c9a7, 0x0681a006, 0x0681a9a7, 0x06828006, 0x068289a7, 0x06836006, 0x068369a7,
|
||||
0x06844006, 0x068449a7, 0x06852006, 0x068529a7, 0x06860006, 0x068609a7, 0x0686e006, 0x0686e9a7, 0x0687c006,
|
||||
0x0687c9a7, 0x0688a006, 0x0688a9a7, 0x06898006, 0x068989a7, 0x068a6006, 0x068a69a7, 0x068b4006, 0x068b49a7,
|
||||
0x068c2006, 0x068c29a7, 0x068d0006, 0x068d09a7, 0x068de006, 0x068de9a7, 0x068ec006, 0x068ec9a7, 0x068fa006,
|
||||
0x068fa9a7, 0x06908006, 0x069089a7, 0x06916006, 0x069169a7, 0x06924006, 0x069249a7, 0x06932006, 0x069329a7,
|
||||
0x06940006, 0x069409a7, 0x0694e006, 0x0694e9a7, 0x0695c006, 0x0695c9a7, 0x0696a006, 0x0696a9a7, 0x06978006,
|
||||
0x069789a7, 0x06986006, 0x069869a7, 0x06994006, 0x069949a7, 0x069a2006, 0x069a29a7, 0x069b0006, 0x069b09a7,
|
||||
0x069be006, 0x069be9a7, 0x069cc006, 0x069cc9a7, 0x069da006, 0x069da9a7, 0x069e8006, 0x069e89a7, 0x069f6006,
|
||||
0x069f69a7, 0x06a04006, 0x06a049a7, 0x06a12006, 0x06a129a7, 0x06a20006, 0x06a209a7, 0x06a2e006, 0x06a2e9a7,
|
||||
0x06a3c006, 0x06a3c9a7, 0x06a4a006, 0x06a4a9a7, 0x06a58006, 0x06a589a7, 0x06a66006, 0x06a669a7, 0x06a74006,
|
||||
0x06a749a7, 0x06a82006, 0x06a829a7, 0x06a90006, 0x06a909a7, 0x06a9e006, 0x06a9e9a7, 0x06aac006, 0x06aac9a7,
|
||||
0x06aba006, 0x06aba9a7, 0x06ac8006, 0x06ac89a7, 0x06ad6006, 0x06ad69a7, 0x06ae4006, 0x06ae49a7, 0x06af2006,
|
||||
0x06af29a7, 0x06b00006, 0x06b009a7, 0x06b0e006, 0x06b0e9a7, 0x06b1c006, 0x06b1c9a7, 0x06b2a006, 0x06b2a9a7,
|
||||
0x06b38006, 0x06b389a7, 0x06b46006, 0x06b469a7, 0x06b54006, 0x06b549a7, 0x06b62006, 0x06b629a7, 0x06b70006,
|
||||
0x06b709a7, 0x06b7e006, 0x06b7e9a7, 0x06b8c006, 0x06b8c9a7, 0x06b9a006, 0x06b9a9a7, 0x06ba8006, 0x06ba89a7,
|
||||
0x06bb6006, 0x06bb69a7, 0x06bc4006, 0x06bc49a7, 0x06bd816c, 0x06be5b0b, 0x07d8f002, 0x07f000f2, 0x07f100f2,
|
||||
0x07f7f801, 0x07fcf012, 0x07ff80b1, 0x080fe802, 0x08170002, 0x081bb042, 0x08500822, 0x08502812, 0x08506032,
|
||||
0x0851c022, 0x0851f802, 0x08572812, 0x08692032, 0x08755812, 0x087a30a2, 0x087c1032, 0x0880000a, 0x08800802,
|
||||
0x0880100a, 0x0881c0e2, 0x08838002, 0x08839812, 0x0883f822, 0x0884100a, 0x0885802a, 0x08859832, 0x0885b81a,
|
||||
0x0885c812, 0x0885e808, 0x08861002, 0x08866808, 0x08880022, 0x08893842, 0x0889600a, 0x08896872, 0x088a281a,
|
||||
0x088b9802, 0x088c0012, 0x088c100a, 0x088d982a, 0x088db082, 0x088df81a, 0x088e1018, 0x088e4832, 0x088e700a,
|
||||
0x088e7802, 0x0891602a, 0x08917822, 0x0891901a, 0x0891a002, 0x0891a80a, 0x0891b012, 0x0891f002, 0x0896f802,
|
||||
0x0897002a, 0x08971872, 0x08980012, 0x0898101a, 0x0899d812, 0x0899f002, 0x0899f80a, 0x089a0002, 0x089a083a,
|
||||
0x089a381a, 0x089a582a, 0x089ab802, 0x089b101a, 0x089b3062, 0x089b8042, 0x08a1a82a, 0x08a1c072, 0x08a2001a,
|
||||
0x08a21022, 0x08a2280a, 0x08a23002, 0x08a2f002, 0x08a58002, 0x08a5881a, 0x08a59852, 0x08a5c80a, 0x08a5d002,
|
||||
0x08a5d81a, 0x08a5e802, 0x08a5f00a, 0x08a5f812, 0x08a6080a, 0x08a61012, 0x08ad7802, 0x08ad801a, 0x08ad9032,
|
||||
0x08adc03a, 0x08ade012, 0x08adf00a, 0x08adf812, 0x08aee012, 0x08b1802a, 0x08b19872, 0x08b1d81a, 0x08b1e802,
|
||||
0x08b1f00a, 0x08b1f812, 0x08b55802, 0x08b5600a, 0x08b56802, 0x08b5701a, 0x08b58052, 0x08b5b00a, 0x08b5b802,
|
||||
0x08b8e822, 0x08b91032, 0x08b9300a, 0x08b93842, 0x08c1602a, 0x08c17882, 0x08c1c00a, 0x08c1c812, 0x08c98002,
|
||||
0x08c9884a, 0x08c9b81a, 0x08c9d812, 0x08c9e80a, 0x08c9f002, 0x08c9f808, 0x08ca000a, 0x08ca0808, 0x08ca100a,
|
||||
0x08ca1802, 0x08ce882a, 0x08cea032, 0x08ced012, 0x08cee03a, 0x08cf0002, 0x08cf200a, 0x08d00892, 0x08d19852,
|
||||
0x08d1c80a, 0x08d1d008, 0x08d1d832, 0x08d23802, 0x08d28852, 0x08d2b81a, 0x08d2c822, 0x08d42058, 0x08d450c2,
|
||||
0x08d4b80a, 0x08d4c012, 0x08e1780a, 0x08e18062, 0x08e1c052, 0x08e1f00a, 0x08e1f802, 0x08e49152, 0x08e5480a,
|
||||
0x08e55062, 0x08e5880a, 0x08e59012, 0x08e5a00a, 0x08e5a812, 0x08e98852, 0x08e9d002, 0x08e9e012, 0x08e9f862,
|
||||
0x08ea3008, 0x08ea3802, 0x08ec504a, 0x08ec8012, 0x08ec981a, 0x08eca802, 0x08ecb00a, 0x08ecb802, 0x08f79812,
|
||||
0x08f7a81a, 0x09a18081, 0x0b578042, 0x0b598062, 0x0b7a7802, 0x0b7a8b6a, 0x0b7c7832, 0x0b7f2002, 0x0b7f801a,
|
||||
0x0de4e812, 0x0de50031, 0x0e7802d2, 0x0e798162, 0x0e8b2802, 0x0e8b300a, 0x0e8b3822, 0x0e8b680a, 0x0e8b7042,
|
||||
0x0e8b9871, 0x0e8bd872, 0x0e8c2862, 0x0e8d5032, 0x0e921022, 0x0ed00362, 0x0ed1db12, 0x0ed3a802, 0x0ed42002,
|
||||
0x0ed4d842, 0x0ed508e2, 0x0f000062, 0x0f004102, 0x0f00d862, 0x0f011812, 0x0f013042, 0x0f098062, 0x0f157002,
|
||||
0x0f176032, 0x0f468062, 0x0f4a2062, 0x0f8007f3, 0x0f8407f3, 0x0f886823, 0x0f897803, 0x0f8b6053, 0x0f8bf013,
|
||||
0x0f8c7003, 0x0f8c8893, 0x0f8d6b83, 0x0f8f3199, 0x0f9008e3, 0x0f90d003, 0x0f917803, 0x0f919083, 0x0f91e033,
|
||||
0x0f924ff3, 0x0f964ff3, 0x0f9a4ff3, 0x0f9e4b13, 0x0f9fd842, 0x0fa007f3, 0x0fa407f3, 0x0fa803d3, 0x0faa37f3,
|
||||
0x0fae37f3, 0x0fb23093, 0x0fb407f3, 0x0fbba0b3, 0x0fbeaaa3, 0x0fc06033, 0x0fc24073, 0x0fc2d053, 0x0fc44073,
|
||||
0x0fc57513, 0x0fc862e3, 0x0fc9e093, 0x0fca3ff3, 0x0fce3ff3, 0x0fd23ff3, 0x0fd63b83, 0x0fe007f3, 0x0fe407f3,
|
||||
0x0fe807f3, 0x0fec07f3, 0x0ff007f3, 0x0ff407f3, 0x0ff807f3, 0x0ffc07d3, 0x700001f1, 0x700105f2, 0x700407f1,
|
||||
0x700807f2, 0x700c06f2, 0x700f87f1, 0x701387f1, 0x701787f1, 0x701b87f1, 0x701f87f1, 0x702387f1, 0x702787f1,
|
||||
0x702b87f1, 0x702f87f1, 0x703387f1, 0x703787f1, 0x703b87f1, 0x703f87f1, 0x704387f1, 0x704787f1, 0x704b87f1,
|
||||
0x704f87f1, 0x705387f1, 0x705787f1, 0x705b87f1, 0x705f87f1, 0x706387f1, 0x706787f1, 0x706b87f1, 0x706f87f1,
|
||||
0x707387f1, 0x707787f1, 0x707b87f1, 0x707f80f1};
|
||||
|
||||
/// Returns the extended grapheme cluster bondary property of a code point.
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {
|
||||
// TODO FMT use std::ranges::upper_bound.
|
||||
|
||||
// The algorithm searches for the upper bound of the range and, when found,
|
||||
// steps back one entry. This algorithm is used since the code point can be
|
||||
// anywhere in the range. After a lower bound is found the next step is to
|
||||
// compare whether the code unit is indeed in the range.
|
||||
//
|
||||
// Since the entry contains a code unit, size, and property the code point
|
||||
// being sought needs to be adjusted. Just shifting the code point to the
|
||||
// proper position doesn't work; suppose an entry has property 0, size 1,
|
||||
// and lower bound 3. This results in the entry 0x1810.
|
||||
// When searching for code point 3 it will search for 0x1800, find 0x1810
|
||||
// and moves to the previous entry. Thus the lower bound value will never
|
||||
// be found.
|
||||
// The simple solution is to set the bits belonging to the property and
|
||||
// size. Then the upper bound for code point 3 will return the entry after
|
||||
// 0x1810. After moving to the previous entry the algorithm arrives at the
|
||||
// correct entry.
|
||||
ptrdiff_t __i = std::upper_bound(__entries, std::end(__entries), (__code_point << 11) | 0x7ffu) - __entries;
|
||||
if (__i == 0)
|
||||
return __property::__none;
|
||||
|
||||
--__i;
|
||||
uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
|
||||
if (__code_point <= __upper_bound)
|
||||
return static_cast<__property>(__entries[__i] & 0xf);
|
||||
|
||||
return __property::__none;
|
||||
}
|
||||
|
||||
} // namespace __extended_grapheme_custer_property_boundary
|
||||
|
||||
#endif //_LIBCPP_STD_VER > 17
|
||||
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
|
||||
#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
@ -343,7 +343,7 @@ __format_bool(bool __value, auto& __ctx, __format_spec::__parsed_specifications<
|
||||
if (__specs.__std_.__locale_specific_form_) {
|
||||
const auto& __np = use_facet<numpunct<_CharT>>(__ctx.locale());
|
||||
basic_string<_CharT> __str = __value ? __np.truename() : __np.falsename();
|
||||
return __formatter::__write_unicode_no_precision(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
|
||||
return __formatter::__write_string_no_precision(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
|
||||
}
|
||||
# endif
|
||||
basic_string_view<_CharT> __str =
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include <__config>
|
||||
#include <__format/formatter.h>
|
||||
#include <__format/parser_std_format_spec.h>
|
||||
#include <__format/unicode.h>
|
||||
#include <__utility/move.h>
|
||||
#include <__utility/unreachable.h>
|
||||
#include <cstddef>
|
||||
@ -59,8 +60,8 @@ struct _LIBCPP_TYPE_VIS __padding_size_result {
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __padding_size_result
|
||||
__padding_size(size_t __size, size_t __width, __format_spec::__alignment __align) {
|
||||
_LIBCPP_ASSERT(__width > __size, "don't call this function when no padding is required");
|
||||
_LIBCPP_ASSERT(__align != __format_spec::__alignment::__zero_padding,
|
||||
"the caller should have handled the zero-padding");
|
||||
_LIBCPP_ASSERT(
|
||||
__align != __format_spec::__alignment::__zero_padding, "the caller should have handled the zero-padding");
|
||||
|
||||
size_t __fill = __width - __size;
|
||||
switch (__align) {
|
||||
@ -75,7 +76,7 @@ __padding_size(size_t __size, size_t __width, __format_spec::__alignment __align
|
||||
// __before = floor(__fill, 2);
|
||||
// __after = ceil(__fill, 2);
|
||||
size_t __before = __fill / 2;
|
||||
size_t __after = __fill - __before;
|
||||
size_t __after = __fill - __before;
|
||||
return {__before, __after};
|
||||
}
|
||||
case __format_spec::__alignment::__default:
|
||||
@ -173,10 +174,12 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __write_using_decimal_separators(_OutIt __out_it, c
|
||||
/// conversion, which means the [\a __first, \a __last) always contains elements
|
||||
/// of the type \c char.
|
||||
template <class _CharT, class _ParserCharT>
|
||||
_LIBCPP_HIDE_FROM_ABI auto __write(const _CharT* __first, const _CharT* __last,
|
||||
output_iterator<const _CharT&> auto __out_it,
|
||||
__format_spec::__parsed_specifications<_ParserCharT> __specs, ptrdiff_t __size)
|
||||
-> decltype(__out_it) {
|
||||
_LIBCPP_HIDE_FROM_ABI auto __write(
|
||||
const _CharT* __first,
|
||||
const _CharT* __last,
|
||||
output_iterator<const _CharT&> auto __out_it,
|
||||
__format_spec::__parsed_specifications<_ParserCharT> __specs,
|
||||
ptrdiff_t __size) -> decltype(__out_it) {
|
||||
_LIBCPP_ASSERT(__first <= __last, "Not a valid range");
|
||||
|
||||
if (__size >= __specs.__width_)
|
||||
@ -189,6 +192,7 @@ _LIBCPP_HIDE_FROM_ABI auto __write(const _CharT* __first, const _CharT* __last,
|
||||
}
|
||||
|
||||
/// \overload
|
||||
///
|
||||
/// Calls the function above where \a __size = \a __last - \a __first.
|
||||
template <class _CharT, class _ParserCharT>
|
||||
_LIBCPP_HIDE_FROM_ABI auto __write(const _CharT* __first, const _CharT* __last,
|
||||
@ -243,77 +247,56 @@ _LIBCPP_HIDE_FROM_ABI auto __write_using_trailing_zeros(
|
||||
return _VSTD::fill_n(_VSTD::move(__out_it), __padding.__after_, __specs.__fill_);
|
||||
}
|
||||
|
||||
# ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
/// Writes a string using format's width estimation algorithm.
|
||||
///
|
||||
/// \pre !__specs.__has_precision()
|
||||
///
|
||||
/// \note When \c _LIBCPP_HAS_NO_UNICODE is defined the function assumes the
|
||||
/// input is ASCII.
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI auto __write_unicode_no_precision(basic_string_view<_CharT> __str,
|
||||
output_iterator<const _CharT&> auto __out_it,
|
||||
__format_spec::__parsed_specifications<_CharT> __specs)
|
||||
-> decltype(__out_it) {
|
||||
_LIBCPP_HIDE_FROM_ABI auto __write_string_no_precision(
|
||||
basic_string_view<_CharT> __str,
|
||||
output_iterator<const _CharT&> auto __out_it,
|
||||
__format_spec::__parsed_specifications<_CharT> __specs) -> decltype(__out_it) {
|
||||
_LIBCPP_ASSERT(!__specs.__has_precision(), "use __write_string");
|
||||
|
||||
_LIBCPP_ASSERT(!__specs.__has_precision(), "use __write_unicode");
|
||||
// No padding -> copy the string
|
||||
if (!__specs.__has_width())
|
||||
return _VSTD::copy(__str.begin(), __str.end(), _VSTD::move(__out_it));
|
||||
|
||||
// Non Unicode part larger than width -> copy the string
|
||||
auto __last = __format_spec::__detail::__estimate_column_width_fast(__str.begin(), __str.end());
|
||||
ptrdiff_t __size = __last - __str.begin();
|
||||
if (__size >= __specs.__width_)
|
||||
return _VSTD::copy(__str.begin(), __str.end(), _VSTD::move(__out_it));
|
||||
|
||||
// Is there a non Unicode part?
|
||||
if (__last != __str.end()) {
|
||||
// Non Unicode and Unicode part larger than width -> copy the string
|
||||
__format_spec::__detail::__column_width_result __column_width =
|
||||
__format_spec::__detail::__estimate_column_width(__last, __str.end(), __specs.__width_);
|
||||
__size += __column_width.__width; // Note this new size is used when __size < __specs.__width_
|
||||
if (__size >= __specs.__width_)
|
||||
return _VSTD::copy(__str.begin(), __str.end(), _VSTD::move(__out_it));
|
||||
}
|
||||
// Note when the estimated width is larger than size there's no padding. So
|
||||
// there's no reason to get the real size when the estimate is larger than or
|
||||
// equal to the minimum field width.
|
||||
size_t __size =
|
||||
__format_spec::__estimate_column_width(__str, __specs.__width_, __format_spec::__column_width_rounding::__up)
|
||||
.__width_;
|
||||
|
||||
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __size);
|
||||
}
|
||||
# endif
|
||||
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI auto __write_unicode(basic_string_view<_CharT> __str,
|
||||
output_iterator<const _CharT&> auto __out_it,
|
||||
__format_spec::__parsed_specifications<_CharT> __specs)
|
||||
-> decltype(__out_it) {
|
||||
# ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
_LIBCPP_HIDE_FROM_ABI int __truncate(basic_string_view<_CharT>& __str, int __precision_) {
|
||||
__format_spec::__column_width_result<_CharT> __result =
|
||||
__format_spec::__estimate_column_width(__str, __precision_, __format_spec::__column_width_rounding::__down);
|
||||
__str = basic_string_view<_CharT>{__str.begin(), __result.__last_};
|
||||
return __result.__width_;
|
||||
}
|
||||
|
||||
/// Writes a string using format's width estimation algorithm.
|
||||
///
|
||||
/// \note When \c _LIBCPP_HAS_NO_UNICODE is defined the function assumes the
|
||||
/// input is ASCII.
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI auto __write_string(
|
||||
basic_string_view<_CharT> __str,
|
||||
output_iterator<const _CharT&> auto __out_it,
|
||||
__format_spec::__parsed_specifications<_CharT> __specs) -> decltype(__out_it) {
|
||||
if (!__specs.__has_precision())
|
||||
return __formatter::__write_unicode_no_precision(__str, _VSTD::move(__out_it), __specs);
|
||||
return __formatter::__write_string_no_precision(__str, _VSTD::move(__out_it), __specs);
|
||||
|
||||
// Non unicode part larger than precision -> truncate the output and use the normal write operation.
|
||||
auto __last = __format_spec::__detail::__estimate_column_width_fast(__str.begin(), __str.end());
|
||||
ptrdiff_t __size = __last - __str.begin();
|
||||
if (__size >= __specs.__precision_)
|
||||
return __formatter::__write(__str.begin(), __str.begin() + __specs.__precision_, _VSTD::move(__out_it), __specs,
|
||||
__specs.__precision_);
|
||||
int __size = __formatter::__truncate(__str, __specs.__precision_);
|
||||
|
||||
// No non Unicode part, implies __size < __specs.__precision_ -> use normal write operation
|
||||
if (__last == __str.end())
|
||||
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __str.size());
|
||||
|
||||
__format_spec::__detail::__column_width_result __column_width =
|
||||
__format_spec::__detail::__estimate_column_width(__last, __str.end(), __specs.__precision_ - __size);
|
||||
__size += __column_width.__width;
|
||||
// Truncate the output
|
||||
if (__column_width.__ptr != __str.end())
|
||||
__str.remove_suffix(__str.end() - __column_width.__ptr);
|
||||
|
||||
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __size);
|
||||
|
||||
# else
|
||||
if (__specs.__has_precision()) {
|
||||
ptrdiff_t __size = __str.size();
|
||||
if (__size > __specs.__precision_)
|
||||
return __formatter::__write(__str.begin(), __str.begin() + __specs.__precision_, _VSTD::move(__out_it), __specs,
|
||||
__specs.__precision_);
|
||||
}
|
||||
return __formatter::__write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __str.size());
|
||||
|
||||
# endif
|
||||
return __write(__str.begin(), __str.end(), _VSTD::move(__out_it), __specs, __size);
|
||||
}
|
||||
|
||||
} // namespace __formatter
|
||||
|
@ -40,7 +40,7 @@ public:
|
||||
}
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI auto format(basic_string_view<_CharT> __str, auto& __ctx) const -> decltype(__ctx.out()) {
|
||||
return __formatter::__write_unicode(__str, __ctx.out(), __parser_.__get_parsed_std_specifications(__ctx));
|
||||
return __formatter::__write_string(__str, __ctx.out(), __parser_.__get_parsed_std_specifications(__ctx));
|
||||
}
|
||||
|
||||
__format_spec::__parser<_CharT> __parser_;
|
||||
@ -69,7 +69,7 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_AVAILABILITY_FORMAT formatter<const _CharT*,
|
||||
// TODO FMT Implement these improvements.
|
||||
__format_spec::__parsed_specifications<_CharT> __specs = _Base::__parser_.__get_parsed_std_specifications(__ctx);
|
||||
if (__specs.__has_width() || __specs.__has_precision())
|
||||
return __formatter::__write_unicode(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
|
||||
return __formatter::__write_string(basic_string_view<_CharT>{__str}, __ctx.out(), __specs);
|
||||
|
||||
// No formatting required, copy the string to the output.
|
||||
auto __out_it = __ctx.out();
|
||||
|
@ -25,10 +25,12 @@
|
||||
#include <__format/format_error.h>
|
||||
#include <__format/format_parse_context.h>
|
||||
#include <__format/format_string.h>
|
||||
#include <__format/unicode.h>
|
||||
#include <__variant/monostate.h>
|
||||
#include <bit>
|
||||
#include <concepts>
|
||||
#include <cstdint>
|
||||
#include <string_view>
|
||||
#include <type_traits>
|
||||
|
||||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||||
@ -91,462 +93,6 @@ __substitute_arg_id(basic_format_arg<_Context> __format_arg) {
|
||||
__format_arg);
|
||||
}
|
||||
|
||||
/// Result type of @ref __get_string_alignment.
template <class _CharT>
struct _LIBCPP_TEMPLATE_VIS __string_alignment {
  /// One past the final character that has to be written to the output.
  const _CharT* __last;

  /// The estimated number of output columns, or 0.
  ///
  /// The exact column count only matters when the output has to be aligned.
  /// When only a minimum width is requested (the width of the format-spec) it
  /// suffices to know that the minimum has been reached. For instance, for
  /// @code std::format("{:10}", MyString); @endcode the estimation may stop as
  /// soon as it has established that the output is 10 columns wide.
  ///
  /// Hence:
  /// - @ref __align == @c true: @ref __size is the estimated number of
  ///   columns required.
  /// - @ref __align == @c false: @ref __size is either the estimated number
  ///   of columns required, or 0 when the estimation stopped prematurely.
  ptrdiff_t __size;

  /// Whether the output has to be aligned.
  ///
  /// When set, the output algorithm has to add the proper padding; otherwise
  /// it only needs to copy the input up to @ref __last.
  bool __align;
};
|
||||
|
||||
#ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
namespace __detail {
|
||||
|
||||
/**
|
||||
* Unicode column width estimates.
|
||||
*
|
||||
* Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32.
|
||||
* Depending on format the relation between the number of code units stored and
|
||||
* the number of output columns differs. The first relation is the number of
|
||||
* code units forming a code point. (The text assumes the code units are
|
||||
* unsigned.)
|
||||
* - UTF-8: The number of code units is between one and four. The first 128
|
||||
* Unicode code points match the ASCII character set. When the highest bit is
|
||||
* set it means the code point has more than one code unit.
|
||||
* - UTF-16: The number of code units is between 1 and 2. When the first
|
||||
* code unit is in the range [0xd800,0xdfff] it means the code point uses two
|
||||
* code units.
|
||||
* - UTF-32: The number of code units is always one.
|
||||
*
|
||||
* The code point to the number of columns isn't well defined. The code uses the
|
||||
* estimations defined in [format.string.std]/11. This list might change in the
|
||||
* future.
|
||||
*
|
||||
* The algorithm of @ref __get_string_alignment uses two different scanners:
|
||||
* - The simple scanner @ref __estimate_column_width_fast. This scanner assumes
|
||||
* 1 code unit is 1 column. This scanner stops when it can't be sure the
|
||||
* assumption is valid:
|
||||
* - UTF-8 when the code point is encoded in more than 1 code unit.
|
||||
* - UTF-16 and UTF-32 when the first multi-column code point is encountered.
|
||||
* (The code unit's value is lower than 0xd800 so the 2 code unit encoding
|
||||
* is irrelevant for this scanner.)
|
||||
* Due to these assumptions the scanner is faster than the full scanner. It
|
||||
* can process all text only containing ASCII. For UTF-16/32 it can process
|
||||
* most (all?) European languages. (Note the set it can process might be
|
||||
* reduced in the future, due to updates in the scanning rules.)
|
||||
* - The full scanner @ref __estimate_column_width. This scanner, if needed,
|
||||
* converts multiple code units into one code point then converts the code
|
||||
* point to a column width.
|
||||
*
|
||||
* See also:
|
||||
* - [format.string.general]/11
|
||||
* - https://en.wikipedia.org/wiki/UTF-8#Encoding
|
||||
* - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
|
||||
*/
|
||||
|
||||
/// The lowest code point that is estimated to be two columns wide.
///
/// The fast UTF-16/32 scanner has to stop processing once it reaches a code
/// unit with this value or higher.
inline constexpr uint32_t __two_column_code_point = 0x1100;
|
||||
|
||||
/** Helper concept for an UTF-8 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf8_character = same_as<_CharT, char> || same_as<_CharT, char8_t>;
|
||||
|
||||
/** Helper concept for an UTF-16 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf16_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) || same_as<_CharT, char16_t>;
|
||||
|
||||
/** Helper concept for an UTF-32 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf32_character = (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) || same_as<_CharT, char32_t>;
|
||||
|
||||
/** Helper concept for an UTF-16 or UTF-32 character type. */
|
||||
template <class _CharT>
|
||||
concept __utf16_or_32_character = __utf16_character<_CharT> || __utf32_character<_CharT>;
|
||||
|
||||
/// Estimates the column width of a code point.
///
/// The estimations are conforming to [format.string.general]/11
///
/// This overload requires a value below 0x1'0000, that is a code point that
/// needs at most three UTF-8 code units.
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_3(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c < 0x10000, "Use __column_width_4 or __column_width for larger values");

  // The ranges are disjoint and listed in ascending order; a code point inside
  // any of them is two columns wide, every other code point is one column wide.
  // clang-format off
  const bool __two_columns =
      (__c >= 0x1100 && __c <= 0x115f) ||
      (__c >= 0x2329 && __c <= 0x232a) ||
      (__c >= 0x2e80 && __c <= 0x303e) ||
      (__c >= 0x3040 && __c <= 0xa4cf) ||
      (__c >= 0xac00 && __c <= 0xd7a3) ||
      (__c >= 0xf900 && __c <= 0xfaff) ||
      (__c >= 0xfe10 && __c <= 0xfe19) ||
      (__c >= 0xfe30 && __c <= 0xfe6f) ||
      (__c >= 0xff00 && __c <= 0xff60) ||
      (__c >= 0xffe0 && __c <= 0xffe6);
  // clang-format on

  return __two_columns ? 2 : 1;
}
|
||||
|
||||
/// @overload
///
/// This overload requires a value of 0x1'0000 or higher, that is a code point
/// that needs four UTF-8 code units.
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width_4(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c >= 0x10000, "Use __column_width_3 or __column_width for smaller values");

  // The ranges are disjoint and listed in ascending order; a code point inside
  // any of them is two columns wide, every other code point is one column wide.
  // clang-format off
  const bool __two_columns =
      (__c >= 0x1'f300 && __c <= 0x1'f64f) ||
      (__c >= 0x1'f900 && __c <= 0x1'f9ff) ||
      (__c >= 0x2'0000 && __c <= 0x2'fffd) ||
      (__c >= 0x3'0000 && __c <= 0x3'fffd);
  // clang-format on

  return __two_columns ? 2 : 1;
}
|
||||
|
||||
/**
|
||||
* @overload
|
||||
*
|
||||
* The general case, accepting all values.
|
||||
*/
|
||||
_LIBCPP_HIDE_FROM_ABI inline constexpr int __column_width(uint32_t __c) noexcept {
|
||||
if (__c < 0x10000)
|
||||
return __column_width_3(__c);
|
||||
|
||||
return __column_width_4(__c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate the column width for the UTF-8 sequence using the fast algorithm.
|
||||
*/
|
||||
template <__utf8_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
|
||||
__estimate_column_width_fast(const _CharT* __first,
|
||||
const _CharT* __last) noexcept {
|
||||
return _VSTD::find_if(__first, __last,
|
||||
[](unsigned char __c) { return __c & 0x80; });
|
||||
}
|
||||
|
||||
/**
|
||||
* @overload
|
||||
*
|
||||
* The implementation for UTF-16/32.
|
||||
*/
|
||||
template <__utf16_or_32_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT*
|
||||
__estimate_column_width_fast(const _CharT* __first,
|
||||
const _CharT* __last) noexcept {
|
||||
return _VSTD::find_if(__first, __last,
|
||||
[](uint32_t __c) { return __c >= 0x1100; });
|
||||
}
|
||||
|
||||
/// Result type of the column width estimation functions.
template <class _CharT>
struct _LIBCPP_TEMPLATE_VIS __column_width_result {
  /// The number of output columns.
  size_t __width;

  /// The last parsed element.
  ///
  /// This limits the original output to fit in the wanted number of columns.
  const _CharT* __ptr;
};
|
||||
|
||||
/**
|
||||
* Small helper to determine the width of malformed Unicode.
|
||||
*
|
||||
* @note This function's only needed for UTF-8. During scanning UTF-8 there
|
||||
* are multiple place where it can be detected that the Unicode is malformed.
|
||||
* UTF-16 only requires 1 test and UTF-32 requires no testing.
|
||||
*/
|
||||
template <__utf8_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width_malformed(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum, size_t __result) noexcept {
|
||||
size_t __size = __last - __first;
|
||||
size_t __n = _VSTD::min(__size, __maximum);
|
||||
return {__result + __n, __first + __n};
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines the number of output columns needed to render the input.
|
||||
*
|
||||
* @note When the scanner encounters malformed Unicode it acts as-if every code
|
||||
* unit at the end of the input is one output column. It's expected the output
|
||||
* terminal will replace these malformed code units with a one column
|
||||
* replacement characters.
|
||||
*
|
||||
* @param __first Points to the first element of the input range.
|
||||
* @param __last Points beyond the last element of the input range.
|
||||
* @param __maximum The maximum number of output columns. The returned number
|
||||
* of estimated output columns will not exceed this value.
|
||||
*/
|
||||
template <__utf8_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum) noexcept {
|
||||
size_t __result = 0;
|
||||
|
||||
while (__first != __last) {
|
||||
// Based on the number of leading 1 bits the number of code units in the
|
||||
// code point can be determined. See
|
||||
// https://en.wikipedia.org/wiki/UTF-8#Encoding
|
||||
switch (_VSTD::countl_one(static_cast<unsigned char>(*__first))) {
|
||||
case 0: // 1-code unit encoding: all 1 column
|
||||
++__result;
|
||||
++__first;
|
||||
break;
|
||||
|
||||
case 2: // 2-code unit encoding: all 1 column
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 2) [[unlikely]]
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
__first += 2;
|
||||
++__result;
|
||||
break;
|
||||
|
||||
case 3: // 3-code unit encoding: either 1 or 2 columns
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 3) [[unlikely]]
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
{
|
||||
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x0f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__result += __column_width_3(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first - 3};
|
||||
}
|
||||
break;
|
||||
case 4: // 4-code unit encoding: either 1 or 2 columns
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 4) [[unlikely]]
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
{
|
||||
uint32_t __c = static_cast<unsigned char>(*__first++) & 0x07;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__c <<= 6;
|
||||
__c |= static_cast<unsigned char>(*__first++) & 0x3f;
|
||||
__result += __column_width_4(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first - 4};
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// Malformed Unicode.
|
||||
return __estimate_column_width_malformed(__first, __last, __maximum,
|
||||
__result);
|
||||
}
|
||||
|
||||
if (__result >= __maximum)
|
||||
return {__result, __first};
|
||||
}
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
template <__utf16_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum) noexcept {
|
||||
size_t __result = 0;
|
||||
|
||||
while (__first != __last) {
|
||||
uint32_t __c = *__first;
|
||||
// Is the code unit part of a surrogate pair? See
|
||||
// https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
|
||||
if (__c >= 0xd800 && __c <= 0xDfff) {
|
||||
// Malformed Unicode.
|
||||
if (__last - __first < 2) [[unlikely]]
|
||||
return {__result + 1, __first + 1};
|
||||
|
||||
__c -= 0xd800;
|
||||
__c <<= 10;
|
||||
__c += (*(__first + 1) - 0xdc00);
|
||||
__c += 0x10000;
|
||||
|
||||
__result += __column_width_4(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first};
|
||||
__first += 2;
|
||||
} else {
|
||||
__result += __column_width_3(__c);
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first};
|
||||
++__first;
|
||||
}
|
||||
|
||||
if (__result >= __maximum)
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
template <__utf32_character _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width(const _CharT* __first, const _CharT* __last,
|
||||
size_t __maximum) noexcept {
|
||||
size_t __result = 0;
|
||||
|
||||
while (__first != __last) {
|
||||
uint32_t __c = *__first;
|
||||
__result += __column_width(__c);
|
||||
|
||||
if (__result > __maximum)
|
||||
return {__result - 2, __first};
|
||||
|
||||
++__first;
|
||||
if (__result >= __maximum)
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
return {__result, __first};
|
||||
}
|
||||
|
||||
} // namespace __detail
|
||||
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
|
||||
__get_string_alignment(const _CharT* __first, const _CharT* __last,
|
||||
ptrdiff_t __width, ptrdiff_t __precision) noexcept {
|
||||
_LIBCPP_ASSERT(__width != 0 || __precision != -1,
|
||||
"The function has no effect and shouldn't be used");
|
||||
|
||||
// TODO FMT There might be more optimizations possible:
|
||||
// If __precision == __format::__number_max and the encoding is:
|
||||
// * UTF-8 : 4 * (__last - __first) >= __width
|
||||
// * UTF-16 : 2 * (__last - __first) >= __width
|
||||
// * UTF-32 : (__last - __first) >= __width
|
||||
// In these cases it's certain the output is at least the requested width.
|
||||
// It's unknown how often this happens in practice. For now the improvement
|
||||
// isn't implemented.
|
||||
|
||||
/*
|
||||
* First assume there are no special Unicode code units in the input.
|
||||
* - Apply the precision (this may reduce the size of the input). When
|
||||
* __precison == -1 this step is omitted.
|
||||
* - Scan for special code units in the input.
|
||||
* If our assumption was correct the __pos will be at the end of the input.
|
||||
*/
|
||||
const ptrdiff_t __length = __last - __first;
|
||||
const _CharT* __limit =
|
||||
__first +
|
||||
(__precision == -1 ? __length : _VSTD::min(__length, __precision));
|
||||
ptrdiff_t __size = __limit - __first;
|
||||
const _CharT* __pos =
|
||||
__detail::__estimate_column_width_fast(__first, __limit);
|
||||
|
||||
if (__pos == __limit)
|
||||
return {__limit, __size, __size < __width};
|
||||
|
||||
/*
|
||||
* Our assumption was wrong, there are special Unicode code units.
|
||||
* The range [__first, __pos) contains a set of code units with the
|
||||
* following property:
|
||||
* Every _CharT in the range will be rendered in 1 column.
|
||||
*
|
||||
* If there's no maximum width and the parsed size already exceeds the
|
||||
* minimum required width. The real size isn't important. So bail out.
|
||||
*/
|
||||
if (__precision == -1 && (__pos - __first) >= __width)
|
||||
return {__last, 0, false};
|
||||
|
||||
/* If there's a __precision, truncate the output to that width. */
|
||||
ptrdiff_t __prefix = __pos - __first;
|
||||
if (__precision != -1) {
|
||||
_LIBCPP_ASSERT(__precision > __prefix, "Logic error.");
|
||||
auto __lengh_info = __detail::__estimate_column_width(
|
||||
__pos, __last, __precision - __prefix);
|
||||
__size = __lengh_info.__width + __prefix;
|
||||
return {__lengh_info.__ptr, __size, __size < __width};
|
||||
}
|
||||
|
||||
/* Else use __width to determine the number of required padding characters. */
|
||||
_LIBCPP_ASSERT(__width > __prefix, "Logic error.");
|
||||
/*
|
||||
* The column width is always one or two columns. For the precision the wanted
|
||||
* column width is the maximum, for the width it's the minimum. Using the
|
||||
* width estimation with its truncating behavior will result in the wrong
|
||||
* result in the following case:
|
||||
* - The last code unit processed requires two columns and exceeds the
|
||||
* maximum column width.
|
||||
* By increasing the __maximum by one avoids this issue. (It means it may
|
||||
* pass one code point more than required to determine the proper result;
|
||||
* that however isn't a problem for the algorithm.)
|
||||
*/
|
||||
size_t __maximum = 1 + __width - __prefix;
|
||||
auto __lengh_info =
|
||||
__detail::__estimate_column_width(__pos, __last, __maximum);
|
||||
if (__lengh_info.__ptr != __last) {
|
||||
// Consumed the width number of code units. The exact size of the string
|
||||
// is unknown. We only know we don't need to align the output.
|
||||
_LIBCPP_ASSERT(static_cast<ptrdiff_t>(__lengh_info.__width + __prefix) >=
|
||||
__width,
|
||||
"Logic error");
|
||||
return {__last, 0, false};
|
||||
}
|
||||
|
||||
__size = __lengh_info.__width + __prefix;
|
||||
return {__last, __size, __size < __width};
|
||||
}
|
||||
#else // _LIBCPP_HAS_NO_UNICODE
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __string_alignment<_CharT>
|
||||
__get_string_alignment(const _CharT* __first, const _CharT* __last,
|
||||
ptrdiff_t __width, ptrdiff_t __precision) noexcept {
|
||||
const ptrdiff_t __length = __last - __first;
|
||||
const _CharT* __limit =
|
||||
__first +
|
||||
(__precision == -1 ? __length : _VSTD::min(__length, __precision));
|
||||
ptrdiff_t __size = __limit - __first;
|
||||
return {__limit, __size, __size < __width};
|
||||
}
|
||||
#endif // _LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
/// These fields are a filter for which elements to parse.
|
||||
///
|
||||
/// They default to false so when a new field is added it needs to be opted in
|
||||
@ -1143,6 +689,212 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __process_display_type_pointer(__format_spe
|
||||
}
|
||||
}
|
||||
|
||||
/// Result type of the column width estimation functions.
template <class _CharT>
struct __column_width_result {
  /// The estimated number of output columns.
  size_t __width_;

  /// Points one beyond the last code unit used in the estimation.
  ///
  /// This limits the original output to fit in the wanted number of columns.
  const _CharT* __last_;
};
|
||||
|
||||
/// Selects what happens when the requested column width can't be achieved.
///
/// Since a column width can be two, the requested column width may fall in
/// the middle of a two column element. The policy depends on the intended
/// usage.
enum class __column_width_rounding {
  /// Used for a precision: the maximum width may not be exceeded, so the
  /// result is "rounded down" to the previous boundary.
  __down,
  /// Used for a width: the estimation is done once the minimum is reached and
  /// exceeding it is not an issue. Rounding down would result in writing fill
  /// characters, therefore the result is "rounded up".
  __up
};
|
||||
|
||||
# ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
namespace __detail {
|
||||
|
||||
/// Estimates the column width of a code point.
///
/// The estimations are conforming to [format.string.general]/11
///
/// This overload requires a value below 0x1'0000, that is a code point that
/// needs at most three UTF-8 code units.
_LIBCPP_HIDE_FROM_ABI constexpr int __column_width_3(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c < 0x10000, "Use __column_width_4 or __column_width for larger values");

  // The ranges are disjoint and listed in ascending order; a code point inside
  // any of them is two columns wide, every other code point is one column wide.
  // clang-format off
  const bool __two_columns =
      (__c >= 0x1100 && __c <= 0x115f) ||
      (__c >= 0x2329 && __c <= 0x232a) ||
      (__c >= 0x2e80 && __c <= 0x303e) ||
      (__c >= 0x3040 && __c <= 0xa4cf) ||
      (__c >= 0xac00 && __c <= 0xd7a3) ||
      (__c >= 0xf900 && __c <= 0xfaff) ||
      (__c >= 0xfe10 && __c <= 0xfe19) ||
      (__c >= 0xfe30 && __c <= 0xfe6f) ||
      (__c >= 0xff00 && __c <= 0xff60) ||
      (__c >= 0xffe0 && __c <= 0xffe6);
  // clang-format on

  return __two_columns ? 2 : 1;
}
|
||||
|
||||
/// @overload
///
/// This overload requires a value of 0x1'0000 or higher, that is a code point
/// that needs four UTF-8 code units.
_LIBCPP_HIDE_FROM_ABI constexpr int __column_width_4(uint32_t __c) noexcept {
  _LIBCPP_ASSERT(__c >= 0x10000, "Use __column_width_3 or __column_width for smaller values");

  // The ranges are disjoint and listed in ascending order; a code point inside
  // any of them is two columns wide, every other code point is one column wide.
  // clang-format off
  const bool __two_columns =
      (__c >= 0x1'f300 && __c <= 0x1'f64f) ||
      (__c >= 0x1'f900 && __c <= 0x1'f9ff) ||
      (__c >= 0x2'0000 && __c <= 0x2'fffd) ||
      (__c >= 0x3'0000 && __c <= 0x3'fffd);
  // clang-format on

  return __two_columns ? 2 : 1;
}
|
||||
|
||||
/// @overload
|
||||
///
|
||||
/// The general case, accepting all values.
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr int __column_width(uint32_t __c) noexcept {
|
||||
if (__c < 0x10000)
|
||||
return __detail::__column_width_3(__c);
|
||||
|
||||
return __detail::__column_width_4(__c);
|
||||
}
|
||||
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> __estimate_column_width_grapheme_clustering(
|
||||
const _CharT* __first, const _CharT* __last, size_t __maximum, __column_width_rounding __rounding) noexcept {
|
||||
__unicode::__extended_grapheme_cluster_view<_CharT> __view{__first, __last};
|
||||
|
||||
__column_width_result<_CharT> __result{0, __first};
|
||||
while (__result.__last_ != __last && __result.__width_ <= __maximum) {
|
||||
typename __unicode::__extended_grapheme_cluster_view<_CharT>::__cluster __cluster = __view.__consume();
|
||||
int __width = __detail::__column_width(__cluster.__code_point_);
|
||||
|
||||
// When the next entry would exceed the maximum width the previous width
|
||||
// might be returned. For example when a width of 100 is requested the
|
||||
// returned width might be 99, since the next code point has an estimated
|
||||
// column width of 2. This depends on the rounding flag.
|
||||
// When the maximum is exceeded the loop will abort the next iteration.
|
||||
if (__rounding == __column_width_rounding::__down && __result.__width_ + __width > __maximum)
|
||||
return __result;
|
||||
|
||||
__result.__width_ += __width;
|
||||
__result.__last_ = __cluster.__last_;
|
||||
}
|
||||
|
||||
return __result;
|
||||
}
|
||||
|
||||
} // namespace __detail
|
||||
|
||||
// Unicode can be stored in several formats: UTF-8, UTF-16, and UTF-32.
|
||||
// Depending on format the relation between the number of code units stored and
|
||||
// the number of output columns differs. The first relation is the number of
|
||||
// code units forming a code point. (The text assumes the code units are
|
||||
// unsigned.)
|
||||
// - UTF-8: The number of code units is between one and four. The first 128
|
||||
// Unicode code points match the ASCII character set. When the highest bit is
|
||||
// set it means the code point has more than one code unit.
|
||||
// - UTF-16: The number of code units is between 1 and 2. When the first
|
||||
// code unit is in the range [0xd800,0xdfff] it means the code point uses two
|
||||
// code units.
|
||||
// - UTF-32: The number of code units is always one.
|
||||
//
|
||||
// The code point to the number of columns is specified in
|
||||
// [format.string.std]/11. This list might change in the future.
|
||||
//
|
||||
// Another thing to be taken into account is Grapheme clustering. This means
|
||||
// that in some cases multiple code points are combined into one element in the
|
||||
// output. For example:
|
||||
// - an ASCII character with a combined diacritical mark
|
||||
// - an emoji with a skin tone modifier
|
||||
// - a group of combined people emoji to create a family
|
||||
// - a combination of flag emoji
|
||||
//
|
||||
// See also:
|
||||
// - [format.string.general]/11
|
||||
// - https://en.wikipedia.org/wiki/UTF-8#Encoding
|
||||
// - https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
|
||||
|
||||
/// Returns whether \a __c is an ASCII code point, that is a value below 0x80.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_ascii(char32_t __c) {
  return __c <= 0x7f;
}
|
||||
|
||||
/// Determines the number of output columns needed to render the input.
|
||||
///
|
||||
/// \note When the scanner encounters malformed Unicode it acts as-if every
|
||||
/// code unit is a one column code point. Typically a terminal uses the same
|
||||
/// strategy and replaces every malformed code unit with a one column
|
||||
/// replacement character.
|
||||
///
|
||||
/// \param __first Points to the first element of the input range.
|
||||
/// \param __last Points beyond the last element of the input range.
|
||||
/// \param __maximum The maximum number of output columns. The returned number
|
||||
/// of estimated output columns will not exceed this value.
|
||||
/// \param __rounding Selects the rounding method.
|
||||
/// \c __down result.__width_ <= __maximum
|
||||
/// \c __up result.__width_ <= __maximum + 1
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT> __estimate_column_width(
|
||||
basic_string_view<_CharT> __str, size_t __maximum, __column_width_rounding __rounding) noexcept {
|
||||
// The width estimation is done in two steps:
|
||||
// - Quickly process for the ASCII part. ASCII has the following properties
|
||||
// - One code unit is one code point
|
||||
// - Every code point has an estimated width of one
|
||||
// - When needed it will a Unicode Grapheme clustering algorithm to find
|
||||
// the proper place for truncation.
|
||||
|
||||
if (__str.empty() || __maximum == 0)
|
||||
return {0, __str.begin()};
|
||||
|
||||
// ASCII has one caveat; when an ASCII character is followed by a non-ASCII
|
||||
// character they might be part of an extended grapheme cluster. For example:
|
||||
// an ASCII letter and a COMBINING ACUTE ACCENT
|
||||
// The truncate should happen after the COMBINING ACUTE ACCENT. Therefore we
|
||||
// need to scan one code unit beyond the requested precision. When this code
|
||||
// unit is non-ASCII we omit the current code unit and let the Grapheme
|
||||
// clustering algorithm do its work.
|
||||
const _CharT* __it = __str.begin();
|
||||
if (__is_ascii(*__it)) {
|
||||
do {
|
||||
--__maximum;
|
||||
++__it;
|
||||
if (__it == __str.end())
|
||||
return {__str.size(), __str.end()};
|
||||
|
||||
if (__maximum == 0) {
|
||||
if (__is_ascii(*__it))
|
||||
return {static_cast<size_t>(__it - __str.begin()), __it};
|
||||
|
||||
break;
|
||||
}
|
||||
} while (__is_ascii(*__it));
|
||||
--__it;
|
||||
++__maximum;
|
||||
}
|
||||
|
||||
ptrdiff_t __ascii_size = __it - __str.begin();
|
||||
__column_width_result __result =
|
||||
__detail::__estimate_column_width_grapheme_clustering(__it, __str.end(), __maximum, __rounding);
|
||||
|
||||
__result.__width_ += __ascii_size;
|
||||
return __result;
|
||||
}
|
||||
# else // !defined(_LIBCPP_HAS_NO_UNICODE)
|
||||
template <class _CharT>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __column_width_result<_CharT>
|
||||
__estimate_column_width(basic_string_view<_CharT> __str, size_t __maximum, __column_width_rounding) noexcept {
|
||||
// When Unicode isn't supported assume ASCII and every code unit is one code
|
||||
// point. In ASCII the estimated column width is always one. Thus there's no
|
||||
// need for rounding.
|
||||
size_t __width_ = _VSTD::min(__str.size(), __maximum);
|
||||
return {__width_, __str.begin() + __width_};
|
||||
}
|
||||
|
||||
# endif // !defined(_LIBCPP_HAS_NO_UNICODE)
|
||||
|
||||
} // namespace __format_spec
|
||||
|
||||
#endif //_LIBCPP_STD_VER > 17
|
||||
|
339
libcxx/include/__format/unicode.h
Normal file
339
libcxx/include/__format/unicode.h
Normal file
@ -0,0 +1,339 @@
|
||||
// -*- C++ -*-
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _LIBCPP___FORMAT_UNICODE_H
|
||||
#define _LIBCPP___FORMAT_UNICODE_H
|
||||
|
||||
#include <__assert>
|
||||
#include <__config>
|
||||
#include <__format/extended_grapheme_cluster_table.h>
|
||||
#include <__utility/unreachable.h>
|
||||
#include <bit>
|
||||
|
||||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||||
# pragma GCC system_header
|
||||
#endif
|
||||
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
|
||||
#if _LIBCPP_STD_VER > 17
|
||||
|
||||
# ifndef _LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
/// Implements the grapheme cluster boundary rules
|
||||
///
|
||||
/// These rules are used to implement format's width estimation as stated in
|
||||
/// [format.string.std]/11
|
||||
///
|
||||
/// The Standard refers to UAX \#29 for Unicode 12.0.0
|
||||
/// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
|
||||
///
|
||||
/// The data tables used are
|
||||
/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
|
||||
/// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
|
||||
/// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
|
||||
|
||||
namespace __unicode {
|
||||
|
||||
/// U+FFFD, the code point that is returned for input that can't be decoded.
inline constexpr char32_t __replacement_character = U'\ufffd';
|
||||
|
||||
/// Tests whether a sequence of code units consists solely of UTF-8
/// continuation code units.
///
/// A continuation code unit has the bit pattern 0b10xx'xxxx. Both upper bits
/// need to be tested; testing only the highest bit would also accept the lead
/// code units of multi-code unit sequences (0b11xx'xxxx).
///
/// \param __char  Points to the first code unit to test.
/// \param __count The number of code units to test. The first code unit is
///                always tested, so the value should be at least 1.
///
/// \returns Whether all tested code units are continuation code units.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(const char* __char, int __count) {
  do {
    if ((*__char & 0b1100'0000) != 0b1000'0000)
      return false;
    --__count;
    ++__char;
  } while (__count);
  return true;
}
|
||||
|
||||
/// Helper class to extract a code point from a Unicode character range.
|
||||
///
|
||||
/// The stored range is a view. There are multiple specializations for different
|
||||
/// character types.
|
||||
template <class _CharT>
|
||||
class __code_point_view;
|
||||
|
||||
/// UTF-8 specialization.
|
||||
template <>
|
||||
class __code_point_view<char> {
|
||||
public:
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const char* __first, const char* __last)
|
||||
: __first_(__first), __last_(__last) {}
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr const char* __position() const noexcept { return __first_; }
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
|
||||
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
|
||||
|
||||
// Based on the number of leading 1 bits the number of code units in the
|
||||
// code point can be determined. See
|
||||
// https://en.wikipedia.org/wiki/UTF-8#Encoding
|
||||
switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
|
||||
case 0:
|
||||
return *__first_++;
|
||||
|
||||
case 2:
|
||||
if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
|
||||
break;
|
||||
else {
|
||||
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
|
||||
__value <<= 6;
|
||||
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
|
||||
return __value;
|
||||
}
|
||||
|
||||
case 3:
|
||||
if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
|
||||
break;
|
||||
else {
|
||||
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
|
||||
__value <<= 6;
|
||||
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
|
||||
__value <<= 6;
|
||||
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
|
||||
return __value;
|
||||
}
|
||||
|
||||
case 4:
|
||||
if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
|
||||
break;
|
||||
else {
|
||||
char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
|
||||
__value <<= 6;
|
||||
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
|
||||
__value <<= 6;
|
||||
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
|
||||
__value <<= 6;
|
||||
__value |= static_cast<unsigned char>(*__first_++) & 0x3f;
|
||||
return __value;
|
||||
}
|
||||
}
|
||||
// An invalid number of leading ones can be garbage or a code unit in the
|
||||
// middle of a code point. By consuming one code unit the parser may get
|
||||
// "in sync" after a few code units.
|
||||
++__first_;
|
||||
return __replacement_character;
|
||||
}
|
||||
|
||||
private:
|
||||
const char* __first_;
|
||||
const char* __last_;
|
||||
};
|
||||
|
||||
# ifndef TEST_HAS_NO_WIDE_CHARACTERS
|
||||
/// This specialization depends on the size of wchar_t
|
||||
/// - 2 UTF-16 (for example Windows and AIX)
|
||||
/// - 4 UTF-32 (for example Linux)
|
||||
template <>
|
||||
class __code_point_view<wchar_t> {
|
||||
public:
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const wchar_t* __first, const wchar_t* __last)
|
||||
: __first_(__first), __last_(__last) {}
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr const wchar_t* __position() const noexcept { return __first_; }
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
|
||||
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
|
||||
|
||||
if constexpr (sizeof(wchar_t) == 2) {
|
||||
char32_t __result = *__first_++;
|
||||
// Is the code unit part of a surrogate pair? See
|
||||
// https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
|
||||
if (__result >= 0xd800 && __result <= 0xDfff) {
|
||||
// Malformed Unicode.
|
||||
if (__first_ == __last_) [[unlikely]]
|
||||
return __replacement_character;
|
||||
|
||||
__result -= 0xd800;
|
||||
__result <<= 10;
|
||||
__result += *__first_++ - 0xdc00;
|
||||
__result += 0x10000;
|
||||
}
|
||||
return __result;
|
||||
|
||||
} else if constexpr (sizeof(wchar_t) == 4) {
|
||||
char32_t __result = *__first_++;
|
||||
if (__result > 0x10FFFF) [[unlikely]]
|
||||
return __replacement_character;
|
||||
return __result;
|
||||
} else {
|
||||
// TODO FMT P2593R0 Use static_assert(false, "sizeof(wchar_t) has a not implemented value");
|
||||
_LIBCPP_ASSERT(sizeof(wchar_t) == 0, "sizeof(wchar_t) has a not implemented value");
|
||||
__libcpp_unreachable();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const wchar_t* __first_;
|
||||
const wchar_t* __last_;
|
||||
};
|
||||
# endif
|
||||
|
||||
/// Determines whether there is an extended grapheme cluster break between two
/// adjacent code points.
///
/// The rules are evaluated in the order listed in
/// https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules
///
/// \param __RI_break_allowed Pairing state for regional indicators. It is
///        toggled every time two regional indicators are adjacent.
/// \param __has_extened_pictographic Whether an Extended_Pictographic code
///        point was seen earlier in the current cluster.
/// \param __prev The property of the code point before the tested position.
/// \param __next_ The property of the code point after the tested position.
///
/// \returns \c true when a break is allowed between the two code points.
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break(
    bool& __RI_break_allowed,
    bool __has_extened_pictographic,
    __extended_grapheme_custer_property_boundary::__property __prev,
    __extended_grapheme_custer_property_boundary::__property __next_) {
  using __extended_grapheme_custer_property_boundary::__property;

  if (__prev == __property::__Extended_Pictographic)
    __has_extened_pictographic = true;

  // GB1 + GB2: breaks at the start and end of the text are not decided here.
  _LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1
  _LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller"); // GB2

  // GB3: CR x LF
  if (__prev == __property::__CR && __next_ == __property::__LF)
    return false;

  // GB4 + GB5: break after and before Control, CR, and LF.
  const bool __prev_is_control =
      __prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF;
  const bool __next_is_control =
      __next_ == __property::__Control || __next_ == __property::__CR || __next_ == __property::__LF;
  if (__prev_is_control || __next_is_control)
    return true;

  // GB6 - GB8: Hangul syllable sequences are kept together.
  if (__prev == __property::__L) { // GB6
    if (__next_ == __property::__L || __next_ == __property::__V || __next_ == __property::__LV ||
        __next_ == __property::__LVT)
      return false;
  } else if (__prev == __property::__LV || __prev == __property::__V) { // GB7
    if (__next_ == __property::__V || __next_ == __property::__T)
      return false;
  } else if (__prev == __property::__LVT || __prev == __property::__T) { // GB8
    if (__next_ == __property::__T)
      return false;
  }

  // GB9:  no break before Extend or ZWJ.
  // GB9a: no break before SpacingMark.
  // GB9b: no break after Prepend.
  if (__next_ == __property::__Extend || __next_ == __property::__ZWJ || __next_ == __property::__SpacingMark ||
      __prev == __property::__Prepend)
    return false;

  // GB11: \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
  //
  // GB9 already keeps "Extended_Pictographic Extend* ZWJ" together. The only
  // part left to test here is the ZWJ x Extended_Pictographic position, where
  // the preceding Extended_Pictographic is remembered in
  // __has_extened_pictographic.
  if (__has_extened_pictographic && __prev == __property::__ZWJ && __next_ == __property::__Extended_Pictographic)
    return false;

  // GB12 + GB13: regional indicators are grouped in pairs. The state flips at
  // every RI x RI position, so the break is allowed at every second position.
  if (__prev == __property::__Regional_Indicator && __next_ == __property::__Regional_Indicator) {
    __RI_break_allowed = !__RI_break_allowed;
    return __RI_break_allowed;
  }

  // GB999: otherwise, break everywhere.
  return true;
}
|
||||
|
||||
/// Helper class to extract an extended grapheme cluster from a Unicode character range.
|
||||
///
|
||||
/// This function is used to determine the column width of an extended grapheme
|
||||
/// cluster. In order to do that only the first code point is evaluated.
|
||||
/// Therefore only this code point is extracted.
|
||||
template <class _CharT>
|
||||
class __extended_grapheme_cluster_view {
|
||||
public:
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(const _CharT* __first, const _CharT* __last)
|
||||
: __code_point_view_(__first, __last),
|
||||
__next_code_point_(__code_point_view_.__consume()),
|
||||
__next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
|
||||
|
||||
struct __cluster {
|
||||
/// The first code point of the extended grapheme cluster.
|
||||
///
|
||||
/// The first code point is used to estimate the width of the extended
|
||||
/// grapheme cluster.
|
||||
char32_t __code_point_;
|
||||
|
||||
/// Points one beyond the last code unit in the extended grapheme cluster.
|
||||
///
|
||||
/// It's expected the caller has the start position and thus can determine
|
||||
/// the code unit range of the extended grapheme cluster.
|
||||
const _CharT* __last_;
|
||||
};
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
|
||||
_LIBCPP_ASSERT(
|
||||
__next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
|
||||
"can't move beyond the end of input");
|
||||
char32_t __code_point = __next_code_point_;
|
||||
if (!__code_point_view_.__at_end())
|
||||
return {__code_point, __get_break()};
|
||||
|
||||
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
|
||||
return {__code_point, __code_point_view_.__position()};
|
||||
}
|
||||
|
||||
private:
|
||||
__code_point_view<_CharT> __code_point_view_;
|
||||
|
||||
char32_t __next_code_point_;
|
||||
__extended_grapheme_custer_property_boundary::__property __next_prop_;
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __get_break() {
|
||||
bool __RI_break_allowed = true;
|
||||
bool __has_extened_pictographic = false;
|
||||
while (true) {
|
||||
const _CharT* __result = __code_point_view_.__position();
|
||||
__extended_grapheme_custer_property_boundary::__property __prev = __next_prop_;
|
||||
if (__code_point_view_.__at_end()) {
|
||||
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
|
||||
return __result;
|
||||
}
|
||||
__next_code_point_ = __code_point_view_.__consume();
|
||||
__next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
|
||||
|
||||
__has_extened_pictographic |=
|
||||
__prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic;
|
||||
|
||||
if (__at_extended_grapheme_cluster_break(__RI_break_allowed, __has_extened_pictographic, __prev, __next_prop_))
|
||||
return __result;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace __unicode
|
||||
|
||||
# endif // _LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
#endif //_LIBCPP_STD_VER > 17
|
||||
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
|
||||
#endif // _LIBCPP___FORMAT_UNICODE_H
|
@ -157,6 +157,7 @@ namespace std {
|
||||
#include <__format/formatter_pointer.h>
|
||||
#include <__format/formatter_string.h>
|
||||
#include <__format/parser_std_format_spec.h>
|
||||
#include <__format/unicode.h>
|
||||
#include <__iterator/back_insert_iterator.h>
|
||||
#include <__iterator/incrementable_traits.h>
|
||||
#include <__variant/monostate.h>
|
||||
|
@ -616,32 +616,34 @@ module std [system] {
|
||||
export *
|
||||
|
||||
module __format {
|
||||
module buffer { private header "__format/buffer.h" }
|
||||
module concepts { private header "__format/concepts.h" }
|
||||
module enable_insertable { private header "__format/enable_insertable.h" }
|
||||
module format_arg { private header "__format/format_arg.h" }
|
||||
module format_arg_store { private header "__format/format_arg_store.h" }
|
||||
module format_args { private header "__format/format_args.h" }
|
||||
module buffer { private header "__format/buffer.h" }
|
||||
module concepts { private header "__format/concepts.h" }
|
||||
module enable_insertable { private header "__format/enable_insertable.h" }
|
||||
module extended_grapheme_cluster_table { private header "__format/extended_grapheme_cluster_table.h" }
|
||||
module format_arg { private header "__format/format_arg.h" }
|
||||
module format_arg_store { private header "__format/format_arg_store.h" }
|
||||
module format_args { private header "__format/format_args.h" }
|
||||
module format_context {
|
||||
private header "__format/format_context.h"
|
||||
export optional
|
||||
export locale
|
||||
}
|
||||
module format_error { private header "__format/format_error.h" }
|
||||
module format_fwd { private header "__format/format_fwd.h" }
|
||||
module format_parse_context { private header "__format/format_parse_context.h" }
|
||||
module format_string { private header "__format/format_string.h" }
|
||||
module format_to_n_result { private header "__format/format_to_n_result.h" }
|
||||
module formatter { private header "__format/formatter.h" }
|
||||
module formatter_bool { private header "__format/formatter_bool.h" }
|
||||
module formatter_char { private header "__format/formatter_char.h" }
|
||||
module formatter_floating_point { private header "__format/formatter_floating_point.h" }
|
||||
module formatter_integer { private header "__format/formatter_integer.h" }
|
||||
module formatter_integral { private header "__format/formatter_integral.h" }
|
||||
module formatter_output { private header "__format/formatter_output.h" }
|
||||
module formatter_pointer { private header "__format/formatter_pointer.h" }
|
||||
module formatter_string { private header "__format/formatter_string.h" }
|
||||
module parser_std_format_spec { private header "__format/parser_std_format_spec.h" }
|
||||
module format_error { private header "__format/format_error.h" }
|
||||
module format_fwd { private header "__format/format_fwd.h" }
|
||||
module format_parse_context { private header "__format/format_parse_context.h" }
|
||||
module format_string { private header "__format/format_string.h" }
|
||||
module format_to_n_result { private header "__format/format_to_n_result.h" }
|
||||
module formatter { private header "__format/formatter.h" }
|
||||
module formatter_bool { private header "__format/formatter_bool.h" }
|
||||
module formatter_char { private header "__format/formatter_char.h" }
|
||||
module formatter_floating_point { private header "__format/formatter_floating_point.h" }
|
||||
module formatter_integer { private header "__format/formatter_integer.h" }
|
||||
module formatter_integral { private header "__format/formatter_integral.h" }
|
||||
module formatter_output { private header "__format/formatter_output.h" }
|
||||
module formatter_pointer { private header "__format/formatter_pointer.h" }
|
||||
module formatter_string { private header "__format/formatter_string.h" }
|
||||
module parser_std_format_spec { private header "__format/parser_std_format_spec.h" }
|
||||
module unicode { private header "__format/unicode.h" }
|
||||
}
|
||||
}
|
||||
module forward_list {
|
||||
|
@ -307,6 +307,7 @@ END-SCRIPT
|
||||
#include <__format/buffer.h> // expected-error@*:* {{use of private header from outside its module: '__format/buffer.h'}}
|
||||
#include <__format/concepts.h> // expected-error@*:* {{use of private header from outside its module: '__format/concepts.h'}}
|
||||
#include <__format/enable_insertable.h> // expected-error@*:* {{use of private header from outside its module: '__format/enable_insertable.h'}}
|
||||
#include <__format/extended_grapheme_cluster_table.h> // expected-error@*:* {{use of private header from outside its module: '__format/extended_grapheme_cluster_table.h'}}
|
||||
#include <__format/format_arg.h> // expected-error@*:* {{use of private header from outside its module: '__format/format_arg.h'}}
|
||||
#include <__format/format_arg_store.h> // expected-error@*:* {{use of private header from outside its module: '__format/format_arg_store.h'}}
|
||||
#include <__format/format_args.h> // expected-error@*:* {{use of private header from outside its module: '__format/format_args.h'}}
|
||||
@ -326,6 +327,7 @@ END-SCRIPT
|
||||
#include <__format/formatter_pointer.h> // expected-error@*:* {{use of private header from outside its module: '__format/formatter_pointer.h'}}
|
||||
#include <__format/formatter_string.h> // expected-error@*:* {{use of private header from outside its module: '__format/formatter_string.h'}}
|
||||
#include <__format/parser_std_format_spec.h> // expected-error@*:* {{use of private header from outside its module: '__format/parser_std_format_spec.h'}}
|
||||
#include <__format/unicode.h> // expected-error@*:* {{use of private header from outside its module: '__format/unicode.h'}}
|
||||
#include <__functional/binary_function.h> // expected-error@*:* {{use of private header from outside its module: '__functional/binary_function.h'}}
|
||||
#include <__functional/binary_negate.h> // expected-error@*:* {{use of private header from outside its module: '__functional/binary_negate.h'}}
|
||||
#include <__functional/bind.h> // expected-error@*:* {{use of private header from outside its module: '__functional/bind.h'}}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,92 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
// UNSUPPORTED: libcpp-has-no-incomplete-format
|
||||
// TODO FMT Fix this test using GCC, it currently times out.
|
||||
// UNSUPPORTED: gcc-12
|
||||
|
||||
// <format>
|
||||
|
||||
// Tests the implementation of the extended grapheme cluster boundaries per
|
||||
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
|
||||
//
|
||||
// The tests are based on the test data provided by Unicode
|
||||
// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
|
||||
|
||||
#include <cassert>
|
||||
#include <format>
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
|
||||
#include "extended_grapheme_cluster.h"
|
||||
|
||||
// Validates whether the number of code points in our "database" matches with
|
||||
// the number in the Unicode. The assumption is when the number of items per
|
||||
// property matches the code points themselves also match.
|
||||
namespace {
|
||||
namespace cluster = std::__extended_grapheme_custer_property_boundary;
|
||||
// Counts the code points stored in cluster::__entries for the given property.
//
// The low 4 bits of an entry are compared against the property. For matching
// entries, one plus the value of the next 7 bits is added to the total.
constexpr int count_entries(cluster::__property property) {
  int total = 0;
  for (auto entry : cluster::__entries) {
    if (static_cast<cluster::__property>(entry & 0xf) == property)
      total += 1 + static_cast<int>((entry >> 4) & 0x7f);
  }
  return total;
}
|
||||
|
||||
static_assert(count_entries(cluster::__property::__Prepend) == 26);
|
||||
static_assert(count_entries(cluster::__property::__CR) == 1);
|
||||
static_assert(count_entries(cluster::__property::__LF) == 1);
|
||||
static_assert(count_entries(cluster::__property::__Control) == 3886);
|
||||
static_assert(count_entries(cluster::__property::__Extend) == 2095);
|
||||
static_assert(count_entries(cluster::__property::__Regional_Indicator) == 26);
|
||||
static_assert(count_entries(cluster::__property::__SpacingMark) == 388);
|
||||
static_assert(count_entries(cluster::__property::__L) == 125);
|
||||
static_assert(count_entries(cluster::__property::__V) == 95);
|
||||
static_assert(count_entries(cluster::__property::__T) == 137);
|
||||
static_assert(count_entries(cluster::__property::__LV) == 399);
|
||||
static_assert(count_entries(cluster::__property::__LVT) == 10773);
|
||||
static_assert(count_entries(cluster::__property::__ZWJ) == 1);
|
||||
static_assert(count_entries(cluster::__property::__Extended_Pictographic) == 3537);
|
||||
|
||||
} // namespace
|
||||
|
||||
template <class Data>
|
||||
constexpr void test(const Data& data) {
|
||||
for (const auto& d : data) {
|
||||
assert(d.code_points.size() == d.breaks.size());
|
||||
|
||||
std::__unicode::__extended_grapheme_cluster_view view{d.input.data(), d.input.data() + d.input.size()};
|
||||
for (size_t i = 0; i < d.breaks.size(); ++i) {
|
||||
auto r = view.__consume();
|
||||
assert(r.__code_point_ == d.code_points[i]);
|
||||
assert(r.__last_ == d.input.data() + d.breaks[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
constexpr bool test() {
|
||||
test(data_utf8);
|
||||
|
||||
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
|
||||
if constexpr (sizeof(wchar_t) == 2)
|
||||
test(data_utf16);
|
||||
else
|
||||
test(data_utf32);
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Only the run-time evaluation is enabled; the compile-time evaluation below
// is commented out.
int main(int, char**) {
  test();
  // static_assert(test());

  return 0;
}
|
@ -1,109 +0,0 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
// UNSUPPORTED: libcpp-has-no-incomplete-format
|
||||
|
||||
// UTF-32 doesn't work properly
|
||||
// XFAIL: windows
|
||||
|
||||
// <format>
|
||||
|
||||
// Tests the Unicode width support of the standard format specifiers.
|
||||
// It tests [format.string.std]/8 - 11:
|
||||
// - Properly determining the estimated width of a Unicode string.
|
||||
// - Properly truncating to the wanted maximum width.
|
||||
|
||||
// This version runs the test when the platform doesn't have Unicode support.
|
||||
// REQUIRES: libcpp-has-no-unicode
|
||||
|
||||
#include <format>
|
||||
#include <cassert>
|
||||
|
||||
#include "test_macros.h"
|
||||
#include "make_string.h"
|
||||
|
||||
#define CSTR(S) MAKE_CSTRING(CharT, S)
|
||||
|
||||
using namespace std::__format_spec;
|
||||
|
||||
template <class CharT>
|
||||
constexpr bool operator==(const __string_alignment<CharT>& lhs,
|
||||
const __string_alignment<CharT>& rhs) noexcept {
|
||||
return lhs.__last == rhs.__last && lhs.__size == rhs.__size &&
|
||||
lhs.__align == rhs.__align;
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
|
||||
const CharT* str, size_t width,
|
||||
size_t precision) {
|
||||
std::basic_string_view<CharT> sv{str};
|
||||
__string_alignment<CharT> expected{sv.begin() + offset, size, align};
|
||||
__string_alignment<CharT> traits =
|
||||
__get_string_alignment(sv.begin(), sv.end(), width, precision);
|
||||
assert(traits == expected);
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void get_string_alignment() {
|
||||
// Truncate the input.
|
||||
get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);
|
||||
|
||||
// The 2-column character gets half accepted.
|
||||
get_string_alignment(2, 2, false, CSTR("a\u115f"), 0, 2);
|
||||
|
||||
// No alignment since the number of characters fits.
|
||||
get_string_alignment(2, 2, false, CSTR("a\u115f"), 2, 2);
|
||||
|
||||
// Same but for a 2-column 4-byte UTF-8 sequence
|
||||
get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 0, 2);
|
||||
get_string_alignment(2, 2, false, CSTR("a\U0001f300"), 2, 2);
|
||||
|
||||
// No alignment required.
|
||||
get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
|
||||
get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);
|
||||
|
||||
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
|
||||
3 + 2 * (sizeof(CharT) == 1), false, CSTR("ab\u1111"), 2,
|
||||
-1);
|
||||
|
||||
// Doesn't evaluate 'c' so size -> 0
|
||||
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
|
||||
3 + 2 * (sizeof(CharT) == 1), false,
|
||||
CSTR("a\u115fc") /* 2-column character */, 3, -1);
|
||||
// Extend width
|
||||
get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
|
||||
get_string_alignment(3 + 2 * (sizeof(CharT) == 1),
|
||||
3 + 2 * (sizeof(CharT) == 1), true,
|
||||
CSTR("a\u1160c") /* 1-column character */, 6, -1);
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void test() {
|
||||
get_string_alignment<CharT>();
|
||||
}
|
||||
|
||||
constexpr bool test() {
|
||||
test<char>();
|
||||
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
|
||||
test<wchar_t>();
|
||||
#endif
|
||||
#ifndef _LIBCPP_HAS_NO_CHAR8_T
|
||||
test<char8_t>();
|
||||
#endif
|
||||
test<char16_t>();
|
||||
test<char32_t>();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Evaluates the tests at compile time and at run time.
int main(int, char**) {
  static_assert(test());
  test();

  return 0;
}
|
@ -1,266 +0,0 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
// UNSUPPORTED: libcpp-has-no-incomplete-format
|
||||
|
||||
// <format>
|
||||
|
||||
// Tests the Unicode width support of the standard format specifiers.
|
||||
// It tests [format.string.std]/8 - 11:
|
||||
// - Properly determining the estimated width of a Unicode string.
|
||||
// - Properly truncating to the wanted maximum width.
|
||||
|
||||
// This version runs the test when the platform has Unicode support.
|
||||
// UNSUPPORTED: libcpp-has-no-unicode
|
||||
|
||||
#include <format>
|
||||
#include <cassert>
|
||||
|
||||
#include "test_macros.h"
|
||||
#include "make_string.h"
|
||||
|
||||
#define CSTR(S) MAKE_CSTRING(CharT, S)
|
||||
|
||||
using namespace std::__format_spec;
|
||||
|
||||
template <class CharT>
|
||||
constexpr bool operator==(const __string_alignment<CharT>& lhs,
|
||||
const __string_alignment<CharT>& rhs) noexcept {
|
||||
return lhs.__last == rhs.__last && lhs.__size == rhs.__size &&
|
||||
lhs.__align == rhs.__align;
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void get_string_alignment(size_t offset, ptrdiff_t size, bool align,
|
||||
const CharT* str, size_t width,
|
||||
size_t precision) {
|
||||
std::basic_string_view<CharT> sv{str};
|
||||
__string_alignment<CharT> expected{sv.begin() + offset, size, align};
|
||||
__string_alignment<CharT> traits =
|
||||
__get_string_alignment(sv.begin(), sv.end(), width, precision);
|
||||
assert(traits == expected);
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void estimate_column_width_fast(size_t expected, const CharT* str) {
|
||||
std::basic_string_view<CharT> sv{str};
|
||||
const CharT* out =
|
||||
__detail::__estimate_column_width_fast(sv.begin(), sv.end());
|
||||
assert(out == sv.begin() + expected);
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void estimate_column_width_fast() {
|
||||
|
||||
// No unicode
|
||||
estimate_column_width_fast(3, CSTR("abc"));
|
||||
estimate_column_width_fast(3, CSTR("a\u007fc"));
|
||||
|
||||
if constexpr (sizeof(CharT) == 1) {
|
||||
// UTF-8 stop at the first multi-byte character.
|
||||
estimate_column_width_fast(0, CSTR("\u0080bc"));
|
||||
estimate_column_width_fast(1, CSTR("a\u0080c"));
|
||||
estimate_column_width_fast(2, CSTR("ab\u0080"));
|
||||
estimate_column_width_fast(1, CSTR("aßc"));
|
||||
|
||||
estimate_column_width_fast(1, CSTR("a\u07ffc"));
|
||||
estimate_column_width_fast(1, CSTR("a\u0800c"));
|
||||
|
||||
estimate_column_width_fast(1, CSTR("a\u10ffc"));
|
||||
} else {
|
||||
// UTF-16/32 stop at the first multi-column character.
|
||||
estimate_column_width_fast(3, CSTR("\u0080bc"));
|
||||
estimate_column_width_fast(3, CSTR("a\u0080c"));
|
||||
estimate_column_width_fast(3, CSTR("ab\u0080"));
|
||||
estimate_column_width_fast(3, CSTR("aßc"));
|
||||
|
||||
estimate_column_width_fast(3, CSTR("a\u07ffc"));
|
||||
estimate_column_width_fast(3, CSTR("a\u0800c"));
|
||||
|
||||
estimate_column_width_fast(3, CSTR("a\u10ffc"));
|
||||
}
|
||||
// First 2-column character
|
||||
estimate_column_width_fast(1, CSTR("a\u1100c"));
|
||||
|
||||
estimate_column_width_fast(1, CSTR("a\U0000ffffc"));
|
||||
estimate_column_width_fast(1, CSTR("a\U00010000c"));
|
||||
estimate_column_width_fast(1, CSTR("a\U0010FFFFc"));
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void estimate_column_width(size_t expected, const CharT* str) {
|
||||
std::basic_string_view<CharT> sv{str};
|
||||
std::__format_spec::__detail::__column_width_result<CharT> column_info =
|
||||
__detail::__estimate_column_width(sv.begin(), sv.end(), -1);
|
||||
assert(column_info.__width == expected);
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void estimate_column_width() {
|
||||
//*** 1-byte code points ***
|
||||
estimate_column_width(1, CSTR(" "));
|
||||
estimate_column_width(1, CSTR("~"));
|
||||
|
||||
//*** 2-byte code points ***
|
||||
estimate_column_width(1, CSTR("\u00a1")); // INVERTED EXCLAMATION MARK
|
||||
estimate_column_width(1, CSTR("\u07ff")); // NKO TAMAN SIGN
|
||||
|
||||
//*** 3-byte code points ***
|
||||
estimate_column_width(1, CSTR("\u0800")); // SAMARITAN LETTER ALAF
|
||||
estimate_column_width(1, CSTR("\ufffd")); // REPLACEMENT CHARACTER
|
||||
|
||||
// 2 column ranges
|
||||
estimate_column_width(2, CSTR("\u1100")); // HANGUL CHOSEONG KIYEOK
|
||||
estimate_column_width(2, CSTR("\u115f")); // HANGUL CHOSEONG FILLER
|
||||
|
||||
estimate_column_width(2, CSTR("\u2329")); // LEFT-POINTING ANGLE BRACKET
|
||||
estimate_column_width(2, CSTR("\u232a")); // RIGHT-POINTING ANGLE BRACKET
|
||||
|
||||
estimate_column_width(2, CSTR("\u2e80")); // CJK RADICAL REPEAT
|
||||
estimate_column_width(2, CSTR("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
|
||||
|
||||
estimate_column_width(2, CSTR("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
|
||||
estimate_column_width(2, CSTR("\ua4cf")); // U+A4D0 LISU LETTER BA
|
||||
|
||||
estimate_column_width(2, CSTR("\uac00")); // <Hangul Syllable, First>
|
||||
estimate_column_width(2, CSTR("\ud7a3")); // Hangul Syllable Hih
|
||||
|
||||
estimate_column_width(2, CSTR("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
|
||||
estimate_column_width(2, CSTR("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
|
||||
|
||||
estimate_column_width(2,
|
||||
CSTR("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
|
||||
estimate_column_width(
|
||||
2, CSTR("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
|
||||
|
||||
estimate_column_width(
|
||||
2, CSTR("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
|
||||
estimate_column_width(2,
|
||||
CSTR("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
|
||||
|
||||
estimate_column_width(2, CSTR("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
|
||||
estimate_column_width(2, CSTR("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
|
||||
estimate_column_width(2, CSTR("\uffe0")); // FULLWIDTH CENT SIGN
|
||||
estimate_column_width(2, CSTR("\uffe6")); // FULLWIDTH WON SIGN
|
||||
|
||||
//*** 4-byte code points ***
|
||||
estimate_column_width(1, CSTR("\U00010000")); // LINEAR B SYLLABLE B008 A
|
||||
estimate_column_width(1, CSTR("\U0010FFFF")); // Undefined Character
|
||||
|
||||
// 2 column ranges
|
||||
estimate_column_width(2, CSTR("\U0001f300")); // CYCLONE
|
||||
estimate_column_width(2, CSTR("\U0001f64f")); // PERSON WITH FOLDED HANDS
|
||||
estimate_column_width(
|
||||
2, CSTR("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
|
||||
estimate_column_width(2, CSTR("\U0001f9ff")); // NAZAR AMULET
|
||||
estimate_column_width(
|
||||
2, CSTR("\U00020000")); // <CJK Ideograph Extension B, First>
|
||||
estimate_column_width(2, CSTR("\U0002fffd")); // Undefined Character
|
||||
estimate_column_width(
|
||||
2, CSTR("\U00030000")); // <CJK Ideograph Extension G, First>
|
||||
estimate_column_width(2, CSTR("\U0003fffd")); // Undefined Character
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void get_string_alignment() {
|
||||
// Truncate the input.
|
||||
get_string_alignment(2, 2, false, CSTR("abc"), 0, 2);
|
||||
|
||||
// The 2-column character gets entirely rejected.
|
||||
get_string_alignment(1, 1, false, CSTR("a\u115f"), 0, 2);
|
||||
|
||||
// Due to the requested width extra alignment is required.
|
||||
get_string_alignment(1, 1, true, CSTR("a\u115f"), 2, 2);
|
||||
|
||||
// Same but for a 2-column 4-byte UTF-8 sequence
|
||||
get_string_alignment(1, 1, false, CSTR("a\U0001f300"), 0, 2);
|
||||
get_string_alignment(1, 1, true, CSTR("a\U0001f300"), 2, 2);
|
||||
|
||||
// No alignment required.
|
||||
get_string_alignment(3, 3, false, CSTR("abc"), 2, -1);
|
||||
get_string_alignment(3, 3, false, CSTR("abc"), 3, -1);
|
||||
|
||||
// Special case, we have a special character already parsed and have enough
|
||||
// width to satisfy the minimum required width.
|
||||
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 0, false, CSTR("ab\u1111"),
|
||||
2, -1);
|
||||
|
||||
// Evaluates all so size ->4
|
||||
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 4, false,
|
||||
CSTR("a\u115fc") /* 2-column character */, 3, -1);
|
||||
// Evaluates all so size ->4
|
||||
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 4, false,
|
||||
CSTR("a\u115fc") /* 2-column character */, 4, -1);
|
||||
|
||||
// Evaluates all so size ->5
|
||||
get_string_alignment(4 + 2 * (sizeof(CharT) == 1), 5, false,
|
||||
CSTR("a\u115fcd") /* 2-column character */, 4, -1);
|
||||
|
||||
// Evaluates all so size ->5
|
||||
get_string_alignment(4 + 2 * (sizeof(CharT) == 1), 5, false,
|
||||
CSTR("a\u115fcd") /* 2-column character */, 5, -1);
|
||||
|
||||
// Extend width
|
||||
get_string_alignment(3, 3, true, CSTR("abc"), 4, -1);
|
||||
get_string_alignment(3 + 2 * (sizeof(CharT) == 1), 3, true,
|
||||
CSTR("a\u1160c") /* 1-column character */, 4, -1);
|
||||
|
||||
// In this case the threshold where the width is still determined.
|
||||
get_string_alignment(2 + 2 * (sizeof(CharT) == 1), 3, false, CSTR("i\u1110"),
|
||||
2, -1);
|
||||
|
||||
// The width is no longer exactly determined.
|
||||
get_string_alignment(2 + 2 * (sizeof(CharT) == 1), 0, false, CSTR("i\u1110"),
|
||||
1, -1);
|
||||
|
||||
// Extend width and truncate input.
|
||||
get_string_alignment(1, 1, true, CSTR("abc"), 3, 1);
|
||||
|
||||
if constexpr (sizeof(CharT) == 1) {
|
||||
// Corrupt UTF-8 sequence.
|
||||
get_string_alignment(2, 2, false, CSTR("a\xc0"), 0, 3);
|
||||
get_string_alignment(2, 2, false, CSTR("a\xe0"), 0, 3);
|
||||
get_string_alignment(2, 2, false, CSTR("a\xf0"), 0, 3);
|
||||
} else if constexpr (sizeof(CharT) == 2) {
|
||||
// Corrupt UTF-16 sequence.
|
||||
if constexpr (std::same_as<CharT, char16_t>)
|
||||
get_string_alignment(2, 2, false, u"a\xdddd", 0, 3);
|
||||
else
|
||||
// Corrupt UTF-16 wchar_t sequence.
|
||||
get_string_alignment(2, 2, false, L"a\xdddd", 0, 3);
|
||||
}
|
||||
// UTF-32 doesn't combine characters, thus no corruption tests.
|
||||
}
|
||||
|
||||
template <class CharT>
|
||||
constexpr void test() {
|
||||
estimate_column_width_fast<CharT>();
|
||||
estimate_column_width<CharT>();
|
||||
get_string_alignment<CharT>();
|
||||
}
|
||||
|
||||
constexpr bool test() {
|
||||
test<char>();
|
||||
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
|
||||
test<wchar_t>();
|
||||
#endif
|
||||
#ifndef _LIBCPP_HAS_NO_CHAR8_T
|
||||
test<char8_t>();
|
||||
#endif
|
||||
test<char16_t>();
|
||||
test<char32_t>();
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int, char**) {
|
||||
test();
|
||||
static_assert(test());
|
||||
|
||||
return 0;
|
||||
}
|
158
libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp
Normal file
158
libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp
Normal file
@ -0,0 +1,158 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
// UNSUPPORTED: libcpp-has-no-incomplete-format
|
||||
|
||||
// Force unicode to be disabled.
|
||||
// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_HAS_NO_UNICODE
|
||||
|
||||
// <format>
|
||||
|
||||
// Tests Unicode is ignored and handled as ASCII.
|
||||
|
||||
#include <format>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "test_macros.h"
|
||||
|
||||
// Note the comment annotations are from the Unicode test
|
||||
static void test_char() {
|
||||
//*** 1-byte code points ***
|
||||
assert(std::format("{:*^3}", " ") == "* *");
|
||||
assert(std::format("{:*^3}", "~") == "*~*");
|
||||
|
||||
//*** 2-byte code points ***
|
||||
assert(std::format("{:*^4}", "\u00a1") == "*\u00a1*"); // INVERTED EXCLAMATION MARK
|
||||
assert(std::format("{:*^4}", "\u07ff") == "*\u07ff*"); // NKO TAMAN SIGN
|
||||
|
||||
//*** 3-byte code points ***
|
||||
assert(std::format("{:*^5}", "\u0800") == "*\u0800*"); // SAMARITAN LETTER ALAF
|
||||
assert(std::format("{:*^5}", "\ufffd") == "*\ufffd*"); // REPLACEMENT CHARACTER
|
||||
|
||||
// 2 column ranges
|
||||
assert(std::format("{:*^5}", "\u1100") == "*\u1100*"); // HANGUL CHOSEONG KIYEOK
|
||||
assert(std::format("{:*^5}", "\u115f") == "*\u115f*"); // HANGUL CHOSEONG FILLER
|
||||
|
||||
assert(std::format("{:*^5}", "\u2329") == "*\u2329*"); // LEFT-POINTING ANGLE BRACKET
|
||||
assert(std::format("{:*^5}", "\u232a") == "*\u232a*"); // RIGHT-POINTING ANGLE BRACKET
|
||||
|
||||
assert(std::format("{:*^5}", "\u2e80") == "*\u2e80*"); // CJK RADICAL REPEAT
|
||||
assert(std::format("{:*^5}", "\u303e") == "*\u303e*"); // IDEOGRAPHIC VARIATION INDICATOR
|
||||
|
||||
assert(std::format("{:*^5}", "\u3040") == "*\u3040*"); // U+3041 HIRAGANA LETTER SMALL A
|
||||
assert(std::format("{:*^5}", "\ua4cf") == "*\ua4cf*"); // U+A4D0 LISU LETTER BA
|
||||
|
||||
assert(std::format("{:*^5}", "\uac00") == "*\uac00*"); // <Hangul Syllable, First>
|
||||
assert(std::format("{:*^5}", "\ud7a3") == "*\ud7a3*"); // Hangul Syllable Hih
|
||||
|
||||
assert(std::format("{:*^5}", "\uf900") == "*\uf900*"); // CJK COMPATIBILITY IDEOGRAPH-F900
|
||||
assert(std::format("{:*^5}", "\ufaff") == "*\ufaff*"); // U+FB00 LATIN SMALL LIGATURE FF
|
||||
|
||||
assert(std::format("{:*^5}", "\ufe10") == "*\ufe10*"); // PRESENTATION FORM FOR VERTICAL COMMA
|
||||
assert(std::format("{:*^5}", "\ufe19") == "*\ufe19*"); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
|
||||
|
||||
assert(std::format("{:*^5}", "\ufe30") == "*\ufe30*"); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
|
||||
assert(std::format("{:*^5}", "\ufe6f") == "*\ufe6f*"); // U+FE70 ARABIC FATHATAN ISOLATED FORM
|
||||
|
||||
assert(std::format("{:*^5}", "\uff00") == "*\uff00*"); // U+FF01 FULLWIDTH EXCLAMATION MARK
|
||||
assert(std::format("{:*^5}", "\uff60") == "*\uff60*"); // FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
|
||||
assert(std::format("{:*^5}", "\uffe0") == "*\uffe0*"); // FULLWIDTH CENT SIGN
|
||||
assert(std::format("{:*^5}", "\uffe6") == "*\uffe6*"); // FULLWIDTH WON SIGN
|
||||
|
||||
//*** 4-byte code points ***
|
||||
assert(std::format("{:*^6}", "\U00010000") == "*\U00010000*"); // LINEAR B SYLLABLE B008 A
|
||||
assert(std::format("{:*^6}", "\U0010FFFF") == "*\U0010FFFF*"); // Undefined Character
|
||||
|
||||
// 2 column ranges
|
||||
assert(std::format("{:*^6}", "\U0001f300") == "*\U0001f300*"); // CYCLONE
|
||||
assert(std::format("{:*^6}", "\U0001f64f") == "*\U0001f64f*"); // PERSON WITH FOLDED HANDS
|
||||
assert(std::format("{:*^6}", "\U0001f900") == "*\U0001f900*"); // CIRCLED CROSS FORMEE WITH FOUR DOTS
|
||||
assert(std::format("{:*^6}", "\U0001f9ff") == "*\U0001f9ff*"); // NAZAR AMULET
|
||||
assert(std::format("{:*^6}", "\U00020000") == "*\U00020000*"); // <CJK Ideograph Extension B, First>
|
||||
assert(std::format("{:*^6}", "\U0002fffd") == "*\U0002fffd*"); // Undefined Character
|
||||
assert(std::format("{:*^6}", "\U00030000") == "*\U00030000*"); // <CJK Ideograph Extension G, First>
|
||||
assert(std::format("{:*^6}", "\U0003fffd") == "*\U0003fffd*"); // Undefined Character
|
||||
}
|
||||
|
||||
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
|
||||
// Note the comment annotations are from the Unicode test
|
||||
static void test_wchar_t() {
|
||||
if constexpr (sizeof(wchar_t) == 2) {
|
||||
// TODO FMT Add these tests.
|
||||
} else {
|
||||
//*** 1-byte code points ***
|
||||
assert(std::format(L"{:*^3}", L" ") == L"* *");
|
||||
assert(std::format(L"{:*^3}", L"~") == L"*~*");
|
||||
|
||||
//*** 2-byte code points ***
|
||||
assert(std::format(L"{:*^3}", L"\u00a1") == L"*\u00a1*"); // INVERTED EXCLAMATION MARK
|
||||
assert(std::format(L"{:*^3}", L"\u07ff") == L"*\u07ff*"); // NKO TAMAN SIGN
|
||||
|
||||
//*** 3-byte code points ***
|
||||
assert(std::format(L"{:*^3}", L"\u0800") == L"*\u0800*"); // SAMARITAN LETTER ALAF
|
||||
assert(std::format(L"{:*^3}", L"\ufffd") == L"*\ufffd*"); // REPLACEMENT CHARACTER
|
||||
|
||||
// 2 column ranges
|
||||
assert(std::format(L"{:*^3}", L"\u1100") == L"*\u1100*"); // HANGUL CHOSEONG KIYEOK
|
||||
assert(std::format(L"{:*^3}", L"\u115f") == L"*\u115f*"); // HANGUL CHOSEONG FILLER
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\u2329") == L"*\u2329*"); // LEFT-POINTING ANGLE BRACKET
|
||||
assert(std::format(L"{:*^3}", L"\u232a") == L"*\u232a*"); // RIGHT-POINTING ANGLE BRACKET
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\u2e80") == L"*\u2e80*"); // CJK RADICAL REPEAT
|
||||
assert(std::format(L"{:*^3}", L"\u303e") == L"*\u303e*"); // IDEOGRAPHIC VARIATION INDICATOR
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\u3040") == L"*\u3040*"); // U+3041 HIRAGANA LETTER SMALL A
|
||||
assert(std::format(L"{:*^3}", L"\ua4cf") == L"*\ua4cf*"); // U+A4D0 LISU LETTER BA
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\uac00") == L"*\uac00*"); // <Hangul Syllable, First>
|
||||
assert(std::format(L"{:*^3}", L"\ud7a3") == L"*\ud7a3*"); // Hangul Syllable Hih
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\uf900") == L"*\uf900*"); // CJK COMPATIBILITY IDEOGRAPH-F900
|
||||
assert(std::format(L"{:*^3}", L"\ufaff") == L"*\ufaff*"); // U+FB00 LATIN SMALL LIGATURE FF
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\ufe10") == L"*\ufe10*"); // PRESENTATION FORM FOR VERTICAL COMMA
|
||||
assert(std::format(L"{:*^3}", L"\ufe19") == L"*\ufe19*"); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\ufe30") == L"*\ufe30*"); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
|
||||
assert(std::format(L"{:*^3}", L"\ufe6f") == L"*\ufe6f*"); // U+FE70 ARABIC FATHATAN ISOLATED FORM
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\uff00") == L"*\uff00*"); // U+FF01 FULLWIDTH EXCLAMATION MARK
|
||||
assert(std::format(L"{:*^3}", L"\uff60") == L"*\uff60*"); // FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
|
||||
assert(std::format(L"{:*^3}", L"\uffe0") == L"*\uffe0*"); // FULLWIDTH CENT SIGN
|
||||
assert(std::format(L"{:*^3}", L"\uffe6") == L"*\uffe6*"); // FULLWIDTH WON SIGN
|
||||
|
||||
//*** 4-byte code points ***
|
||||
assert(std::format(L"{:*^3}", L"\U00010000") == L"*\U00010000*"); // LINEAR B SYLLABLE B008 A
|
||||
assert(std::format(L"{:*^3}", L"\U0010FFFF") == L"*\U0010FFFF*"); // Undefined Character
|
||||
|
||||
// 2 column ranges
|
||||
assert(std::format(L"{:*^3}", L"\U0001f300") == L"*\U0001f300*"); // CYCLONE
|
||||
assert(std::format(L"{:*^3}", L"\U0001f64f") == L"*\U0001f64f*"); // PERSON WITH FOLDED HANDS
|
||||
assert(std::format(L"{:*^3}", L"\U0001f900") == L"*\U0001f900*"); // CIRCLED CROSS FORMEE WITH FOUR DOTS
|
||||
assert(std::format(L"{:*^3}", L"\U0001f9ff") == L"*\U0001f9ff*"); // NAZAR AMULET
|
||||
assert(std::format(L"{:*^3}", L"\U00020000") == L"*\U00020000*"); // <CJK Ideograph Extension B, First>
|
||||
assert(std::format(L"{:*^3}", L"\U0002fffd") == L"*\U0002fffd*"); // Undefined Character
|
||||
assert(std::format(L"{:*^3}", L"\U00030000") == L"*\U00030000*"); // <CJK Ideograph Extension G, First>
|
||||
assert(std::format(L"{:*^3}", L"\U0003fffd") == L"*\U0003fffd*"); // Undefined Character
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int, char**) {
|
||||
test_char();
|
||||
|
||||
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
|
||||
test_wchar_t();
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
@ -293,8 +293,8 @@ void format_test_string(const W& world, const U& universe, TestFunction check, E
|
||||
}
|
||||
|
||||
template <class CharT, class TestFunction>
|
||||
void format_test_string_unicode(TestFunction check) {
|
||||
(void)check;
|
||||
void format_test_string_unicode([[maybe_unused]] TestFunction check) {
|
||||
// unicode.pass.cpp and ascii.pass.cpp have additional tests.
|
||||
#ifndef TEST_HAS_NO_UNICODE
|
||||
// Make sure all possible types are tested. For clarity don't use macros.
|
||||
if constexpr (std::same_as<CharT, char>) {
|
||||
@ -332,7 +332,7 @@ void format_test_string_unicode(TestFunction check) {
|
||||
check.template operator()<"{:*^5}">(SV("*aßc*"), std::wstring_view(L"aßc"));
|
||||
check.template operator()<"{:*^4.2}">(SV("*aß*"), std::wstring_view(L"aßc"));
|
||||
}
|
||||
# endif
|
||||
# endif // TEST_HAS_NO_WIDE_CHARACTERS
|
||||
|
||||
// ß requires one column
|
||||
check.template operator()<"{}">(SV("aßc"), STR("aßc"));
|
||||
@ -365,6 +365,25 @@ void format_test_string_unicode(TestFunction check) {
|
||||
check.template operator()<"{:-<7}">(SV("a\u1110c---"), STR("a\u1110c"));
|
||||
check.template operator()<"{:-^7}">(SV("-a\u1110c--"), STR("a\u1110c"));
|
||||
check.template operator()<"{:->7}">(SV("---a\u1110c"), STR("a\u1110c"));
|
||||
|
||||
// Examples used in P1868R2
|
||||
check.template operator()<"{:*^3}">(SV("*\u0041*"), STR("\u0041")); // { LATIN CAPITAL LETTER A }
|
||||
check.template operator()<"{:*^3}">(SV("*\u00c1*"), STR("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
|
||||
check.template operator()<"{:*^3}">(
|
||||
SV("*\u0041\u0301*"),
|
||||
STR("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
|
||||
check.template operator()<"{:*^3}">(SV("*\u0132*"), STR("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
|
||||
check.template operator()<"{:*^3}">(SV("*\u0394*"), STR("\u0394")); // { GREEK CAPITAL LETTER DELTA }
|
||||
|
||||
check.template operator()<"{:*^3}">(SV("*\u0429*"), STR("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
|
||||
check.template operator()<"{:*^3}">(SV("*\u05d0*"), STR("\u05d0")); // { HEBREW LETTER ALEF }
|
||||
check.template operator()<"{:*^3}">(SV("*\u0634*"), STR("\u0634")); // { ARABIC LETTER SHEEN }
|
||||
check.template operator()<"{:*^4}">(SV("*\u3009*"), STR("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
|
||||
check.template operator()<"{:*^4}">(SV("*\u754c*"), STR("\u754c")); // { CJK Unified Ideograph-754C }
|
||||
check.template operator()<"{:*^4}">(SV("*\U0001f921*"), STR("\U0001f921")); // { CLOWN FACE }
|
||||
check.template operator()<"{:*^4}">(
|
||||
SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
|
||||
STR("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
|
||||
#endif // TEST_HAS_NO_UNICODE
|
||||
}
|
||||
|
||||
@ -2580,9 +2599,8 @@ void format_tests(TestFunction check, ExceptionTest check_exception) {
|
||||
|
||||
// *** Test char format argument ***
|
||||
// The `char` to `wchar_t` formatting is tested separately.
|
||||
check.template operator()<"hello {}{}{}{}{}{}{}">(SV("hello 09azAZ!"), CharT('0'), CharT('9'), CharT('a'), CharT('z'),
|
||||
CharT('A'), CharT('Z'), CharT('!'));
|
||||
|
||||
check.template operator()<"hello {}{}{}{}{}{}{}">(
|
||||
SV("hello 09azAZ!"), CharT('0'), CharT('9'), CharT('a'), CharT('z'), CharT('A'), CharT('Z'), CharT('!'));
|
||||
format_test_char<CharT>(check, check_exception);
|
||||
format_test_char_as_integer<CharT>(check, check_exception);
|
||||
|
||||
|
@ -0,0 +1,301 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
// UNSUPPORTED: libcpp-has-no-incomplete-format
|
||||
|
||||
// This version runs the test when the platform has Unicode support.
|
||||
// UNSUPPORTED: libcpp-has-no-unicode
|
||||
|
||||
// <format>
|
||||
|
||||
// Tests the Unicode width support of the standard format specifiers.
|
||||
// It tests [format.string.std]/8 - 11:
|
||||
// - Properly determining the estimated width of a Unicode string.
|
||||
// - Properly truncating to the wanted maximum width.
|
||||
|
||||
// More specific extended grapheme cluster boundary rules are tested in
|
||||
// test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
|
||||
// this test is based on test data provided by the Unicode Consortium.
|
||||
|
||||
#include <format>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "make_string.h"
|
||||
#include "test_macros.h"
|
||||
#include "string_literal.h"
|
||||
|
||||
#ifndef TEST_HAS_NO_LOCALIZATION
|
||||
# include <iostream>
|
||||
# include <type_traits>
|
||||
#endif
|
||||
|
||||
#define SV(S) MAKE_STRING_VIEW(CharT, S)
|
||||
|
||||
auto check = []<string_literal fmt, class CharT, class... Args>(
|
||||
std::basic_string_view<CharT> expected, const Args&... args) constexpr {
|
||||
std::basic_string<CharT> out = std::format(fmt.template sv<CharT>(), args...);
|
||||
#ifndef TEST_HAS_NO_LOCALIZATION
|
||||
if constexpr (std::same_as<CharT, char>)
|
||||
if (out != expected)
|
||||
std::cerr << "\nFormat string " << fmt.template sv<char>() << "\nExpected output " << expected
|
||||
<< "\nActual output " << out << '\n';
|
||||
#endif
|
||||
assert(out == expected);
|
||||
};
|
||||
|
||||
template <class CharT>
|
||||
static void test_single_code_point_fill() {
|
||||
//*** 1-byte code points ***
|
||||
check.template operator()<"{:*^3}">(SV("* *"), SV(" "));
|
||||
check.template operator()<"{:*^3}">(SV("*~*"), SV("~"));
|
||||
|
||||
//*** 2-byte code points ***
|
||||
check.template operator()<"{:*^3}">(SV("*\u00a1*"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
|
||||
check.template operator()<"{:*^3}">(SV("*\u07ff*"), SV("\u07ff")); // NKO TAMAN SIGN
|
||||
|
||||
//*** 3-byte code points ***
|
||||
check.template operator()<"{:*^3}">(SV("*\u0800*"), SV("\u0800")); // SAMARITAN LETTER ALAF
|
||||
check.template operator()<"{:*^3}">(SV("*\ufffd*"), SV("\ufffd")); // REPLACEMENT CHARACTER
|
||||
|
||||
// 2 column ranges
|
||||
check.template operator()<"{:*^4}">(SV("*\u1100*"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
|
||||
check.template operator()<"{:*^4}">(SV("*\u115f*"), SV("\u115f")); // HANGUL CHOSEONG FILLER
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\u2329*"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
|
||||
check.template operator()<"{:*^4}">(SV("*\u232a*"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\u2e80*"), SV("\u2e80")); // CJK RADICAL REPEAT
|
||||
check.template operator()<"{:*^4}">(SV("*\u303e*"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\u3040*"), SV("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
|
||||
check.template operator()<"{:*^4}">(SV("*\ua4cf*"), SV("\ua4cf")); // U+A4D0 LISU LETTER BA
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\uac00*"), SV("\uac00")); // <Hangul Syllable, First>
|
||||
check.template operator()<"{:*^4}">(SV("*\ud7a3*"), SV("\ud7a3")); // Hangul Syllable Hih
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\uf900*"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
|
||||
check.template operator()<"{:*^4}">(SV("*\ufaff*"), SV("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\ufe10*"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
|
||||
check.template
|
||||
operator()<"{:*^4}">(SV("*\ufe19*"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\ufe30*"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
|
||||
check.template operator()<"{:*^4}">(SV("*\ufe6f*"), SV("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\uff00*"), SV("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
|
||||
check.template operator()<"{:*^4}">(SV("*\uff60*"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
|
||||
check.template operator()<"{:*^4}">(SV("*\uffe0*"), SV("\uffe0")); // FULLWIDTH CENT SIGN
|
||||
check.template operator()<"{:*^4}">(SV("*\uffe6*"), SV("\uffe6")); // FULLWIDTH WON SIGN
|
||||
|
||||
//*** 4-byte code points ***
|
||||
check.template operator()<"{:*^3}">(SV("*\U00010000*"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
|
||||
check.template operator()<"{:*^3}">(SV("*\U0010FFFF*"), SV("\U0010FFFF")); // Undefined Character
|
||||
|
||||
// 2 column ranges
|
||||
check.template operator()<"{:*^4}">(SV("*\U0001f300*"), SV("\U0001f300")); // CYCLONE
|
||||
check.template operator()<"{:*^4}">(SV("*\U0001f64f*"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
|
||||
check.template operator()<"{:*^4}">(SV("*\U0001f900*"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
|
||||
check.template operator()<"{:*^4}">(SV("*\U0001f9ff*"), SV("\U0001f9ff")); // NAZAR AMULET
|
||||
check.template operator()<"{:*^4}">(SV("*\U00020000*"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
|
||||
check.template operator()<"{:*^4}">(SV("*\U0002fffd*"), SV("\U0002fffd")); // Undefined Character
|
||||
check.template operator()<"{:*^4}">(SV("*\U00030000*"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
|
||||
check.template operator()<"{:*^4}">(SV("*\U0003fffd*"), SV("\U0003fffd")); // Undefined Character
|
||||
}
|
||||
|
||||
// One column output is unaffected.
|
||||
// Two column output is removed, thus the result is only the fill character.
|
||||
template <class CharT>
|
||||
static void test_single_code_point_truncate() {
|
||||
//*** 1-byte code points ***
|
||||
check.template operator()<"{:*^3.1}">(SV("* *"), SV(" "));
|
||||
check.template operator()<"{:*^3.1}">(SV("*~*"), SV("~"));
|
||||
|
||||
//*** 2-byte code points ***
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u00a1*"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u07ff*"), SV("\u07ff")); // NKO TAMAN SIGN
|
||||
|
||||
//*** 3-byte code points ***
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u0800*"), SV("\u0800")); // SAMARITAN LETTER ALAF
|
||||
check.template operator()<"{:*^3.1}">(SV("*\ufffd*"), SV("\ufffd")); // REPLACEMENT CHARACTER
|
||||
|
||||
// 2 column ranges
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u115f")); // HANGUL CHOSEONG FILLER
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u2e80")); // CJK RADICAL REPEAT
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ua4cf")); // U+A4D0 LISU LETTER BA
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uac00")); // <Hangul Syllable, First>
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ud7a3")); // Hangul Syllable Hih
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uffe0")); // FULLWIDTH CENT SIGN
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\uffe6")); // FULLWIDTH WON SIGN
|
||||
|
||||
//*** 4-byte code points ***
|
||||
check.template operator()<"{:*^3.1}">(SV("*\U00010000*"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
|
||||
check.template operator()<"{:*^3.1}">(SV("*\U0010FFFF*"), SV("\U0010FFFF")); // Undefined Character
|
||||
|
||||
// 2 column ranges
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f300")); // CYCLONE
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f9ff")); // NAZAR AMULET
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0002fffd")); // Undefined Character
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0003fffd")); // Undefined Character
|
||||
}
|
||||
|
||||
// The examples used in P1868R2.
|
||||
template <class CharT>
|
||||
static void test_P1868() {
|
||||
// Fill
|
||||
check.template operator()<"{:*^3}">(SV("*\u0041*"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
|
||||
check.template operator()<"{:*^3}">(SV("*\u00c1*"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
|
||||
check.template operator()<"{:*^3}">(
|
||||
SV("*\u0041\u0301*"),
|
||||
SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
|
||||
check.template operator()<"{:*^3}">(SV("*\u0132*"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
|
||||
check.template operator()<"{:*^3}">(SV("*\u0394*"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
|
||||
|
||||
check.template operator()<"{:*^3}">(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
|
||||
check.template operator()<"{:*^3}">(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
|
||||
check.template operator()<"{:*^3}">(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
|
||||
check.template operator()<"{:*^4}">(SV("*\u3009*"), SV("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
|
||||
check.template operator()<"{:*^4}">(SV("*\u754c*"), SV("\u754c")); // { CJK Unified Ideograph-754C }
|
||||
check.template operator()<"{:*^4}">(SV("*\U0001f921*"), SV("\U0001f921")); // { CLOWN FACE }
|
||||
check.template operator()<"{:*^4}">(
|
||||
SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
|
||||
SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
|
||||
|
||||
// Truncate to 1 column: 1 column grapheme clusters are kept together.
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u0041*"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u00c1*"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
|
||||
check.template operator()<"{:*^3.1}">(
|
||||
SV("*\u0041\u0301*"),
|
||||
SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u0132*"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u0394*"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
|
||||
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
|
||||
check.template operator()<"{:*^3.1}">(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\u754c")); // { CJK Unified Ideograph-754C }
|
||||
check.template operator()<"{:*^3.1}">(SV("***"), SV("\U0001f921")); // { CLOWN FACE }
|
||||
check.template operator()<"{:*^3.1}">(
|
||||
SV("***"),
|
||||
SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
|
||||
|
||||
// Truncate to 2 column: 2 column grapheme clusters are kept together.
|
||||
check.template operator()<"{:*^3.2}">(SV("*\u0041*"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
|
||||
check.template operator()<"{:*^3.2}">(SV("*\u00c1*"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
|
||||
check.template operator()<"{:*^3.2}">(
|
||||
SV("*\u0041\u0301*"),
|
||||
SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
|
||||
check.template operator()<"{:*^3.2}">(SV("*\u0132*"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
|
||||
check.template operator()<"{:*^3.2}">(SV("*\u0394*"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
|
||||
|
||||
check.template operator()<"{:*^3.2}">(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
|
||||
check.template operator()<"{:*^3.2}">(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
|
||||
check.template operator()<"{:*^3.2}">(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
|
||||
check.template operator()<"{:*^4.2}">(SV("*\u3009*"), SV("\u3009")); // { RIGHT-POINTING ANGLE BRACKET }
|
||||
check.template operator()<"{:*^4.2}">(SV("*\u754c*"), SV("\u754c")); // { CJK Unified Ideograph-754C }
|
||||
check.template operator()<"{:*^4.2}">(SV("*\U0001f921*"), SV("\U0001f921")); // { CLOWN FACE }
|
||||
check.template operator()<"{:*^4.2}">(
|
||||
SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
|
||||
SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
|
||||
}
|
||||
|
||||
#ifdef _LIBCPP_VERSION
// libc++-specific checks for input that is not well-formed UTF-8. The Standard
// leaves the handling of such input unspecified, so these checks only run
// against libc++.
//
// Every expected output is the input centred with a single '*' on each side,
// and every requested width equals the number of code units in the input plus
// two. In other words: each code unit of an ill-formed sequence is expected to
// occupy exactly one column.
//
// Bytes used below:
// - \x8f is a continuation byte without a preceding lead byte,
// - \xcf starts a 2-byte sequence,
// - \xef starts a 3-byte sequence,
// - \xff never occurs in UTF-8,
// - \xf5 is not a continuation byte, so it cuts a started sequence short.
template <class CharT>
static void test_malformed_code_point() {
  if constexpr (sizeof(CharT) == 1) {
    // The ill-formed code unit is the last code unit of the input.
    check.template operator()<"{:*^7}">(SV("*ZZZZ\x8f*"), SV("ZZZZ\x8f"));
    check.template operator()<"{:*^7}">(SV("*ZZZZ\xcf*"), SV("ZZZZ\xcf"));
    check.template operator()<"{:*^7}">(SV("*ZZZZ\xef*"), SV("ZZZZ\xef"));
    check.template operator()<"{:*^7}">(SV("*ZZZZ\xff*"), SV("ZZZZ\xff"));

    // The ill-formed code unit is followed by one ASCII character instead of
    // a continuation byte.
    check.template operator()<"{:*^8}">(SV("*ZZZZ\x8fZ*"), SV("ZZZZ\x8fZ"));
    check.template operator()<"{:*^8}">(SV("*ZZZZ\xcfZ*"), SV("ZZZZ\xcfZ"));
    check.template operator()<"{:*^8}">(SV("*ZZZZ\xefZ*"), SV("ZZZZ\xefZ"));
    check.template operator()<"{:*^8}">(SV("*ZZZZ\xffZ*"), SV("ZZZZ\xffZ"));

    // ... followed by two ASCII characters.
    check.template operator()<"{:*^9}">(SV("*ZZZZ\x8fZZ*"), SV("ZZZZ\x8fZZ"));
    check.template operator()<"{:*^9}">(SV("*ZZZZ\xcfZZ*"), SV("ZZZZ\xcfZZ"));
    check.template operator()<"{:*^9}">(SV("*ZZZZ\xefZZ*"), SV("ZZZZ\xefZZ"));
    check.template operator()<"{:*^9}">(SV("*ZZZZ\xffZZ*"), SV("ZZZZ\xffZZ"));

    // ... followed by three ASCII characters.
    check.template operator()<"{:*^10}">(SV("*ZZZZ\x8fZZZ*"), SV("ZZZZ\x8fZZZ"));
    check.template operator()<"{:*^10}">(SV("*ZZZZ\xcfZZZ*"), SV("ZZZZ\xcfZZZ"));
    check.template operator()<"{:*^10}">(SV("*ZZZZ\xefZZZ*"), SV("ZZZZ\xefZZZ"));
    check.template operator()<"{:*^10}">(SV("*ZZZZ\xffZZZ*"), SV("ZZZZ\xffZZZ"));

    // ... followed by four ASCII characters.
    check.template operator()<"{:*^11}">(SV("*ZZZZ\x8fZZZZ*"), SV("ZZZZ\x8fZZZZ"));
    check.template operator()<"{:*^11}">(SV("*ZZZZ\xcfZZZZ*"), SV("ZZZZ\xcfZZZZ"));
    check.template operator()<"{:*^11}">(SV("*ZZZZ\xefZZZZ*"), SV("ZZZZ\xefZZZZ"));
    check.template operator()<"{:*^11}">(SV("*ZZZZ\xffZZZZ*"), SV("ZZZZ\xffZZZZ"));

    // A started sequence that is interrupted by \xf5 before it is complete.
    check.template operator()<"{:*^8}">(SV("*ZZZZ\xef\xf5*"), SV("ZZZZ\xef\xf5"));
    check.template operator()<"{:*^12}">(SV("*ZZZZ\xef\xf5ZZZZ*"), SV("ZZZZ\xef\xf5ZZZZ"));
    check.template operator()<"{:*^9}">(SV("*ZZZZ\xff\xf5\xf5*"), SV("ZZZZ\xff\xf5\xf5"));
    check.template operator()<"{:*^13}">(SV("*ZZZZ\xff\xf5\xf5ZZZZ*"), SV("ZZZZ\xff\xf5\xf5ZZZZ"));

  } else if constexpr (sizeof(CharT) == 2) {
    // TODO FMT Add these tests.
  }
  // UTF-32 doesn't combine characters, thus no corruption tests.
}
#endif
|
||||
|
||||
template <class CharT>
|
||||
static void test() {
|
||||
test_single_code_point_fill<CharT>();
|
||||
test_single_code_point_truncate<CharT>();
|
||||
test_P1868<CharT>();
|
||||
|
||||
#ifdef _LIBCPP_VERSION
|
||||
test_malformed_code_point<CharT>();
|
||||
#endif
|
||||
}
|
||||
|
||||
// Entry point: exercises the narrow character tests and, unless the platform
// has no wide character support, the wide character tests too.
int main(int, char**) {
  test<char>();

#ifndef TEST_HAS_NO_WIDE_CHARACTERS
  test<wchar_t>();
#endif

  return 0;
}
|
326
libcxx/utils/generate_extended_grapheme_cluster_table.py
Executable file
326
libcxx/utils/generate_extended_grapheme_cluster_table.py
Executable file
@ -0,0 +1,326 @@
|
||||
#!/usr/bin/env python
|
||||
# ===----------------------------------------------------------------------===##
|
||||
#
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
#
|
||||
# ===----------------------------------------------------------------------===##
|
||||
|
||||
# The code is based on
|
||||
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
|
||||
#
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
import re
|
||||
|
||||
|
||||
@dataclass
class PropertyRange:
    """An inclusive code point range [lower, upper] tagged with a property name."""

    # First code point of the range; -1 until the parser fills it in.
    lower: int = -1
    # Last code point of the range (inclusive); equals `lower` for a single
    # code point. -1 until the parser fills it in.
    upper: int = -1
    # Property name as spelled in the data file; None until the parser fills
    # it in (hence Optional, the default is not a str).
    prop: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class Entry:
    """One row of the generated lookup table, before it is packed into 32 bits."""

    # Lower bound code point of the row.
    lower: int = -1
    # Distance from the lower bound to the (inclusive) upper bound.
    offset: int = -1
    # Index of the property in the sorted list of property names.
    prop: int = -1
|
||||
|
||||
|
||||
# Matches a data line of a Unicode property file:
#   <lower>[..<upper>] ; <property>
# Code points in these files are written with 4 to 6 upper-case hex digits
# (U+0000 .. U+10FFFF). Accepting 6 digits makes sure ranges in planes 16 and
# up are parsed instead of being silently skipped as non-matching lines.
LINE_REGEX = re.compile(
    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
)
|
||||
|
||||
|
||||
def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
    """
    Parses one line of a Unicode property data file.

    Returns the range described by the line, or None when the line does not
    match LINE_REGEX (for example comment and empty lines). A line naming a
    single code point yields a range whose lower and upper bound are equal.
    """
    match = LINE_REGEX.match(inputLine)
    if match is None:
        return None

    first = int(match["lower"], base=16)
    last_text = match["upper"]
    last = first if last_text is None else int(last_text, base=16)
    return PropertyRange(lower=first, upper=last, prop=match["prop"])
|
||||
|
||||
|
||||
def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
    """
    Coalesces runs of adjacent ranges that carry the same property.

    Two neighbouring elements are joined when the later one starts exactly one
    code point after the earlier one ends and both have the same property. The
    earlier element is widened in place and the later one is dropped, so the
    returned list shares (and may modify) the objects that were passed in.

    Fewer ranges result in a smaller generated table, which reduces the binary
    size and improves the lookup performance.
    """
    merged = []
    for current in input:
        previous = merged[-1] if merged else None
        adjacent = (
            previous is not None
            and previous.prop == current.prop
            and previous.upper + 1 == current.lower
        )
        if adjacent:
            previous.upper = current.upper
        else:
            merged.append(current)
    return merged
|
||||
|
||||
|
||||
PROP_VALUE_ENUMERATOR_TEMPLATE = "__{}"
|
||||
PROP_VALUE_ENUM_TEMPLATE = """
|
||||
enum class __property : uint8_t {{
|
||||
// Values generated from the data files.
|
||||
{enumerators},
|
||||
|
||||
// The properties below aren't stored in the "database".
|
||||
|
||||
// Text position properties.
|
||||
__sot,
|
||||
__eot,
|
||||
|
||||
// The code unit has none of above properties.
|
||||
__none
|
||||
}};
|
||||
"""
|
||||
|
||||
DATA_ARRAY_TEMPLATE = """
|
||||
/// The entries of the extended grapheme cluster boundary property table.
|
||||
///
|
||||
/// The data is generated from
|
||||
/// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
|
||||
/// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
|
||||
///
|
||||
/// The data has 3 values
|
||||
/// - bits [0, 3] The property. One of the values generated from the datafiles
|
||||
/// of \\ref __property
|
||||
/// - bits [4, 10] The size of the range.
|
||||
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
|
||||
/// the range is lower bound + size.
|
||||
///
|
||||
/// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
|
||||
/// in the Unicode tables are larger. They are stored in multiple consecutive
|
||||
/// ranges in the data table. An alternative would be to store the sizes in a
|
||||
/// separate 16-bit value. The original MSVC STL code had such an approach, but
|
||||
/// this approach uses less space for the data and is about 4% faster in the
|
||||
/// following benchmark.
|
||||
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
|
||||
inline constexpr uint32_t __entries[{size}] = {{{entries}}};
|
||||
|
||||
/// Returns the extended grapheme cluster boundary property of a code point.
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
|
||||
// TODO FMT use std::ranges::upper_bound.
|
||||
|
||||
// The algorithm searches for the upper bound of the range and, when found,
|
||||
// steps back one entry. This algorithm is used since the code point can be
|
||||
// anywhere in the range. After a lower bound is found the next step is to
|
||||
// compare whether the code unit is indeed in the range.
|
||||
//
|
||||
// Since the entry contains a code unit, size, and property the code point
|
||||
// being sought needs to be adjusted. Just shifting the code point to the
|
||||
// proper position doesn't work; suppose an entry has property 0, size 1,
|
||||
// and lower bound 3. This results in the entry 0x1810.
|
||||
// When searching for code point 3 it will search for 0x1800, find 0x1810
|
||||
// and moves to the previous entry. Thus the lower bound value will never
|
||||
// be found.
|
||||
// The simple solution is to set the bits belonging to the property and
|
||||
// size. Then the upper bound for code point 3 will return the entry after
|
||||
// 0x1810. After moving to the previous entry the algorithm arrives at the
|
||||
// correct entry.
|
||||
ptrdiff_t __i = std::upper_bound(__entries, std::end(__entries), (__code_point << 11) | 0x7ffu) - __entries;
|
||||
if (__i == 0)
|
||||
return __property::__none;
|
||||
|
||||
--__i;
|
||||
uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
|
||||
if (__code_point <= __upper_bound)
|
||||
return static_cast<__property>(__entries[__i] & 0xf);
|
||||
|
||||
return __property::__none;
|
||||
}}
|
||||
"""
|
||||
|
||||
MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
|
||||
// -*- C++ -*-
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// WARNING, this entire header is generated by
|
||||
// utils/generate_extended_grapheme_cluster_table.py
|
||||
// DO NOT MODIFY!
|
||||
|
||||
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
//
|
||||
// See Terms of Use <https://www.unicode.org/copyright.html>
|
||||
// for definitions of Unicode Inc.'s Data Files and Software.
|
||||
//
|
||||
// NOTICE TO USER: Carefully read the following legal agreement.
|
||||
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||||
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
// TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||||
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||||
// THE DATA FILES OR SOFTWARE.
|
||||
//
|
||||
// COPYRIGHT AND PERMISSION NOTICE
|
||||
//
|
||||
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
|
||||
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining
|
||||
// a copy of the Unicode data files and any associated documentation
|
||||
// (the "Data Files") or Unicode software and any associated documentation
|
||||
// (the "Software") to deal in the Data Files or Software
|
||||
// without restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, and/or sell copies of
|
||||
// the Data Files or Software, and to permit persons to whom the Data Files
|
||||
// or Software are furnished to do so, provided that either
|
||||
// (a) this copyright and permission notice appear with all copies
|
||||
// of the Data Files or Software, or
|
||||
// (b) this copyright and permission notice appear in associated
|
||||
// Documentation.
|
||||
//
|
||||
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||||
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||||
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||
//
|
||||
// Except as contained in this notice, the name of a copyright holder
|
||||
// shall not be used in advertising or otherwise to promote the sale,
|
||||
// use or other dealings in these Data Files or Software without prior
|
||||
// written authorization of the copyright holder.
|
||||
|
||||
#ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||||
#define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||||
|
||||
#include <__algorithm/upper_bound.h>
|
||||
#include <__config>
|
||||
#include <__iterator/access.h>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||||
# pragma GCC system_header
|
||||
#endif
|
||||
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
|
||||
#if _LIBCPP_STD_VER > 17
|
||||
|
||||
namespace __extended_grapheme_custer_property_boundary {{
|
||||
|
||||
{content}
|
||||
|
||||
}} // __extended_grapheme_custer_property_boundary
|
||||
|
||||
#endif //_LIBCPP_STD_VER > 17
|
||||
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
|
||||
#endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
||||
"""
|
||||
|
||||
|
||||
def property_ranges_to_table(
    ranges: list[PropertyRange], props: list[str]
) -> list[Entry]:
    """
    Converts property ranges to the rows of the generated lookup table.

    The ranges are processed in ascending order of their lower bound and must
    not overlap. A row stores the distance between its bounds in 7 bits, so a
    range spanning more than 128 code points is emitted as several consecutive
    rows of at most 128 code points each.

    ranges: the ranges to convert; they are only read, never modified.
    props:  the property names; a row stores the index of its property in
            this list.

    Returns the rows ordered by lower bound.
    Raises AssertionError when there are too many properties or when two
    ranges overlap, and ValueError when a range uses a property not in props.
    """
    # The property index has to fit in the 4 bits reserved for it.
    assert len(props) < 16
    result = []
    high = -1
    for item in sorted(ranges, key=lambda x: x.lower):
        # Ranges are visited in ascending order, so a range overlaps an
        # earlier one exactly when it starts at or below the highest upper
        # bound seen so far.
        assert item.lower > high, "overlapping ranges"
        high = item.upper

        prop_index = props.index(item.prop)
        # Walk through the range with a local cursor instead of advancing
        # item.lower; the caller's objects are left untouched.
        lower = item.lower
        while item.upper - lower > 127:
            result.append(Entry(lower, 127, prop_index))
            lower += 128
        result.append(Entry(lower, item.upper - lower, prop_index))
    return result
|
||||
|
||||
|
||||
# A packed table row is written as a zero-padded 32-bit hexadecimal literal.
cpp_entrytemplate = "0x{:08x}"


def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
    """
    Renders the C++ property enum and the packed lookup table for the ranges.

    The enumerators are the distinct property names in sorted order; a table
    row refers to its property by the index in that order. Each row is packed
    as: lower bound shifted left by 11 bits, offset shifted left by 4 bits,
    and the property index in the low bits.

    Returns the enum definition followed by the table definition.
    """
    prop_values = sorted({r.prop for r in ranges})
    table = property_ranges_to_table(ranges, prop_values)

    enumerators = ",".join(
        PROP_VALUE_ENUMERATOR_TEMPLATE.format(value) for value in prop_values
    )
    packed_entries = ",".join(
        cpp_entrytemplate.format((row.lower << 11) | (row.offset << 4) | row.prop)
        for row in table
    )

    enum_text = PROP_VALUE_ENUM_TEMPLATE.format(enumerators=enumerators)
    array_text = DATA_ARRAY_TEMPLATE.format(
        prop_name=prop_name,
        size=len(table),
        entries=packed_entries,
    )
    return enum_text + array_text
|
||||
|
||||
|
||||
def generate_data_tables() -> str:
    """
    Generate Unicode data for inclusion into <format> from
    GraphemeBreakProperty.txt and emoji-data.txt.

    GraphemeBreakProperty.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

    emoji-data.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt

    Both files are expected to be in the same directory as this script.
    """
    script_path = Path(__file__).absolute()

    def load_ranges(file_name):
        # Reads a data file next to this script and returns its compacted
        # ranges; lines that don't describe a range are skipped.
        with script_path.with_name(file_name).open(encoding="utf-8") as f:
            parsed = [parsePropertyLine(line) for line in f]
        return compactPropertyRanges([x for x in parsed if x])

    gbp_ranges = load_ranges("GraphemeBreakProperty.txt")
    emoji_ranges = load_ranges("emoji-data.txt")

    # Of the emoji data only the Extended_Pictographic ranges are added to
    # the table.
    gbp_ranges.extend(x for x in emoji_ranges if x.prop == "Extended_Pictographic")
    return generate_cpp_data("Grapheme_Break", gbp_ranges)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The template literal starts with a line break; strip the leading
    # whitespace so the generated header starts at its first line.
    header_template = MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip()
    print(header_template.format(content=generate_data_tables()))
|
245
libcxx/utils/generate_extended_grapheme_cluster_test.py
Executable file
245
libcxx/utils/generate_extended_grapheme_cluster_test.py
Executable file
@ -0,0 +1,245 @@
|
||||
#!/usr/bin/env python
|
||||
# ===----------------------------------------------------------------------===##
|
||||
#
|
||||
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
# See https://llvm.org/LICENSE.txt for license information.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
#
|
||||
# ===----------------------------------------------------------------------===##
|
||||
|
||||
# The code is based on
|
||||
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
|
||||
#
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, TextIO
|
||||
from array import array
|
||||
|
||||
|
||||
@dataclass
class BreakTestItem:
    """The parsed contents of one line of the grapheme break test data."""

    # The first code point of every extended grapheme cluster on the line.
    code_points: list[int] = field(default_factory=list)
    # All code points of the line written as C++ \UXXXXXXXX escapes.
    encoded: str = ""
    # For every cluster: the number of code units read up to and including
    # that cluster when the text is encoded as UTF-8, UTF-16 and UTF-32.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
|
||||
|
||||
|
||||
class CommentLine:
    """Empty marker type; it has no members of its own."""
|
||||
|
||||
|
||||
class EOF:
    """Empty marker type; it has no members of its own."""
|
||||
|
||||
|
||||
def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """
    Reads one line of the grapheme break test data from the stream.

    A line is a sequence of hexadecimal code points separated by a division
    sign (a break is allowed there) or a multiplication sign (no break), and
    it may end with a '#' comment.

    Returns the parsed item once the end of the line or a comment is reached;
    a line without code points gives an item whose `encoded` is empty.
    Returns None when the end of the stream is reached.
    """
    item = BreakTestItem()
    # First code point of the cluster being read, -1 when no cluster is open.
    cluster_start = -1
    # Number of code units read so far on this line, per encoding.
    units8 = units16 = units32 = 0

    while True:
        ch = input.read(1)

        if ch == "":
            return None
        if ch == "\n":
            return item
        if ch == "#":
            # The remainder of the line is a comment.
            input.readline()
            return item

        if ch == "\N{DIVISION SIGN}":
            # A break closes the open cluster. The line starts with a
            # division sign, at that point there is no cluster to close.
            if cluster_start != -1:
                item.code_points.append(cluster_start)
                cluster_start = -1
                item.breaks_utf8.append(units8)
                item.breaks_utf16.append(units16)
                item.breaks_utf32.append(units32)

            separator = input.read(1)
            assert separator.isspace()
            continue

        if ch == "\N{MULTIPLICATION SIGN}":
            separator = input.read(1)
            assert separator.isspace()
            continue

        if ch.isalnum():
            # Collect the remaining digits of the code point; the character
            # that ends the number is consumed too.
            digits = ch
            while True:
                following = input.read(1)
                if not following:
                    break
                if not following.isalnum():
                    assert following.isspace()
                    break
                digits += following

            value = int(digits, base=16)
            if cluster_start == -1:
                cluster_start = value

            item.encoded += f"\\U{value:08x}"
            text = chr(value)
            units8 += len(text.encode())
            # Only the number of code units matters, not the byte order. The
            # byte order is specified to avoid the BOM.
            units16 += len(text.encode("utf-16-le")) // 2
            units32 += len(text.encode("utf-32-le")) // 4
            continue

        assert False
|
||||
|
||||
|
||||
cpp_template = """// -*- C++ -*-
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// WARNING, this entire header is generated by
|
||||
// utils/generate_extended_grapheme_cluster_test.py
|
||||
// DO NOT MODIFY!
|
||||
|
||||
// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
//
|
||||
// See Terms of Use <https://www.unicode.org/copyright.html>
|
||||
// for definitions of Unicode Inc.'s Data Files and Software.
|
||||
//
|
||||
// NOTICE TO USER: Carefully read the following legal agreement.
|
||||
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||||
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
// TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||||
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||||
// THE DATA FILES OR SOFTWARE.
|
||||
//
|
||||
// COPYRIGHT AND PERMISSION NOTICE
|
||||
//
|
||||
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
|
||||
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining
|
||||
// a copy of the Unicode data files and any associated documentation
|
||||
// (the "Data Files") or Unicode software and any associated documentation
|
||||
// (the "Software") to deal in the Data Files or Software
|
||||
// without restriction, including without limitation the rights to use,
|
||||
// copy, modify, merge, publish, distribute, and/or sell copies of
|
||||
// the Data Files or Software, and to permit persons to whom the Data Files
|
||||
// or Software are furnished to do so, provided that either
|
||||
// (a) this copyright and permission notice appear with all copies
|
||||
// of the Data Files or Software, or
|
||||
// (b) this copyright and permission notice appear in associated
|
||||
// Documentation.
|
||||
//
|
||||
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||||
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||||
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||
//
|
||||
// Except as contained in this notice, the name of a copyright holder
|
||||
// shall not be used in advertising or otherwise to promote the sale,
|
||||
// use or other dealings in these Data Files or Software without prior
|
||||
// written authorization of the copyright holder.
|
||||
|
||||
#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
|
||||
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
|
||||
|
||||
#include <array>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
template <class CharT>
|
||||
struct data {{
|
||||
/// The input to parse.
|
||||
std::basic_string_view<CharT> input;
|
||||
|
||||
/// The first code point of all extended grapheme clusters in the input.
|
||||
std::vector<char32_t> code_points;
|
||||
|
||||
/// The offset of the last code units of the extended grapheme clusters in the input.
|
||||
///
|
||||
/// The vector has the same number of entries as \\ref code_points.
|
||||
std::vector<size_t> breaks;
|
||||
}};
|
||||
|
||||
/// The data for UTF-8.
|
||||
std::array<data<char>, {0}> data_utf8 = {{{{ {1} }}}};
|
||||
|
||||
/// The data for UTF-16.
|
||||
///
|
||||
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
|
||||
/// since the size of the code units differ the breaks can contain different
|
||||
/// values.
|
||||
std::array<data<wchar_t>, {0}> data_utf16 = {{{{ {2} }}}};
|
||||
|
||||
/// The data for UTF-32.
|
||||
///
|
||||
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
|
||||
/// since the size of the code units differ the breaks can contain different
|
||||
/// values.
|
||||
std::array<data<wchar_t>, {0}> data_utf32 = {{{{ {3} }}}};
|
||||
|
||||
#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
|
||||
"""
|
||||
|
||||
# One element of the generated arrays: { input, { code points }, { breaks } }.
cpp_test_data_line_template = "{{ {}, {{ {} }}, {{ {} }} }}"


def _formatCppDataLine(
    literal_prefix: str, line: BreakTestItem, breaks: list[int]
) -> str:
    """Formats one array element using the given literal prefix and breaks."""
    return cpp_test_data_line_template.format(
        f'{literal_prefix}"{line.encoded}"',
        ",".join(map(str, line.code_points)),
        ",".join(map(str, breaks)),
    )


def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Returns the array element for the UTF-8 data (narrow string literal)."""
    return _formatCppDataLine("", line, line.breaks_utf8)


def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Returns the array element for the UTF-16 data (wide string literal)."""
    return _formatCppDataLine("L", line, line.breaks_utf16)


def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Returns the array element for the UTF-32 data (wide string literal)."""
    return _formatCppDataLine("L", line, line.breaks_utf32)
|
||||
|
||||
|
||||
"""
|
||||
Generate test data from "GraphemeBreakTest.txt"
|
||||
This file can be downloaded from:
|
||||
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
|
||||
This script looks for GraphemeBreakTest.txt in same directory as this script
|
||||
"""
|
||||
|
||||
|
||||
def generate_all() -> str:
    """
    Builds the complete generated test header.

    Reads GraphemeBreakTest.txt from the directory containing this script,
    keeps the lines that contain code points, and fills them into the header
    template once per encoding.
    """
    test_data_path = Path(__file__).absolute().with_name("GraphemeBreakTest.txt")

    items = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        # None marks the end of the file; lines without code points (for
        # example comment lines) give an item with an empty encoding.
        while (item := parseBreakTestLine(file)) is not None:
            if item.encoded:
                items.append(item)

    utf8_data = ",".join([lineToCppDataLineUtf8(item) for item in items])
    utf16_data = ",".join([lineToCppDataLineUtf16(item) for item in items])
    utf32_data = ",".join([lineToCppDataLineUtf32(item) for item in items])
    return cpp_template.format(len(items), utf8_data, utf16_data, utf32_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Write the generated header to standard output.
    generated_header = generate_all()
    print(generated_header)
|
Loading…
Reference in New Issue
Block a user