From e522533f4ec0b4b8de8fe0af0ce4fb494bbf9139 Mon Sep 17 00:00:00 2001 From: Ting-Yu Lin Date: Thu, 13 Jan 2022 18:36:03 +0000 Subject: [PATCH] Bug 1745113 Part 1 - Move ClusterIterator into Segmenter.h, and rename it. r=necko-reviewers,kershaw This patch doesn't change the behavior. Just move the code around. Differential Revision: https://phabricator.services.mozilla.com/D135639 --- gfx/thebes/gfxFont.cpp | 5 +- intl/lwbrk/LineBreaker.cpp | 2 +- intl/lwbrk/Segmenter.cpp | 113 ++++++++++++++++++ intl/lwbrk/Segmenter.h | 29 +++++ intl/unicharutil/util/nsUnicodeProperties.cpp | 111 +---------------- intl/unicharutil/util/nsUnicodeProperties.h | 28 ----- layout/forms/nsFileControlFrame.cpp | 5 +- layout/xul/nsTextBoxFrame.cpp | 8 +- .../converters/mozTXTToHTMLConv.cpp | 8 +- 9 files changed, 161 insertions(+), 148 deletions(-) diff --git a/gfx/thebes/gfxFont.cpp b/gfx/thebes/gfxFont.cpp index 8685523393e4..4d45116f5d73 100644 --- a/gfx/thebes/gfxFont.cpp +++ b/gfx/thebes/gfxFont.cpp @@ -10,6 +10,7 @@ #include "mozilla/FontPropertyTypes.h" #include "mozilla/gfx/2D.h" #include "mozilla/IntegerRange.h" +#include "mozilla/intl/Segmenter.h" #include "mozilla/MathAlgorithms.h" #include "mozilla/StaticPrefs_gfx.h" #include "mozilla/SVGContextPaint.h" @@ -582,9 +583,9 @@ void gfxShapedText::SetupClusterBoundaries(uint32_t aOffset, CompressedGlyph extendCluster = CompressedGlyph::MakeComplex(false, true); const char16_t* const stringStart = aString; - ClusterIterator iter(aString, aLength); + intl::GraphemeClusterBreakIteratorUtf16 iter(aString, aLength); - // the ClusterIterator won't be able to tell us if the string + // GraphemeClusterBreakIteratorUtf16 won't be able to tell us if the string // _begins_ with a cluster-extender, so we handle that here if (aLength) { uint32_t ch = *aString; diff --git a/intl/lwbrk/LineBreaker.cpp b/intl/lwbrk/LineBreaker.cpp index 45c073b7bb3f..61beef408e68 100644 --- a/intl/lwbrk/LineBreaker.cpp +++ b/intl/lwbrk/LineBreaker.cpp @@ -1090,7 +1090,7 @@ void LineBreaker::ComputeBreakPositions( if (aWordBreak == WordBreakRule::BreakAll) { // For break-all, we don't need to run a dictionary-based breaking // algorithm, we just allow breaks between all grapheme clusters. - ClusterIterator ci(aChars + cur, end - cur); + GraphemeClusterBreakIteratorUtf16 ci(aChars + cur, end - cur); while (!ci.AtEnd()) { ci.Next(); aBreakBefore[ci - aChars] = true; diff --git a/intl/lwbrk/Segmenter.cpp b/intl/lwbrk/Segmenter.cpp index aa88c71dde01..413fb182621b 100644 --- a/intl/lwbrk/Segmenter.cpp +++ b/intl/lwbrk/Segmenter.cpp @@ -10,6 +10,11 @@ #include "mozilla/intl/LineBreaker.h" #include "mozilla/intl/WordBreaker.h" +#include "mozilla/intl/UnicodeProperties.h" +#include "nsUnicodeProperties.h" +#include "nsCharTraits.h" + +using namespace mozilla::unicode; namespace mozilla::intl { @@ -50,6 +55,114 @@ Maybe WordBreakIteratorUtf16::Next() { return Some(mPos); } +enum HSType { + HST_NONE = U_HST_NOT_APPLICABLE, + HST_L = U_HST_LEADING_JAMO, + HST_V = U_HST_VOWEL_JAMO, + HST_T = U_HST_TRAILING_JAMO, + HST_LV = U_HST_LV_SYLLABLE, + HST_LVT = U_HST_LVT_SYLLABLE +}; + +static HSType GetHangulSyllableType(uint32_t aCh) { + return HSType(UnicodeProperties::GetIntPropertyValue( + aCh, UnicodeProperties::IntProperty::HangulSyllableType)); +} + +void GraphemeClusterBreakIteratorUtf16::Next() { + if (AtEnd()) { + NS_WARNING("ClusterIterator has already reached the end"); + return; + } + + uint32_t ch = *mPos++; + + if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) { + ch = SURROGATE_TO_UCS4(ch, *mPos++); + } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) || + (ch >= 0xac00 && ch <= 0xd7ff)) { + // Handle conjoining Jamo that make Hangul syllables + HSType hangulState = GetHangulSyllableType(ch); + while (mPos < mLimit) { + ch = *mPos; + HSType hangulType = GetHangulSyllableType(ch); + switch (hangulType) { + case HST_L: + case HST_LV: + case HST_LVT: + if (hangulState == HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_V: + if ((hangulState != HST_NONE) && (hangulState != HST_T) && + (hangulState != HST_LVT)) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_T: + if (hangulState != HST_NONE && hangulState != HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + default: + break; + } + break; + } + } + + const uint32_t kVS16 = 0xfe0f; + const uint32_t kZWJ = 0x200d; + // UTF-16 surrogate values for Fitzpatrick type modifiers + const uint32_t kFitzpatrickHigh = 0xD83C; + const uint32_t kFitzpatrickLowFirst = 0xDFFB; + const uint32_t kFitzpatrickLowLast = 0xDFFF; + + bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) || + (GetEmojiPresentation(ch) == TextDefault && + ((mPos < mLimit && *mPos == kVS16) || + (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh && + *(mPos + 1) >= kFitzpatrickLowFirst && + *(mPos + 1) <= kFitzpatrickLowLast))); + bool prevWasZwj = false; + + while (mPos < mLimit) { + ch = *mPos; + size_t chLen = 1; + + // Check for surrogate pairs; note that isolated surrogates will just + // be treated as generic (non-cluster-extending) characters here, + // which is fine for cluster-iterating purposes + if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) { + ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); + chLen = 2; + } + + bool extendCluster = + IsClusterExtender(ch) || + (baseIsEmoji && prevWasZwj && + ((GetEmojiPresentation(ch) == EmojiDefault) || + (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit && + *(mPos + chLen) == kVS16))); + if (!extendCluster) { + break; + } + + prevWasZwj = (ch == kZWJ); + mPos += chLen; + } + + NS_ASSERTION(mText < mPos && mPos <= mLimit, + "ClusterIterator::Next has overshot the string!"); +} + Result, ICUError> Segmenter::TryCreate( Span aLocale, const SegmenterOptions& aOptions) { if (aOptions.mGranularity == SegmenterGranularity::Grapheme || diff --git a/intl/lwbrk/Segmenter.h b/intl/lwbrk/Segmenter.h index 52c0734aee90..26fb8458b7f1 100644 --- a/intl/lwbrk/Segmenter.h +++ b/intl/lwbrk/Segmenter.h @@ -121,6 +121,35 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { Maybe Next() override; }; +/** + * Grapheme cluster break iterator for UTF-16 text. + */ +class GraphemeClusterBreakIteratorUtf16 { + public: + GraphemeClusterBreakIteratorUtf16(const char16_t* aText, uint32_t aLength) + : mPos(aText), + mLimit(aText + aLength) +#ifdef DEBUG + , + mText(aText) +#endif + { + } + + operator const char16_t*() const { return mPos; } + + bool AtEnd() const { return mPos >= mLimit; } + + void Next(); + + private: + const char16_t* mPos; + const char16_t* mLimit; +#ifdef DEBUG + const char16_t* mText; +#endif +}; + /** * This component is a Mozilla-focused API for working with segmenters in * internationalization code. diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp index 69edf03e4206..2acc0f0296d1 100644 --- a/intl/unicharutil/util/nsUnicodeProperties.cpp +++ b/intl/unicharutil/util/nsUnicodeProperties.cpp @@ -9,6 +9,7 @@ #include "mozilla/ArrayUtils.h" #include "mozilla/HashTable.h" +#include "mozilla/intl/Segmenter.h" #include "nsCharTraits.h" #include "BaseChars.h" @@ -167,114 +168,6 @@ bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) { (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters } -enum HSType { - HST_NONE = U_HST_NOT_APPLICABLE, - HST_L = U_HST_LEADING_JAMO, - HST_V = U_HST_VOWEL_JAMO, - HST_T = U_HST_TRAILING_JAMO, - HST_LV = U_HST_LV_SYLLABLE, - HST_LVT = U_HST_LVT_SYLLABLE -}; - -static HSType GetHangulSyllableType(uint32_t aCh) { - return HSType(intl::UnicodeProperties::GetIntPropertyValue( - aCh, intl::UnicodeProperties::IntProperty::HangulSyllableType)); -} - -void ClusterIterator::Next() { - if (AtEnd()) { - NS_WARNING("ClusterIterator has already reached the end"); - return; - } - - uint32_t ch = *mPos++; - - if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) { - ch = SURROGATE_TO_UCS4(ch, *mPos++); - } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) || - (ch >= 0xac00 && ch <= 0xd7ff)) { - // Handle conjoining Jamo that make Hangul syllables - HSType hangulState = GetHangulSyllableType(ch); - while (mPos < mLimit) { - ch = *mPos; - HSType hangulType = GetHangulSyllableType(ch); - switch (hangulType) { - case HST_L: - case HST_LV: - case HST_LVT: - if (hangulState == HST_L) { - hangulState = hangulType; - mPos++; - continue; - } - break; - case HST_V: - if ((hangulState != HST_NONE) && (hangulState != HST_T) && - (hangulState != HST_LVT)) { - hangulState = hangulType; - mPos++; - continue; - } - break; - case HST_T: - if (hangulState != HST_NONE && hangulState != HST_L) { - hangulState = hangulType; - mPos++; - continue; - } - break; - default: - break; - } - break; - } - } - - const uint32_t kVS16 = 0xfe0f; - const uint32_t kZWJ = 0x200d; - // UTF-16 surrogate values for Fitzpatrick type modifiers - const uint32_t kFitzpatrickHigh = 0xD83C; - const uint32_t kFitzpatrickLowFirst = 0xDFFB; - const uint32_t kFitzpatrickLowLast = 0xDFFF; - - bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) || - (GetEmojiPresentation(ch) == TextDefault && - ((mPos < mLimit && *mPos == kVS16) || - (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh && - *(mPos + 1) >= kFitzpatrickLowFirst && - *(mPos + 1) <= kFitzpatrickLowLast))); - bool prevWasZwj = false; - - while (mPos < mLimit) { - ch = *mPos; - size_t chLen = 1; - - // Check for surrogate pairs; note that isolated surrogates will just - // be treated as generic (non-cluster-extending) characters here, - // which is fine for cluster-iterating purposes - if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) { - ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); - chLen = 2; - } - - bool extendCluster = - IsClusterExtender(ch) || - (baseIsEmoji && prevWasZwj && - ((GetEmojiPresentation(ch) == EmojiDefault) || - (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit && - *(mPos + chLen) == kVS16))); - if (!extendCluster) { - break; - } - - prevWasZwj = (ch == kZWJ); - mPos += chLen; - } - - NS_ASSERTION(mText < mPos && mPos <= mLimit, - "ClusterIterator::Next has overshot the string!"); -} - void ClusterReverseIterator::Next() { if (AtEnd()) { NS_WARNING("ClusterReverseIterator has already reached the end"); @@ -301,7 +194,7 @@ void ClusterReverseIterator::Next() { } uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) { - ClusterIterator iter(aText, aLength); + intl::GraphemeClusterBreakIteratorUtf16 iter(aText, aLength); uint32_t result = 0; while (!iter.AtEnd()) { ++result; diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h index e2d42a4922ea..a1471ca40bb9 100644 --- a/intl/unicharutil/util/nsUnicodeProperties.h +++ b/intl/unicharutil/util/nsUnicodeProperties.h @@ -158,34 +158,6 @@ inline bool IsClusterExtender(uint32_t aCh) { return IsClusterExtender(aCh, GetGeneralCategory(aCh)); } -// A simple iterator for a string of char16_t codepoints that advances -// by Unicode grapheme clusters -class ClusterIterator { - public: - ClusterIterator(const char16_t* aText, uint32_t aLength) - : mPos(aText), - mLimit(aText + aLength) -#ifdef DEBUG - , - mText(aText) -#endif - { - } - - operator const char16_t*() const { return mPos; } - - bool AtEnd() const { return mPos >= mLimit; } - - void Next(); - - private: - const char16_t* mPos; - const char16_t* mLimit; -#ifdef DEBUG - const char16_t* mText; -#endif -}; - // Count the number of grapheme clusters in the given string uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength); diff --git a/layout/forms/nsFileControlFrame.cpp b/layout/forms/nsFileControlFrame.cpp index cf7fbabe5153..26c0a3382adc 100644 --- a/layout/forms/nsFileControlFrame.cpp +++ b/layout/forms/nsFileControlFrame.cpp @@ -21,6 +21,7 @@ #include "mozilla/dom/HTMLButtonElement.h" #include "mozilla/dom/HTMLInputElement.h" #include "mozilla/dom/MutationEventBinding.h" +#include "mozilla/intl/Segmenter.h" #include "mozilla/Preferences.h" #include "mozilla/PresShell.h" #include "mozilla/StaticPrefs_dom.h" @@ -87,9 +88,9 @@ bool nsFileControlFrame::CropTextToWidth(gfxContext& aRenderingContext, // determine how much of the string will fit in the max width nscoord totalWidth = textWidth; - using mozilla::unicode::ClusterIterator; using mozilla::unicode::ClusterReverseIterator; - ClusterIterator leftIter(aText.Data(), aText.Length()); + intl::GraphemeClusterBreakIteratorUtf16 leftIter(aText.Data(), + aText.Length()); ClusterReverseIterator rightIter(aText.Data(), aText.Length()); const char16_t* leftPos = leftIter; const char16_t* rightPos = rightIter; diff --git a/layout/xul/nsTextBoxFrame.cpp b/layout/xul/nsTextBoxFrame.cpp index a9f653a197e3..5f5aa798b5ac 100644 --- a/layout/xul/nsTextBoxFrame.cpp +++ b/layout/xul/nsTextBoxFrame.cpp @@ -13,6 +13,7 @@ #include "mozilla/ComputedStyle.h" #include "mozilla/Preferences.h" #include "mozilla/PresShell.h" +#include "mozilla/intl/Segmenter.h" #include "mozilla/layers/RenderRootStateManager.h" #include "mozilla/gfx/2D.h" #include "nsFontMetrics.h" @@ -614,7 +615,7 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext, titleWidth = 0; } - using mozilla::unicode::ClusterIterator; + using mozilla::intl::GraphemeClusterBreakIteratorUtf16; using mozilla::unicode::ClusterReverseIterator; // ok crop things @@ -622,7 +623,7 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext, case CropAuto: case CropNone: case CropRight: { - ClusterIterator iter(mTitle.Data(), mTitle.Length()); + GraphemeClusterBreakIteratorUtf16 iter(mTitle.Data(), mTitle.Length()); const char16_t* dataBegin = iter; const char16_t* pos = dataBegin; nscoord charWidth; @@ -700,7 +701,8 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext, // determine how much of the string will fit in the max width nscoord charWidth = 0; nscoord totalWidth = 0; - ClusterIterator leftIter(mTitle.Data(), mTitle.Length()); + GraphemeClusterBreakIteratorUtf16 leftIter(mTitle.Data(), + mTitle.Length()); ClusterReverseIterator rightIter(mTitle.Data(), mTitle.Length()); const char16_t* dataBegin = leftIter; const char16_t* dataEnd = rightIter; diff --git a/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp b/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp index 1d09d44c34cd..9914fb0dbde5 100644 --- a/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp +++ b/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp @@ -5,6 +5,7 @@ #include "mozilla/TextUtils.h" #include "mozTXTToHTMLConv.h" +#include "mozilla/intl/Segmenter.h" #include "nsNetUtil.h" #include "nsUnicharUtils.h" #include "nsUnicodeProperties.h" @@ -22,6 +23,7 @@ using mozilla::IsAscii; using mozilla::IsAsciiAlpha; using mozilla::IsAsciiDigit; +using mozilla::intl::GraphemeClusterBreakIteratorUtf16; const double growthRate = 1.2; @@ -557,7 +559,7 @@ bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString, // find length of the char/cluster to be ignored int32_t ignoreLen = before == LT_IGNORE ? 0 : 1; if (ignoreLen) { - mozilla::unicode::ClusterIterator ci(aInString, aInLength); + GraphemeClusterBreakIteratorUtf16 ci(aInString, aInLength); ci.Next(); ignoreLen = ci - aInString; } @@ -591,7 +593,7 @@ uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString, uint32_t result = 0; const char16_t* end = aInString + aInStringLength; - for (mozilla::unicode::ClusterIterator ci(aInString, aInStringLength); + for (GraphemeClusterBreakIteratorUtf16 ci(aInString, aInStringLength); !ci.AtEnd(); ci.Next()) { if (ItMatchesDelimited(ci, end - ci, rep, aRepLen, before, after)) { result++; @@ -979,7 +981,7 @@ mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo, const char16_t* rawInputString = aInString.BeginReading(); uint32_t inLength = aInString.Length(); - for (mozilla::unicode::ClusterIterator ci(rawInputString, inLength); + for (GraphemeClusterBreakIteratorUtf16 ci(rawInputString, inLength); !ci.AtEnd();) { uint32_t i = ci - rawInputString; if (doGlyphSubstitution) {