mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-02-17 14:25:49 +00:00
Bug 1745113 Part 1 - Move ClusterIterator into Segmenter.h, and rename it. r=necko-reviewers,kershaw
This patch doesn't change any behavior; it only moves the code around. Differential Revision: https://phabricator.services.mozilla.com/D135639
This commit is contained in:
parent
5237bb3b74
commit
e522533f4e
@ -10,6 +10,7 @@
|
||||
#include "mozilla/FontPropertyTypes.h"
|
||||
#include "mozilla/gfx/2D.h"
|
||||
#include "mozilla/IntegerRange.h"
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
#include "mozilla/MathAlgorithms.h"
|
||||
#include "mozilla/StaticPrefs_gfx.h"
|
||||
#include "mozilla/SVGContextPaint.h"
|
||||
@ -582,9 +583,9 @@ void gfxShapedText::SetupClusterBoundaries(uint32_t aOffset,
|
||||
CompressedGlyph extendCluster = CompressedGlyph::MakeComplex(false, true);
|
||||
|
||||
const char16_t* const stringStart = aString;
|
||||
ClusterIterator iter(aString, aLength);
|
||||
intl::GraphemeClusterBreakIteratorUtf16 iter(aString, aLength);
|
||||
|
||||
// the ClusterIterator won't be able to tell us if the string
|
||||
// GraphemeClusterBreakIteratorUtf16 won't be able to tell us if the string
|
||||
// _begins_ with a cluster-extender, so we handle that here
|
||||
if (aLength) {
|
||||
uint32_t ch = *aString;
|
||||
|
@ -1090,7 +1090,7 @@ void LineBreaker::ComputeBreakPositions(
|
||||
if (aWordBreak == WordBreakRule::BreakAll) {
|
||||
// For break-all, we don't need to run a dictionary-based breaking
|
||||
// algorithm, we just allow breaks between all grapheme clusters.
|
||||
ClusterIterator ci(aChars + cur, end - cur);
|
||||
GraphemeClusterBreakIteratorUtf16 ci(aChars + cur, end - cur);
|
||||
while (!ci.AtEnd()) {
|
||||
ci.Next();
|
||||
aBreakBefore[ci - aChars] = true;
|
||||
|
@ -10,6 +10,11 @@
|
||||
|
||||
#include "mozilla/intl/LineBreaker.h"
|
||||
#include "mozilla/intl/WordBreaker.h"
|
||||
#include "mozilla/intl/UnicodeProperties.h"
|
||||
#include "nsUnicodeProperties.h"
|
||||
#include "nsCharTraits.h"
|
||||
|
||||
using namespace mozilla::unicode;
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
@ -50,6 +55,114 @@ Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
||||
return Some(mPos);
|
||||
}
|
||||
|
||||
enum HSType {
|
||||
HST_NONE = U_HST_NOT_APPLICABLE,
|
||||
HST_L = U_HST_LEADING_JAMO,
|
||||
HST_V = U_HST_VOWEL_JAMO,
|
||||
HST_T = U_HST_TRAILING_JAMO,
|
||||
HST_LV = U_HST_LV_SYLLABLE,
|
||||
HST_LVT = U_HST_LVT_SYLLABLE
|
||||
};
|
||||
|
||||
// Returns the HangulSyllableType property of the code point aCh, converted
// to the local HSType enum.
static HSType GetHangulSyllableType(uint32_t aCh) {
  const auto rawType = UnicodeProperties::GetIntPropertyValue(
      aCh, UnicodeProperties::IntProperty::HangulSyllableType);
  return static_cast<HSType>(rawType);
}
|
||||
|
||||
void GraphemeClusterBreakIteratorUtf16::Next() {
|
||||
if (AtEnd()) {
|
||||
NS_WARNING("ClusterIterator has already reached the end");
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t ch = *mPos++;
|
||||
|
||||
if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) {
|
||||
ch = SURROGATE_TO_UCS4(ch, *mPos++);
|
||||
} else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
|
||||
(ch >= 0xac00 && ch <= 0xd7ff)) {
|
||||
// Handle conjoining Jamo that make Hangul syllables
|
||||
HSType hangulState = GetHangulSyllableType(ch);
|
||||
while (mPos < mLimit) {
|
||||
ch = *mPos;
|
||||
HSType hangulType = GetHangulSyllableType(ch);
|
||||
switch (hangulType) {
|
||||
case HST_L:
|
||||
case HST_LV:
|
||||
case HST_LVT:
|
||||
if (hangulState == HST_L) {
|
||||
hangulState = hangulType;
|
||||
mPos++;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case HST_V:
|
||||
if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
|
||||
(hangulState != HST_LVT)) {
|
||||
hangulState = hangulType;
|
||||
mPos++;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case HST_T:
|
||||
if (hangulState != HST_NONE && hangulState != HST_L) {
|
||||
hangulState = hangulType;
|
||||
mPos++;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t kVS16 = 0xfe0f;
|
||||
const uint32_t kZWJ = 0x200d;
|
||||
// UTF-16 surrogate values for Fitzpatrick type modifiers
|
||||
const uint32_t kFitzpatrickHigh = 0xD83C;
|
||||
const uint32_t kFitzpatrickLowFirst = 0xDFFB;
|
||||
const uint32_t kFitzpatrickLowLast = 0xDFFF;
|
||||
|
||||
bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
|
||||
(GetEmojiPresentation(ch) == TextDefault &&
|
||||
((mPos < mLimit && *mPos == kVS16) ||
|
||||
(mPos + 1 < mLimit && *mPos == kFitzpatrickHigh &&
|
||||
*(mPos + 1) >= kFitzpatrickLowFirst &&
|
||||
*(mPos + 1) <= kFitzpatrickLowLast)));
|
||||
bool prevWasZwj = false;
|
||||
|
||||
while (mPos < mLimit) {
|
||||
ch = *mPos;
|
||||
size_t chLen = 1;
|
||||
|
||||
// Check for surrogate pairs; note that isolated surrogates will just
|
||||
// be treated as generic (non-cluster-extending) characters here,
|
||||
// which is fine for cluster-iterating purposes
|
||||
if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) {
|
||||
ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
|
||||
chLen = 2;
|
||||
}
|
||||
|
||||
bool extendCluster =
|
||||
IsClusterExtender(ch) ||
|
||||
(baseIsEmoji && prevWasZwj &&
|
||||
((GetEmojiPresentation(ch) == EmojiDefault) ||
|
||||
(GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit &&
|
||||
*(mPos + chLen) == kVS16)));
|
||||
if (!extendCluster) {
|
||||
break;
|
||||
}
|
||||
|
||||
prevWasZwj = (ch == kZWJ);
|
||||
mPos += chLen;
|
||||
}
|
||||
|
||||
NS_ASSERTION(mText < mPos && mPos <= mLimit,
|
||||
"ClusterIterator::Next has overshot the string!");
|
||||
}
|
||||
|
||||
Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
|
||||
Span<const char> aLocale, const SegmenterOptions& aOptions) {
|
||||
if (aOptions.mGranularity == SegmenterGranularity::Grapheme ||
|
||||
|
@ -121,6 +121,35 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
Maybe<uint32_t> Next() override;
|
||||
};
|
||||
|
||||
/**
 * Grapheme cluster break iterator for UTF-16 text.
 *
 * The iterator keeps raw pointers into the caller's buffer and makes no copy
 * of it. It starts at the beginning of the buffer; each call to Next() moves
 * the current position forward, and AtEnd() reports when the position has
 * reached the end of the buffer.
 */
class GraphemeClusterBreakIteratorUtf16 {
 public:
  // Iterates over the aLength code units starting at aText.
  GraphemeClusterBreakIteratorUtf16(const char16_t* aText, uint32_t aLength)
#ifdef DEBUG
      : mPos(aText), mLimit(aText + aLength), mText(aText) {
  }
#else
      : mPos(aText), mLimit(aText + aLength) {
  }
#endif

  // Implicit conversion yielding the current position within the text.
  operator const char16_t*() const { return mPos; }

  // True once the current position has reached (or passed) the limit.
  bool AtEnd() const { return mLimit <= mPos; }

  // Advances the current position; defined out of line.
  void Next();

 private:
  // Current position.
  const char16_t* mPos;
  // One past the last code unit of the text (aText + aLength).
  const char16_t* mLimit;
#ifdef DEBUG
  // Start of the text; only present in DEBUG builds.
  const char16_t* mText;
#endif
};
|
||||
|
||||
/**
|
||||
* This component is a Mozilla-focused API for working with segmenters in
|
||||
* internationalization code.
|
||||
|
@ -9,6 +9,7 @@
|
||||
|
||||
#include "mozilla/ArrayUtils.h"
|
||||
#include "mozilla/HashTable.h"
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
#include "nsCharTraits.h"
|
||||
|
||||
#include "BaseChars.h"
|
||||
@ -167,114 +168,6 @@ bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
|
||||
(aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters
|
||||
}
|
||||
|
||||
enum HSType {
|
||||
HST_NONE = U_HST_NOT_APPLICABLE,
|
||||
HST_L = U_HST_LEADING_JAMO,
|
||||
HST_V = U_HST_VOWEL_JAMO,
|
||||
HST_T = U_HST_TRAILING_JAMO,
|
||||
HST_LV = U_HST_LV_SYLLABLE,
|
||||
HST_LVT = U_HST_LVT_SYLLABLE
|
||||
};
|
||||
|
||||
// Returns the HangulSyllableType property of the code point aCh, converted
// to the local HSType enum.
static HSType GetHangulSyllableType(uint32_t aCh) {
  const auto rawType = intl::UnicodeProperties::GetIntPropertyValue(
      aCh, intl::UnicodeProperties::IntProperty::HangulSyllableType);
  return static_cast<HSType>(rawType);
}
|
||||
|
||||
void ClusterIterator::Next() {
|
||||
if (AtEnd()) {
|
||||
NS_WARNING("ClusterIterator has already reached the end");
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t ch = *mPos++;
|
||||
|
||||
if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) {
|
||||
ch = SURROGATE_TO_UCS4(ch, *mPos++);
|
||||
} else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
|
||||
(ch >= 0xac00 && ch <= 0xd7ff)) {
|
||||
// Handle conjoining Jamo that make Hangul syllables
|
||||
HSType hangulState = GetHangulSyllableType(ch);
|
||||
while (mPos < mLimit) {
|
||||
ch = *mPos;
|
||||
HSType hangulType = GetHangulSyllableType(ch);
|
||||
switch (hangulType) {
|
||||
case HST_L:
|
||||
case HST_LV:
|
||||
case HST_LVT:
|
||||
if (hangulState == HST_L) {
|
||||
hangulState = hangulType;
|
||||
mPos++;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case HST_V:
|
||||
if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
|
||||
(hangulState != HST_LVT)) {
|
||||
hangulState = hangulType;
|
||||
mPos++;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case HST_T:
|
||||
if (hangulState != HST_NONE && hangulState != HST_L) {
|
||||
hangulState = hangulType;
|
||||
mPos++;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t kVS16 = 0xfe0f;
|
||||
const uint32_t kZWJ = 0x200d;
|
||||
// UTF-16 surrogate values for Fitzpatrick type modifiers
|
||||
const uint32_t kFitzpatrickHigh = 0xD83C;
|
||||
const uint32_t kFitzpatrickLowFirst = 0xDFFB;
|
||||
const uint32_t kFitzpatrickLowLast = 0xDFFF;
|
||||
|
||||
bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
|
||||
(GetEmojiPresentation(ch) == TextDefault &&
|
||||
((mPos < mLimit && *mPos == kVS16) ||
|
||||
(mPos + 1 < mLimit && *mPos == kFitzpatrickHigh &&
|
||||
*(mPos + 1) >= kFitzpatrickLowFirst &&
|
||||
*(mPos + 1) <= kFitzpatrickLowLast)));
|
||||
bool prevWasZwj = false;
|
||||
|
||||
while (mPos < mLimit) {
|
||||
ch = *mPos;
|
||||
size_t chLen = 1;
|
||||
|
||||
// Check for surrogate pairs; note that isolated surrogates will just
|
||||
// be treated as generic (non-cluster-extending) characters here,
|
||||
// which is fine for cluster-iterating purposes
|
||||
if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) {
|
||||
ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
|
||||
chLen = 2;
|
||||
}
|
||||
|
||||
bool extendCluster =
|
||||
IsClusterExtender(ch) ||
|
||||
(baseIsEmoji && prevWasZwj &&
|
||||
((GetEmojiPresentation(ch) == EmojiDefault) ||
|
||||
(GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit &&
|
||||
*(mPos + chLen) == kVS16)));
|
||||
if (!extendCluster) {
|
||||
break;
|
||||
}
|
||||
|
||||
prevWasZwj = (ch == kZWJ);
|
||||
mPos += chLen;
|
||||
}
|
||||
|
||||
NS_ASSERTION(mText < mPos && mPos <= mLimit,
|
||||
"ClusterIterator::Next has overshot the string!");
|
||||
}
|
||||
|
||||
void ClusterReverseIterator::Next() {
|
||||
if (AtEnd()) {
|
||||
NS_WARNING("ClusterReverseIterator has already reached the end");
|
||||
@ -301,7 +194,7 @@ void ClusterReverseIterator::Next() {
|
||||
}
|
||||
|
||||
uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
|
||||
ClusterIterator iter(aText, aLength);
|
||||
intl::GraphemeClusterBreakIteratorUtf16 iter(aText, aLength);
|
||||
uint32_t result = 0;
|
||||
while (!iter.AtEnd()) {
|
||||
++result;
|
||||
|
@ -158,34 +158,6 @@ inline bool IsClusterExtender(uint32_t aCh) {
|
||||
return IsClusterExtender(aCh, GetGeneralCategory(aCh));
|
||||
}
|
||||
|
||||
// A simple iterator over a buffer of char16_t code units that advances by
// Unicode grapheme clusters. It keeps raw pointers into the caller's buffer
// and makes no copy of the text.
class ClusterIterator {
 public:
  // Iterates over the aLength code units starting at aText.
  ClusterIterator(const char16_t* aText, uint32_t aLength)
#ifdef DEBUG
      : mPos(aText), mLimit(aText + aLength), mText(aText) {
  }
#else
      : mPos(aText), mLimit(aText + aLength) {
  }
#endif

  // Implicit conversion yielding the current position within the text.
  operator const char16_t*() const { return mPos; }

  // True once the current position has reached (or passed) the limit.
  bool AtEnd() const { return mLimit <= mPos; }

  // Advances the current position; defined out of line.
  void Next();

 private:
  // Current position.
  const char16_t* mPos;
  // One past the last code unit of the text (aText + aLength).
  const char16_t* mLimit;
#ifdef DEBUG
  // Start of the text; only present in DEBUG builds.
  const char16_t* mText;
#endif
};
|
||||
|
||||
// Count the number of grapheme clusters in the given string
|
||||
uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "mozilla/dom/HTMLButtonElement.h"
|
||||
#include "mozilla/dom/HTMLInputElement.h"
|
||||
#include "mozilla/dom/MutationEventBinding.h"
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
#include "mozilla/Preferences.h"
|
||||
#include "mozilla/PresShell.h"
|
||||
#include "mozilla/StaticPrefs_dom.h"
|
||||
@ -87,9 +88,9 @@ bool nsFileControlFrame::CropTextToWidth(gfxContext& aRenderingContext,
|
||||
|
||||
// determine how much of the string will fit in the max width
|
||||
nscoord totalWidth = textWidth;
|
||||
using mozilla::unicode::ClusterIterator;
|
||||
using mozilla::unicode::ClusterReverseIterator;
|
||||
ClusterIterator leftIter(aText.Data(), aText.Length());
|
||||
intl::GraphemeClusterBreakIteratorUtf16 leftIter(aText.Data(),
|
||||
aText.Length());
|
||||
ClusterReverseIterator rightIter(aText.Data(), aText.Length());
|
||||
const char16_t* leftPos = leftIter;
|
||||
const char16_t* rightPos = rightIter;
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "mozilla/ComputedStyle.h"
|
||||
#include "mozilla/Preferences.h"
|
||||
#include "mozilla/PresShell.h"
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
#include "mozilla/layers/RenderRootStateManager.h"
|
||||
#include "mozilla/gfx/2D.h"
|
||||
#include "nsFontMetrics.h"
|
||||
@ -614,7 +615,7 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext,
|
||||
titleWidth = 0;
|
||||
}
|
||||
|
||||
using mozilla::unicode::ClusterIterator;
|
||||
using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
|
||||
using mozilla::unicode::ClusterReverseIterator;
|
||||
|
||||
// ok crop things
|
||||
@ -622,7 +623,7 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext,
|
||||
case CropAuto:
|
||||
case CropNone:
|
||||
case CropRight: {
|
||||
ClusterIterator iter(mTitle.Data(), mTitle.Length());
|
||||
GraphemeClusterBreakIteratorUtf16 iter(mTitle.Data(), mTitle.Length());
|
||||
const char16_t* dataBegin = iter;
|
||||
const char16_t* pos = dataBegin;
|
||||
nscoord charWidth;
|
||||
@ -700,7 +701,8 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext,
|
||||
// determine how much of the string will fit in the max width
|
||||
nscoord charWidth = 0;
|
||||
nscoord totalWidth = 0;
|
||||
ClusterIterator leftIter(mTitle.Data(), mTitle.Length());
|
||||
GraphemeClusterBreakIteratorUtf16 leftIter(mTitle.Data(),
|
||||
mTitle.Length());
|
||||
ClusterReverseIterator rightIter(mTitle.Data(), mTitle.Length());
|
||||
const char16_t* dataBegin = leftIter;
|
||||
const char16_t* dataEnd = rightIter;
|
||||
|
@ -5,6 +5,7 @@
|
||||
|
||||
#include "mozilla/TextUtils.h"
|
||||
#include "mozTXTToHTMLConv.h"
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
#include "nsNetUtil.h"
|
||||
#include "nsUnicharUtils.h"
|
||||
#include "nsUnicodeProperties.h"
|
||||
@ -22,6 +23,7 @@
|
||||
using mozilla::IsAscii;
|
||||
using mozilla::IsAsciiAlpha;
|
||||
using mozilla::IsAsciiDigit;
|
||||
using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
|
||||
|
||||
const double growthRate = 1.2;
|
||||
|
||||
@ -557,7 +559,7 @@ bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString,
|
||||
// find length of the char/cluster to be ignored
|
||||
int32_t ignoreLen = before == LT_IGNORE ? 0 : 1;
|
||||
if (ignoreLen) {
|
||||
mozilla::unicode::ClusterIterator ci(aInString, aInLength);
|
||||
GraphemeClusterBreakIteratorUtf16 ci(aInString, aInLength);
|
||||
ci.Next();
|
||||
ignoreLen = ci - aInString;
|
||||
}
|
||||
@ -591,7 +593,7 @@ uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString,
|
||||
uint32_t result = 0;
|
||||
|
||||
const char16_t* end = aInString + aInStringLength;
|
||||
for (mozilla::unicode::ClusterIterator ci(aInString, aInStringLength);
|
||||
for (GraphemeClusterBreakIteratorUtf16 ci(aInString, aInStringLength);
|
||||
!ci.AtEnd(); ci.Next()) {
|
||||
if (ItMatchesDelimited(ci, end - ci, rep, aRepLen, before, after)) {
|
||||
result++;
|
||||
@ -979,7 +981,7 @@ mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo,
|
||||
const char16_t* rawInputString = aInString.BeginReading();
|
||||
uint32_t inLength = aInString.Length();
|
||||
|
||||
for (mozilla::unicode::ClusterIterator ci(rawInputString, inLength);
|
||||
for (GraphemeClusterBreakIteratorUtf16 ci(rawInputString, inLength);
|
||||
!ci.AtEnd();) {
|
||||
uint32_t i = ci - rawInputString;
|
||||
if (doGlyphSubstitution) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user