From e522533f4ec0b4b8de8fe0af0ce4fb494bbf9139 Mon Sep 17 00:00:00 2001
From: Ting-Yu Lin <tlin@mozilla.com>
Date: Thu, 13 Jan 2022 18:36:03 +0000
Subject: [PATCH] Bug 1745113 Part 1 - Move ClusterIterator into Segmenter.h,
 and rename it. r=necko-reviewers,kershaw

This patch doesn't change the behavior. It only moves ClusterIterator and
renames it to GraphemeClusterBreakIteratorUtf16.

Differential Revision: https://phabricator.services.mozilla.com/D135639
---
 gfx/thebes/gfxFont.cpp                        |   5 +-
 intl/lwbrk/LineBreaker.cpp                    |   2 +-
 intl/lwbrk/Segmenter.cpp                      | 113 ++++++++++++++++++
 intl/lwbrk/Segmenter.h                        |  29 +++++
 intl/unicharutil/util/nsUnicodeProperties.cpp | 111 +----------------
 intl/unicharutil/util/nsUnicodeProperties.h   |  28 -----
 layout/forms/nsFileControlFrame.cpp           |   5 +-
 layout/xul/nsTextBoxFrame.cpp                 |   8 +-
 .../converters/mozTXTToHTMLConv.cpp           |   8 +-
 9 files changed, 161 insertions(+), 148 deletions(-)

diff --git a/gfx/thebes/gfxFont.cpp b/gfx/thebes/gfxFont.cpp
index 8685523393e4..4d45116f5d73 100644
--- a/gfx/thebes/gfxFont.cpp
+++ b/gfx/thebes/gfxFont.cpp
@@ -10,6 +10,7 @@
 #include "mozilla/FontPropertyTypes.h"
 #include "mozilla/gfx/2D.h"
 #include "mozilla/IntegerRange.h"
+#include "mozilla/intl/Segmenter.h"
 #include "mozilla/MathAlgorithms.h"
 #include "mozilla/StaticPrefs_gfx.h"
 #include "mozilla/SVGContextPaint.h"
@@ -582,9 +583,9 @@ void gfxShapedText::SetupClusterBoundaries(uint32_t aOffset,
   CompressedGlyph extendCluster = CompressedGlyph::MakeComplex(false, true);
 
   const char16_t* const stringStart = aString;
-  ClusterIterator iter(aString, aLength);
+  intl::GraphemeClusterBreakIteratorUtf16 iter(aString, aLength);
 
-  // the ClusterIterator won't be able to tell us if the string
+  // GraphemeClusterBreakIteratorUtf16 won't be able to tell us if the string
   // _begins_ with a cluster-extender, so we handle that here
   if (aLength) {
     uint32_t ch = *aString;
diff --git a/intl/lwbrk/LineBreaker.cpp b/intl/lwbrk/LineBreaker.cpp
index 45c073b7bb3f..61beef408e68 100644
--- a/intl/lwbrk/LineBreaker.cpp
+++ b/intl/lwbrk/LineBreaker.cpp
@@ -1090,7 +1090,7 @@ void LineBreaker::ComputeBreakPositions(
       if (aWordBreak == WordBreakRule::BreakAll) {
         // For break-all, we don't need to run a dictionary-based breaking
         // algorithm, we just allow breaks between all grapheme clusters.
-        ClusterIterator ci(aChars + cur, end - cur);
+        GraphemeClusterBreakIteratorUtf16 ci(aChars + cur, end - cur);
         while (!ci.AtEnd()) {
           ci.Next();
           aBreakBefore[ci - aChars] = true;
diff --git a/intl/lwbrk/Segmenter.cpp b/intl/lwbrk/Segmenter.cpp
index aa88c71dde01..413fb182621b 100644
--- a/intl/lwbrk/Segmenter.cpp
+++ b/intl/lwbrk/Segmenter.cpp
@@ -10,6 +10,11 @@
 
 #include "mozilla/intl/LineBreaker.h"
 #include "mozilla/intl/WordBreaker.h"
+#include "mozilla/intl/UnicodeProperties.h"
+#include "nsUnicodeProperties.h"
+#include "nsCharTraits.h"
+
+using namespace mozilla::unicode;
 
 namespace mozilla::intl {
 
@@ -50,6 +55,114 @@ Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
   return Some(mPos);
 }
 
+enum HSType {  // Hangul syllable types, aliasing the U_HST_* constants
+  HST_NONE = U_HST_NOT_APPLICABLE,
+  HST_L = U_HST_LEADING_JAMO,
+  HST_V = U_HST_VOWEL_JAMO,
+  HST_T = U_HST_TRAILING_JAMO,
+  HST_LV = U_HST_LV_SYLLABLE,
+  HST_LVT = U_HST_LVT_SYLLABLE
+};
+
+static HSType GetHangulSyllableType(uint32_t aCh) {
+  const auto prop = UnicodeProperties::IntProperty::HangulSyllableType;
+  return static_cast<HSType>(UnicodeProperties::GetIntPropertyValue(aCh, prop));
+}
+
+// Advance mPos past one grapheme cluster; warn and do nothing if AtEnd().
+void GraphemeClusterBreakIteratorUtf16::Next() {
+  if (AtEnd()) {
+    NS_WARNING(
+        "GraphemeClusterBreakIteratorUtf16 has already reached the end");
+    return;
+  }
+
+  uint32_t ch = *mPos++;
+  if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) {
+    ch = SURROGATE_TO_UCS4(ch, *mPos++);
+  } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
+             (ch >= 0xac00 && ch <= 0xd7ff)) {
+    // Handle conjoining Jamo that make Hangul syllables
+    HSType hangulState = GetHangulSyllableType(ch);
+    while (mPos < mLimit) {
+      ch = *mPos;
+      HSType hangulType = GetHangulSyllableType(ch);
+      switch (hangulType) {
+        case HST_L:
+        case HST_LV:
+        case HST_LVT:
+          if (hangulState == HST_L) {
+            hangulState = hangulType;
+            mPos++;
+            continue;
+          }
+          break;
+        case HST_V:
+          if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
+              (hangulState != HST_LVT)) {
+            hangulState = hangulType;
+            mPos++;
+            continue;
+          }
+          break;
+        case HST_T:
+          if (hangulState != HST_NONE && hangulState != HST_L) {
+            hangulState = hangulType;
+            mPos++;
+            continue;
+          }
+          break;
+        default:
+          break;
+      }
+      break;  // no case consumed this code unit: the syllable ends here
+    }
+  }
+
+  const uint32_t kVS16 = 0xfe0f;  // VARIATION SELECTOR-16
+  const uint32_t kZWJ = 0x200d;   // ZERO WIDTH JOINER
+  // UTF-16 surrogate values for Fitzpatrick type modifiers (U+1F3FB..U+1F3FF)
+  const uint32_t kFitzpatrickHigh = 0xD83C;
+  const uint32_t kFitzpatrickLowFirst = 0xDFFB;
+  const uint32_t kFitzpatrickLowLast = 0xDFFF;
+
+  bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
+                     (GetEmojiPresentation(ch) == TextDefault &&
+                      ((mPos < mLimit && *mPos == kVS16) ||
+                       (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh &&
+                        *(mPos + 1) >= kFitzpatrickLowFirst &&
+                        *(mPos + 1) <= kFitzpatrickLowLast)));
+  bool prevWasZwj = false;  // was the most recently consumed extender a ZWJ?
+
+  while (mPos < mLimit) {
+    ch = *mPos;
+    size_t chLen = 1;
+    // Decode a surrogate pair, if any. An isolated surrogate is just treated
+    // as a generic (non-cluster-extending) character, which is fine here.
+    if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) {
+      ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
+      chLen = 2;
+    }
+
+    bool extendCluster =
+        IsClusterExtender(ch) ||
+        (baseIsEmoji && prevWasZwj &&
+         ((GetEmojiPresentation(ch) == EmojiDefault) ||
+          (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit &&
+           *(mPos + chLen) == kVS16)));
+    if (!extendCluster) {
+      break;
+    }
+
+    prevWasZwj = (ch == kZWJ);
+    mPos += chLen;
+  }
+
+  NS_ASSERTION(
+      mText < mPos && mPos <= mLimit,
+      "GraphemeClusterBreakIteratorUtf16::Next has overshot the string!");
+}
+
 Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
     Span<const char> aLocale, const SegmenterOptions& aOptions) {
   if (aOptions.mGranularity == SegmenterGranularity::Grapheme ||
diff --git a/intl/lwbrk/Segmenter.h b/intl/lwbrk/Segmenter.h
index 52c0734aee90..26fb8458b7f1 100644
--- a/intl/lwbrk/Segmenter.h
+++ b/intl/lwbrk/Segmenter.h
@@ -121,6 +121,35 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
   Maybe<uint32_t> Next() override;
 };
 
+/**
+ * Forward grapheme cluster iterator over UTF-16 text; does not own the text.
+ */
+class GraphemeClusterBreakIteratorUtf16 {
+ public:
+  GraphemeClusterBreakIteratorUtf16(const char16_t* aText, uint32_t aLength)
+      : mPos(aText),
+        mLimit(aText + aLength)
+#ifdef DEBUG
+        ,
+        mText(aText)
+#endif
+  {
+  }
+
+  operator const char16_t*() const { return mPos; }  // current position
+
+  bool AtEnd() const { return mPos >= mLimit; }
+
+  void Next();
+
+ private:
+  const char16_t* mPos;    // current position in the text
+  const char16_t* mLimit;  // end of the text (aText + aLength)
+#ifdef DEBUG
+  const char16_t* mText;  // start of the text; DEBUG builds only
+#endif
+};
+
 /**
  * This component is a Mozilla-focused API for working with segmenters in
  * internationalization code.
diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp
index 69edf03e4206..2acc0f0296d1 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@@ -9,6 +9,7 @@
 
 #include "mozilla/ArrayUtils.h"
 #include "mozilla/HashTable.h"
+#include "mozilla/intl/Segmenter.h"
 #include "nsCharTraits.h"
 
 #include "BaseChars.h"
@@ -167,114 +168,6 @@ bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
       (aCh >= 0xe0020 && aCh <= 0xe007f));   // emoji (flag) tag characters
 }
 
-enum HSType {
-  HST_NONE = U_HST_NOT_APPLICABLE,
-  HST_L = U_HST_LEADING_JAMO,
-  HST_V = U_HST_VOWEL_JAMO,
-  HST_T = U_HST_TRAILING_JAMO,
-  HST_LV = U_HST_LV_SYLLABLE,
-  HST_LVT = U_HST_LVT_SYLLABLE
-};
-
-static HSType GetHangulSyllableType(uint32_t aCh) {
-  return HSType(intl::UnicodeProperties::GetIntPropertyValue(
-      aCh, intl::UnicodeProperties::IntProperty::HangulSyllableType));
-}
-
-void ClusterIterator::Next() {
-  if (AtEnd()) {
-    NS_WARNING("ClusterIterator has already reached the end");
-    return;
-  }
-
-  uint32_t ch = *mPos++;
-
-  if (mPos < mLimit && NS_IS_SURROGATE_PAIR(ch, *mPos)) {
-    ch = SURROGATE_TO_UCS4(ch, *mPos++);
-  } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
-             (ch >= 0xac00 && ch <= 0xd7ff)) {
-    // Handle conjoining Jamo that make Hangul syllables
-    HSType hangulState = GetHangulSyllableType(ch);
-    while (mPos < mLimit) {
-      ch = *mPos;
-      HSType hangulType = GetHangulSyllableType(ch);
-      switch (hangulType) {
-        case HST_L:
-        case HST_LV:
-        case HST_LVT:
-          if (hangulState == HST_L) {
-            hangulState = hangulType;
-            mPos++;
-            continue;
-          }
-          break;
-        case HST_V:
-          if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
-              (hangulState != HST_LVT)) {
-            hangulState = hangulType;
-            mPos++;
-            continue;
-          }
-          break;
-        case HST_T:
-          if (hangulState != HST_NONE && hangulState != HST_L) {
-            hangulState = hangulType;
-            mPos++;
-            continue;
-          }
-          break;
-        default:
-          break;
-      }
-      break;
-    }
-  }
-
-  const uint32_t kVS16 = 0xfe0f;
-  const uint32_t kZWJ = 0x200d;
-  // UTF-16 surrogate values for Fitzpatrick type modifiers
-  const uint32_t kFitzpatrickHigh = 0xD83C;
-  const uint32_t kFitzpatrickLowFirst = 0xDFFB;
-  const uint32_t kFitzpatrickLowLast = 0xDFFF;
-
-  bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
-                     (GetEmojiPresentation(ch) == TextDefault &&
-                      ((mPos < mLimit && *mPos == kVS16) ||
-                       (mPos + 1 < mLimit && *mPos == kFitzpatrickHigh &&
-                        *(mPos + 1) >= kFitzpatrickLowFirst &&
-                        *(mPos + 1) <= kFitzpatrickLowLast)));
-  bool prevWasZwj = false;
-
-  while (mPos < mLimit) {
-    ch = *mPos;
-    size_t chLen = 1;
-
-    // Check for surrogate pairs; note that isolated surrogates will just
-    // be treated as generic (non-cluster-extending) characters here,
-    // which is fine for cluster-iterating purposes
-    if (mPos < mLimit - 1 && NS_IS_SURROGATE_PAIR(ch, *(mPos + 1))) {
-      ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
-      chLen = 2;
-    }
-
-    bool extendCluster =
-        IsClusterExtender(ch) ||
-        (baseIsEmoji && prevWasZwj &&
-         ((GetEmojiPresentation(ch) == EmojiDefault) ||
-          (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < mLimit &&
-           *(mPos + chLen) == kVS16)));
-    if (!extendCluster) {
-      break;
-    }
-
-    prevWasZwj = (ch == kZWJ);
-    mPos += chLen;
-  }
-
-  NS_ASSERTION(mText < mPos && mPos <= mLimit,
-               "ClusterIterator::Next has overshot the string!");
-}
-
 void ClusterReverseIterator::Next() {
   if (AtEnd()) {
     NS_WARNING("ClusterReverseIterator has already reached the end");
@@ -301,7 +194,7 @@ void ClusterReverseIterator::Next() {
 }
 
 uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) {
-  ClusterIterator iter(aText, aLength);
+  intl::GraphemeClusterBreakIteratorUtf16 iter(aText, aLength);
   uint32_t result = 0;
   while (!iter.AtEnd()) {
     ++result;
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h
index e2d42a4922ea..a1471ca40bb9 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -158,34 +158,6 @@ inline bool IsClusterExtender(uint32_t aCh) {
   return IsClusterExtender(aCh, GetGeneralCategory(aCh));
 }
 
-// A simple iterator for a string of char16_t codepoints that advances
-// by Unicode grapheme clusters
-class ClusterIterator {
- public:
-  ClusterIterator(const char16_t* aText, uint32_t aLength)
-      : mPos(aText),
-        mLimit(aText + aLength)
-#ifdef DEBUG
-        ,
-        mText(aText)
-#endif
-  {
-  }
-
-  operator const char16_t*() const { return mPos; }
-
-  bool AtEnd() const { return mPos >= mLimit; }
-
-  void Next();
-
- private:
-  const char16_t* mPos;
-  const char16_t* mLimit;
-#ifdef DEBUG
-  const char16_t* mText;
-#endif
-};
-
 // Count the number of grapheme clusters in the given string
 uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
 
diff --git a/layout/forms/nsFileControlFrame.cpp b/layout/forms/nsFileControlFrame.cpp
index cf7fbabe5153..26c0a3382adc 100644
--- a/layout/forms/nsFileControlFrame.cpp
+++ b/layout/forms/nsFileControlFrame.cpp
@@ -21,6 +21,7 @@
 #include "mozilla/dom/HTMLButtonElement.h"
 #include "mozilla/dom/HTMLInputElement.h"
 #include "mozilla/dom/MutationEventBinding.h"
+#include "mozilla/intl/Segmenter.h"
 #include "mozilla/Preferences.h"
 #include "mozilla/PresShell.h"
 #include "mozilla/StaticPrefs_dom.h"
@@ -87,9 +88,9 @@ bool nsFileControlFrame::CropTextToWidth(gfxContext& aRenderingContext,
 
   // determine how much of the string will fit in the max width
   nscoord totalWidth = textWidth;
-  using mozilla::unicode::ClusterIterator;
   using mozilla::unicode::ClusterReverseIterator;
-  ClusterIterator leftIter(aText.Data(), aText.Length());
+  intl::GraphemeClusterBreakIteratorUtf16 leftIter(aText.Data(),
+                                                   aText.Length());
   ClusterReverseIterator rightIter(aText.Data(), aText.Length());
   const char16_t* leftPos = leftIter;
   const char16_t* rightPos = rightIter;
diff --git a/layout/xul/nsTextBoxFrame.cpp b/layout/xul/nsTextBoxFrame.cpp
index a9f653a197e3..5f5aa798b5ac 100644
--- a/layout/xul/nsTextBoxFrame.cpp
+++ b/layout/xul/nsTextBoxFrame.cpp
@@ -13,6 +13,7 @@
 #include "mozilla/ComputedStyle.h"
 #include "mozilla/Preferences.h"
 #include "mozilla/PresShell.h"
+#include "mozilla/intl/Segmenter.h"
 #include "mozilla/layers/RenderRootStateManager.h"
 #include "mozilla/gfx/2D.h"
 #include "nsFontMetrics.h"
@@ -614,7 +615,7 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext,
     titleWidth = 0;
   }
 
-  using mozilla::unicode::ClusterIterator;
+  using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
   using mozilla::unicode::ClusterReverseIterator;
 
   // ok crop things
@@ -622,7 +623,7 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext,
     case CropAuto:
     case CropNone:
     case CropRight: {
-      ClusterIterator iter(mTitle.Data(), mTitle.Length());
+      GraphemeClusterBreakIteratorUtf16 iter(mTitle.Data(), mTitle.Length());
       const char16_t* dataBegin = iter;
       const char16_t* pos = dataBegin;
       nscoord charWidth;
@@ -700,7 +701,8 @@ nscoord nsTextBoxFrame::CalculateTitleForWidth(gfxContext& aRenderingContext,
       // determine how much of the string will fit in the max width
       nscoord charWidth = 0;
       nscoord totalWidth = 0;
-      ClusterIterator leftIter(mTitle.Data(), mTitle.Length());
+      GraphemeClusterBreakIteratorUtf16 leftIter(mTitle.Data(),
+                                                 mTitle.Length());
       ClusterReverseIterator rightIter(mTitle.Data(), mTitle.Length());
       const char16_t* dataBegin = leftIter;
       const char16_t* dataEnd = rightIter;
diff --git a/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp b/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp
index 1d09d44c34cd..9914fb0dbde5 100644
--- a/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp
+++ b/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp
@@ -5,6 +5,7 @@
 
 #include "mozilla/TextUtils.h"
 #include "mozTXTToHTMLConv.h"
+#include "mozilla/intl/Segmenter.h"
 #include "nsNetUtil.h"
 #include "nsUnicharUtils.h"
 #include "nsUnicodeProperties.h"
@@ -22,6 +23,7 @@
 using mozilla::IsAscii;
 using mozilla::IsAsciiAlpha;
 using mozilla::IsAsciiDigit;
+using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
 
 const double growthRate = 1.2;
 
@@ -557,7 +559,7 @@ bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString,
   // find length of the char/cluster to be ignored
   int32_t ignoreLen = before == LT_IGNORE ? 0 : 1;
   if (ignoreLen) {
-    mozilla::unicode::ClusterIterator ci(aInString, aInLength);
+    GraphemeClusterBreakIteratorUtf16 ci(aInString, aInLength);
     ci.Next();
     ignoreLen = ci - aInString;
   }
@@ -591,7 +593,7 @@ uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString,
   uint32_t result = 0;
 
   const char16_t* end = aInString + aInStringLength;
-  for (mozilla::unicode::ClusterIterator ci(aInString, aInStringLength);
+  for (GraphemeClusterBreakIteratorUtf16 ci(aInString, aInStringLength);
        !ci.AtEnd(); ci.Next()) {
     if (ItMatchesDelimited(ci, end - ci, rep, aRepLen, before, after)) {
       result++;
@@ -979,7 +981,7 @@ mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo,
   const char16_t* rawInputString = aInString.BeginReading();
   uint32_t inLength = aInString.Length();
 
-  for (mozilla::unicode::ClusterIterator ci(rawInputString, inLength);
+  for (GraphemeClusterBreakIteratorUtf16 ci(rawInputString, inLength);
        !ci.AtEnd();) {
     uint32_t i = ci - rawInputString;
     if (doGlyphSubstitution) {