Bug 1856267 - Don't use the ICU4X segmenter if a word consists only of non-breakable ASCII characters. r=TYLin,jfkthame

`mCurrentWordContainsComplexChar` and `wordHasComplexChar` seem to be used to track whether a word contains a breakable character, so I would like to rename them as a cleanup. Also, `nsLineBreaker` has a fast path if every character is an ASCII letter (AL) or digit (NU). But IS and QU are also unbreakable, so we should add them by using a simple table. Differential Revision: https://phabricator.services.mozilla.com/D190379
2024-11-29 07:42:04 +00:00 · 2023-10-18 13:21:57 +00:00 · 2023-10-18 13:21:57 +00:00 · dccb39dd6d
commit dccb39dd6d
parent 546538be6e
2 changed files with 88 additions and 72 deletions
--- a/dom/base/nsLineBreaker.cpp
+++ b/dom/base/nsLineBreaker.cpp
@ -23,10 +23,39 @@ using mozilla::intl::LocaleParser;
 using mozilla::intl::UnicodeProperties;
 using mozilla::intl::WordBreakRule;

+// There is no break opportunity between any pair of characters that has line
+// break class of either AL (Alphabetic), IS (Infix Numeric Separator), NU
+// (Numeric), or QU (Quotation); see the code point to line break class map at
+// https://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt. Entry i below
+// describes code point 0x20 + i: 1 = such a class, 0 = possibly breakable.
+static constexpr uint8_t kNonBreakableASCII[] = {
+    // clang-format off
+// 0x20-0x2f (0: space ! $ % ( ) + - /)
+0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
+// 0x30-0x3f (0: ?)
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+// 0x40-0x4f (all 1)
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+// 0x50-0x5f (0: [ backslash ])
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
+// 0x60-0x6f (all 1)
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+// 0x70-0x7f (0: { | } DEL)
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
+    // clang-format on
+};
+
+// Returns true when aChar is a code point in [0x20, 0x7f] whose entry in
+// kNonBreakableASCII is non-zero. Everything outside that range (values below
+// 0x20 and above 0x7f) is reported as false, i.e. as possibly breakable.
+template <typename T>
+static constexpr bool IsNonBreakableChar(T aChar) {
+  // The range test must be evaluated first: it keeps the table index in
+  // bounds, and the short-circuit below skips the lookup when it fails.
+  const bool isInTableRange = aChar >= 0x20 && aChar <= 0x7f;
+  return isInTableRange && kNonBreakableASCII[aChar - 0x20] != 0;
+}
+
 nsLineBreaker::nsLineBreaker()
    : mCurrentWordLanguage(nullptr),
      mCurrentWordContainsMixedLang(false),
-      mCurrentWordContainsComplexChar(false),
      mScriptIsChineseOrJapanese(false),
      mAfterBreakableSpace(false),
      mBreakHere(false),
@ -113,7 +142,7 @@ nsresult nsLineBreaker::FlushCurrentWord() {
    memset(breakState.Elements(),
           gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
           length * sizeof(uint8_t));
-  } else if (!mCurrentWordContainsComplexChar) {
+  } else if (!mCurrentWordMightBeBreakable) {
    // For break-strict set everything internal to "break", otherwise
    // to "no break"!
    memset(breakState.Elements(),
@ -188,7 +217,7 @@ nsresult nsLineBreaker::FlushCurrentWord() {

  mCurrentWord.Clear();
  mTextItems.Clear();
-  mCurrentWordContainsComplexChar = false;
+  mCurrentWordMightBeBreakable = false;
  mCurrentWordContainsMixedLang = false;
  mCurrentWordLanguage = nullptr;
  mWordContinuation = false;
@ -216,8 +245,9 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,

    while (offset < aLength && !IsSpace(aText[offset])) {
      mCurrentWord.AppendElement(aText[offset]);
-      if (!mCurrentWordContainsComplexChar && IsComplexChar(aText[offset])) {
-        mCurrentWordContainsComplexChar = true;
+      if (!mCurrentWordMightBeBreakable &&
+          !IsNonBreakableChar<char16_t>(aText[offset])) {
+        mCurrentWordMightBeBreakable = true;
      }
      UpdateCurrentWordLanguage(aHyphenationLanguage);
      ++offset;
@ -273,7 +303,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
    }
  }
  uint32_t wordStart = offset;
-  bool wordHasComplexChar = false;
+  bool wordMightBeBreakable = false;

  RefPtr<nsHyphenator> hyphenator;
  if ((aFlags & BREAK_USE_AUTO_HYPHENATION) &&
@ -305,7 +335,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
            memset(breakState.Elements() + wordStart,
                   gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
                   offset - wordStart);
-          } else if (wordHasComplexChar) {
+          } else if (wordMightBeBreakable) {
            // Save current start-of-word state because ComputeBreakPositions()
            // will set it to false.
            AutoRestore<uint8_t> saveWordStartBreakState(breakState[wordStart]);
@ -323,33 +353,34 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
                              capitalizationState.Elements() + wordStart);
        }
      }
-      wordHasComplexChar = false;
+      wordMightBeBreakable = false;
      mWordContinuation = false;
      ++offset;
      if (offset >= aLength) {
        break;
      }
      wordStart = offset;
-    } else {
-      if (!wordHasComplexChar && IsComplexChar(ch)) {
-        wordHasComplexChar = true;
-      }
-      ++offset;
-      if (offset >= aLength) {
-        // Save this word
-        mCurrentWordContainsComplexChar = wordHasComplexChar;
-        uint32_t len = offset - wordStart;
-        char16_t* elems = mCurrentWord.AppendElements(len);
-        if (!elems) {
-          return NS_ERROR_OUT_OF_MEMORY;
-        }
-        memcpy(elems, aText + wordStart, sizeof(char16_t) * len);
-        mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
-        // Ensure that the break-before for this word is written out
-        offset = wordStart + 1;
-        UpdateCurrentWordLanguage(aHyphenationLanguage);
-        break;
+      continue;
+    }
+
+    if (!wordMightBeBreakable && !IsNonBreakableChar<char16_t>(ch)) {
+      wordMightBeBreakable = true;
+    }
+    ++offset;
+    if (offset >= aLength) {
+      // Save this word
+      mCurrentWordMightBeBreakable = wordMightBeBreakable;
+      uint32_t len = offset - wordStart;
+      char16_t* elems = mCurrentWord.AppendElements(len, mozilla::fallible);
+      if (!elems) {
+        return NS_ERROR_OUT_OF_MEMORY;
      }
+      memcpy(elems, aText + wordStart, sizeof(char16_t) * len);
+      mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
+      // Ensure that the break-before for this word is written out
+      offset = wordStart + 1;
+      UpdateCurrentWordLanguage(aHyphenationLanguage);
+      break;
    }
  }

@ -403,9 +434,9 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,

    while (offset < aLength && !IsSpace(aText[offset])) {
      mCurrentWord.AppendElement(aText[offset]);
-      if (!mCurrentWordContainsComplexChar &&
-          IsComplexASCIIChar(aText[offset])) {
-        mCurrentWordContainsComplexChar = true;
+      if (!mCurrentWordMightBeBreakable &&
+          !IsNonBreakableChar<uint8_t>(aText[offset])) {
+        mCurrentWordMightBeBreakable = true;
      }
      ++offset;
    }
@ -451,7 +482,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
    }
  }
  uint32_t wordStart = offset;
-  bool wordHasComplexChar = false;
+  bool wordMightBeBreakable = false;

  for (;;) {
    uint8_t ch = aText[offset];
@ -477,7 +508,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
          memset(breakState.Elements() + wordStart,
                 gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
                 offset - wordStart);
-        } else if (wordHasComplexChar) {
+        } else if (wordMightBeBreakable) {
          // Save current start-of-word state because ComputeBreakPositions()
          // will set it to false.
          AutoRestore<uint8_t> saveWordStartBreakState(breakState[wordStart]);
@ -487,35 +518,36 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
        }
      }

-      wordHasComplexChar = false;
+      wordMightBeBreakable = false;
      mWordContinuation = false;
      ++offset;
      if (offset >= aLength) {
        break;
      }
      wordStart = offset;
-    } else {
-      if (!wordHasComplexChar && IsComplexASCIIChar(ch)) {
-        wordHasComplexChar = true;
+      continue;
+    }
+
+    if (!wordMightBeBreakable && !IsNonBreakableChar<uint8_t>(ch)) {
+      wordMightBeBreakable = true;
+    }
+    ++offset;
+    if (offset >= aLength) {
+      // Save this word
+      mCurrentWordMightBeBreakable = wordMightBeBreakable;
+      uint32_t len = offset - wordStart;
+      char16_t* elems = mCurrentWord.AppendElements(len, mozilla::fallible);
+      if (!elems) {
+        return NS_ERROR_OUT_OF_MEMORY;
      }
-      ++offset;
-      if (offset >= aLength) {
-        // Save this word
-        mCurrentWordContainsComplexChar = wordHasComplexChar;
-        uint32_t len = offset - wordStart;
-        char16_t* elems = mCurrentWord.AppendElements(len);
-        if (!elems) {
-          return NS_ERROR_OUT_OF_MEMORY;
-        }
-        uint32_t i;
-        for (i = wordStart; i < offset; ++i) {
-          elems[i - wordStart] = aText[i];
-        }
-        mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
-        // Ensure that the break-before for this word is written out
-        offset = wordStart + 1;
-        break;
+      uint32_t i;
+      for (i = wordStart; i < offset; ++i) {
+        elems[i - wordStart] = aText[i];
      }
+      mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
+      // Ensure that the break-before for this word is written out
+      offset = wordStart + 1;
+      break;
    }
  }

--- a/dom/base/nsLineBreaker.h
+++ b/dom/base/nsLineBreaker.h
@ -57,8 +57,7 @@ class nsILineBreakSink {
 *
 * The current strategy is that we break the overall text into
 * whitespace-delimited "words". Then those words are passed to the LineBreaker
- * service for deeper analysis if they contain a "complex" character as
- * described below.
+ * for deeper analysis if they might contain breakable characters.
 *
 * This class also handles detection of which characters should be capitalized
 * for text-transform:capitalize. This is a good place to handle that because
@ -73,27 +72,12 @@ class nsLineBreaker {
    return mozilla::intl::NS_IsSpace(u);
  }

-  static inline bool IsComplexASCIIChar(char16_t u) {
-    return !((0x0030 <= u && u <= 0x0039) || (0x0041 <= u && u <= 0x005A) ||
-             (0x0061 <= u && u <= 0x007A) || (0x000a == u));
-  }
-
-  static inline bool IsComplexChar(char16_t u) {
-    return IsComplexASCIIChar(u) ||
-           mozilla::intl::NS_NeedsPlatformNativeHandling(u) ||
-           (0x1100 <= u && u <= 0x11ff) ||  // Hangul Jamo
-           (0x2000 <= u && u <= 0x21ff) ||  // Punctuations and Symbols
-           (0x2e80 <= u && u <= 0xd7ff) ||  // several CJK blocks
-           (0xf900 <= u && u <= 0xfaff) ||  // CJK Compatibility Idographs
-           (0xff00 <= u && u <= 0xffef);    // Halfwidth and Fullwidth Forms
-  }
-
  // Break opportunities exist at the end of each run of breakable whitespace
  // (see IsSpace above). Break opportunities can also exist between pairs of
  // non-whitespace characters, as determined by mozilla::intl::LineBreaker.
  // We pass a whitespace-
  // delimited word to LineBreaker if it contains at least one character
-  // matching IsComplexChar.
+  // whose line break class might allow a break.
  // We provide flags to control on a per-chunk basis where breaks are allowed.
  // At any character boundary, exactly one text chunk governs whether a
  // break is allowed at that boundary.
@ -263,7 +247,7 @@ class nsLineBreaker {
  AutoTArray<TextItem, 2> mTextItems;
  nsAtom* mCurrentWordLanguage;
  bool mCurrentWordContainsMixedLang;
-  bool mCurrentWordContainsComplexChar;
+  bool mCurrentWordMightBeBreakable = false;
  bool mScriptIsChineseOrJapanese;

  // True if the previous character was breakable whitespace