Bug 1856267 - Don't use ICU4X segmenter if word is ASCII only and non-breakable character. r=TYLin,jfkthame

`mCurrentWordContainsComplexChar` and `wordHasComplexChar` appear to be used to
track whether a word might contain a breakable character. I would like to
rename them to reflect that, as a cleanup.

Also, `nsLineBreaker` has a fast path for words made up only of ASCII
characters that are alphabetic (AL) or numeric (NU). But characters of class IS
and QU are also unbreakable, so we should cover them as well by using a simple
lookup table.

Differential Revision: https://phabricator.services.mozilla.com/D190379
This commit is contained in:
Makoto Kato 2023-10-18 13:21:57 +00:00
parent 546538be6e
commit dccb39dd6d
2 changed files with 88 additions and 72 deletions

View File

@ -23,10 +23,39 @@ using mozilla::intl::LocaleParser;
using mozilla::intl::UnicodeProperties;
using mozilla::intl::WordBreakRule;
// ASCII fast path for deciding whether a word needs full line-break analysis.
//
// There is no break opportunity between any pair of characters whose line
// break class is one of AL (Alphabetic), IS (Infix Numeric Separator), NU
// (Numeric), or QU (Quotation). See
// https://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt for the mapping
// from Unicode code point to line break class.
//
// kNonBreakableASCII has one entry per code point in
// [kNonBreakableASCIIFirst, kNonBreakableASCIILast]; an entry is 1 when the
// code point belongs to one of the classes above and 0 otherwise.
static constexpr int kNonBreakableASCIIFirst = 0x20;
static constexpr int kNonBreakableASCIILast = 0x7f;

static constexpr uint8_t kNonBreakableASCII[] = {
    // clang-format off
    // 0x20-0x2f:  SP ! " # $ % & ' ( ) * + , - . /
    0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
    // 0x30-0x3f:  0 1 2 3 4 5 6 7 8 9 : ; < = > ?
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
    // 0x40-0x4f:  @ A B C D E F G H I J K L M N O
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x50-0x5f:  P Q R S T U V W X Y Z [ (backslash) ] ^ _
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
    // 0x60-0x6f:  ` a b c d e f g h i j k l m n o
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x70-0x7f:  p q r s t u v w x y z { | } ~ DEL
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
    // clang-format on
};

// The range check in IsNonBreakableChar() is only safe while the table covers
// exactly the advertised range; fail the build if a row is added or dropped.
static_assert(static_cast<int>(sizeof(kNonBreakableASCII) /
                               sizeof(kNonBreakableASCII[0])) ==
                  kNonBreakableASCIILast - kNonBreakableASCIIFirst + 1,
              "kNonBreakableASCII must have one entry per code point in "
              "[kNonBreakableASCIIFirst, kNonBreakableASCIILast]");

// Returns true if aChar lies in [kNonBreakableASCIIFirst,
// kNonBreakableASCIILast] and its kNonBreakableASCII entry is non-zero.
// Anything outside that range (control characters and all non-ASCII code
// units) returns false, i.e. is treated as potentially breakable.
template <typename T>
static constexpr bool IsNonBreakableChar(T aChar) {
  if (aChar < kNonBreakableASCIIFirst || aChar > kNonBreakableASCIILast) {
    return false;
  }
  return kNonBreakableASCII[aChar - kNonBreakableASCIIFirst] != 0;
}
nsLineBreaker::nsLineBreaker()
: mCurrentWordLanguage(nullptr),
mCurrentWordContainsMixedLang(false),
mCurrentWordContainsComplexChar(false),
mScriptIsChineseOrJapanese(false),
mAfterBreakableSpace(false),
mBreakHere(false),
@ -113,7 +142,7 @@ nsresult nsLineBreaker::FlushCurrentWord() {
memset(breakState.Elements(),
gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
length * sizeof(uint8_t));
} else if (!mCurrentWordContainsComplexChar) {
} else if (!mCurrentWordMightBeBreakable) {
// For break-strict set everything internal to "break", otherwise
// to "no break"!
memset(breakState.Elements(),
@ -188,7 +217,7 @@ nsresult nsLineBreaker::FlushCurrentWord() {
mCurrentWord.Clear();
mTextItems.Clear();
mCurrentWordContainsComplexChar = false;
mCurrentWordMightBeBreakable = false;
mCurrentWordContainsMixedLang = false;
mCurrentWordLanguage = nullptr;
mWordContinuation = false;
@ -216,8 +245,9 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
while (offset < aLength && !IsSpace(aText[offset])) {
mCurrentWord.AppendElement(aText[offset]);
if (!mCurrentWordContainsComplexChar && IsComplexChar(aText[offset])) {
mCurrentWordContainsComplexChar = true;
if (!mCurrentWordMightBeBreakable &&
!IsNonBreakableChar<char16_t>(aText[offset])) {
mCurrentWordMightBeBreakable = true;
}
UpdateCurrentWordLanguage(aHyphenationLanguage);
++offset;
@ -273,7 +303,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
}
}
uint32_t wordStart = offset;
bool wordHasComplexChar = false;
bool wordMightBeBreakable = false;
RefPtr<nsHyphenator> hyphenator;
if ((aFlags & BREAK_USE_AUTO_HYPHENATION) &&
@ -305,7 +335,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
memset(breakState.Elements() + wordStart,
gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
offset - wordStart);
} else if (wordHasComplexChar) {
} else if (wordMightBeBreakable) {
// Save current start-of-word state because ComputeBreakPositions()
// will set it to false.
AutoRestore<uint8_t> saveWordStartBreakState(breakState[wordStart]);
@ -323,33 +353,34 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
capitalizationState.Elements() + wordStart);
}
}
wordHasComplexChar = false;
wordMightBeBreakable = false;
mWordContinuation = false;
++offset;
if (offset >= aLength) {
break;
}
wordStart = offset;
} else {
if (!wordHasComplexChar && IsComplexChar(ch)) {
wordHasComplexChar = true;
}
++offset;
if (offset >= aLength) {
// Save this word
mCurrentWordContainsComplexChar = wordHasComplexChar;
uint32_t len = offset - wordStart;
char16_t* elems = mCurrentWord.AppendElements(len);
if (!elems) {
return NS_ERROR_OUT_OF_MEMORY;
}
memcpy(elems, aText + wordStart, sizeof(char16_t) * len);
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
// Ensure that the break-before for this word is written out
offset = wordStart + 1;
UpdateCurrentWordLanguage(aHyphenationLanguage);
break;
continue;
}
if (!wordMightBeBreakable && !IsNonBreakableChar<char16_t>(ch)) {
wordMightBeBreakable = true;
}
++offset;
if (offset >= aLength) {
// Save this word
mCurrentWordMightBeBreakable = wordMightBeBreakable;
uint32_t len = offset - wordStart;
char16_t* elems = mCurrentWord.AppendElements(len, mozilla::fallible);
if (!elems) {
return NS_ERROR_OUT_OF_MEMORY;
}
memcpy(elems, aText + wordStart, sizeof(char16_t) * len);
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
// Ensure that the break-before for this word is written out
offset = wordStart + 1;
UpdateCurrentWordLanguage(aHyphenationLanguage);
break;
}
}
@ -403,9 +434,9 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
while (offset < aLength && !IsSpace(aText[offset])) {
mCurrentWord.AppendElement(aText[offset]);
if (!mCurrentWordContainsComplexChar &&
IsComplexASCIIChar(aText[offset])) {
mCurrentWordContainsComplexChar = true;
if (!mCurrentWordMightBeBreakable &&
!IsNonBreakableChar<uint8_t>(aText[offset])) {
mCurrentWordMightBeBreakable = true;
}
++offset;
}
@ -451,7 +482,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
}
}
uint32_t wordStart = offset;
bool wordHasComplexChar = false;
bool wordMightBeBreakable = false;
for (;;) {
uint8_t ch = aText[offset];
@ -477,7 +508,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
memset(breakState.Elements() + wordStart,
gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
offset - wordStart);
} else if (wordHasComplexChar) {
} else if (wordMightBeBreakable) {
// Save current start-of-word state because ComputeBreakPositions()
// will set it to false.
AutoRestore<uint8_t> saveWordStartBreakState(breakState[wordStart]);
@ -487,35 +518,36 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
}
}
wordHasComplexChar = false;
wordMightBeBreakable = false;
mWordContinuation = false;
++offset;
if (offset >= aLength) {
break;
}
wordStart = offset;
} else {
if (!wordHasComplexChar && IsComplexASCIIChar(ch)) {
wordHasComplexChar = true;
continue;
}
if (!wordMightBeBreakable && !IsNonBreakableChar<uint8_t>(ch)) {
wordMightBeBreakable = true;
}
++offset;
if (offset >= aLength) {
// Save this word
mCurrentWordMightBeBreakable = wordMightBeBreakable;
uint32_t len = offset - wordStart;
char16_t* elems = mCurrentWord.AppendElements(len, mozilla::fallible);
if (!elems) {
return NS_ERROR_OUT_OF_MEMORY;
}
++offset;
if (offset >= aLength) {
// Save this word
mCurrentWordContainsComplexChar = wordHasComplexChar;
uint32_t len = offset - wordStart;
char16_t* elems = mCurrentWord.AppendElements(len);
if (!elems) {
return NS_ERROR_OUT_OF_MEMORY;
}
uint32_t i;
for (i = wordStart; i < offset; ++i) {
elems[i - wordStart] = aText[i];
}
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
// Ensure that the break-before for this word is written out
offset = wordStart + 1;
break;
uint32_t i;
for (i = wordStart; i < offset; ++i) {
elems[i - wordStart] = aText[i];
}
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
// Ensure that the break-before for this word is written out
offset = wordStart + 1;
break;
}
}

View File

@ -57,8 +57,7 @@ class nsILineBreakSink {
*
* The current strategy is that we break the overall text into
* whitespace-delimited "words". Then those words are passed to the LineBreaker
* service for deeper analysis if they contain a "complex" character as
* described below.
* for deeper analysis if they might contain breakable characters.
*
* This class also handles detection of which characters should be capitalized
* for text-transform:capitalize. This is a good place to handle that because
@ -73,27 +72,12 @@ class nsLineBreaker {
return mozilla::intl::NS_IsSpace(u);
}
// Reports whether |u| is anything other than an ASCII digit (0-9), an ASCII
// letter (A-Z or a-z), or a line feed (U+000A). Every other code unit,
// including all non-ASCII ones, yields true.
static inline bool IsComplexASCIIChar(char16_t u) {
  const bool isDigit = u >= 0x0030 && u <= 0x0039;
  const bool isUpper = u >= 0x0041 && u <= 0x005A;
  const bool isLower = u >= 0x0061 && u <= 0x007A;
  const bool isLineFeed = u == 0x000a;
  if (isDigit || isUpper || isLower || isLineFeed) {
    return false;
  }
  return true;
}
// Reports whether |u| counts as a "complex" character. The checks run in this
// order and the first hit wins:
//   1. IsComplexASCIIChar(u)
//   2. mozilla::intl::NS_NeedsPlatformNativeHandling(u)
//   3. |u| falls in one of the code unit ranges listed below.
// NOTE(review): IsComplexASCIIChar() is true for every code unit that is not
// an ASCII digit, ASCII letter or line feed, so steps 2 and 3 are only reached
// for those few characters.
static inline bool IsComplexChar(char16_t u) {
  if (IsComplexASCIIChar(u)) {
    return true;
  }
  if (mozilla::intl::NS_NeedsPlatformNativeHandling(u)) {
    return true;
  }

  const bool inHangulJamo = u >= 0x1100 && u <= 0x11ff;
  const bool inPunctuationAndSymbols = u >= 0x2000 && u <= 0x21ff;
  const bool inCJKBlocks = u >= 0x2e80 && u <= 0xd7ff;  // several CJK blocks
  const bool inCJKCompatIdeographs = u >= 0xf900 && u <= 0xfaff;
  const bool inHalfAndFullwidthForms = u >= 0xff00 && u <= 0xffef;

  return inHangulJamo || inPunctuationAndSymbols || inCJKBlocks ||
         inCJKCompatIdeographs || inHalfAndFullwidthForms;
}
// Break opportunities exist at the end of each run of breakable whitespace
// (see IsSpace above). Break opportunities can also exist between pairs of
// non-whitespace characters, as determined by mozilla::intl::LineBreaker.
// We pass a whitespace-
// delimited word to LineBreaker if it contains at least one character
// matching IsComplexChar.
// whose line break class might allow a break.
// We provide flags to control on a per-chunk basis where breaks are allowed.
// At any character boundary, exactly one text chunk governs whether a
// break is allowed at that boundary.
@ -263,7 +247,7 @@ class nsLineBreaker {
AutoTArray<TextItem, 2> mTextItems;
nsAtom* mCurrentWordLanguage;
bool mCurrentWordContainsMixedLang;
bool mCurrentWordContainsComplexChar;
bool mCurrentWordMightBeBreakable = false;
bool mScriptIsChineseOrJapanese;
// True if the previous character was breakable whitespace