mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-29 07:42:04 +00:00
Bug 1856267 - Don't use the ICU4X segmenter if a word consists only of non-breakable ASCII characters. r=TYLin,jfkthame
`mCurrentWordContainsComplexChar` and `wordHasComplexChar` seem to be used to track whether a word contains a breakable character, so I would like to rename them as a cleanup. Also, `nsLineBreaker` has a fast path if every character is ASCII with a line break class of alphabetic (AL) or numeric (NU). But IS and QU are also unbreakable, so we should add them by using a simple table. Differential Revision: https://phabricator.services.mozilla.com/D190379
This commit is contained in:
parent
546538be6e
commit
dccb39dd6d
@ -23,10 +23,39 @@ using mozilla::intl::LocaleParser;
|
||||
using mozilla::intl::UnicodeProperties;
|
||||
using mozilla::intl::WordBreakRule;
|
||||
|
||||
// There is no break opportunity between any pair of characters whose line
// break class is one of AL (Alphabetic), IS (Infix Numeric Separator), NU
// (Numeric), or QU (Quotation). See
// https://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt for the mapping
// from Unicode code points to line break classes.
//
// This table covers U+0020..U+007F and is indexed by (code point - 0x20). An
// entry is 1 when the character belongs to one of the four classes above and
// 0 otherwise.
static constexpr uint8_t kNonBreakableASCII[] = {
    // clang-format off
    // 0x20-0x2f: SP ! " # $ % & ' ( ) * + , - . /
    0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
    // 0x30-0x3f: 0-9 : ; < = > ?
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
    // 0x40-0x4f: @ A-O
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x50-0x5f: P-Z [ \ ] ^ _
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
    // 0x60-0x6f: ` a-o
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x70-0x7f: p-z { | } ~ DEL
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
    // clang-format on
};

// Lookups subtract 0x20 from a code point in 0x20..0x7f, so the table must
// hold exactly one entry per code point in that range. Catch accidental
// additions or deletions at compile time instead of reading out of bounds.
static_assert(sizeof(kNonBreakableASCII) / sizeof(kNonBreakableASCII[0]) ==
                  0x7f - 0x20 + 1,
              "kNonBreakableASCII must cover exactly U+0020..U+007F");
|
||||
|
||||
template <typename T>
|
||||
static constexpr bool IsNonBreakableChar(T aChar) {
|
||||
if (aChar < 0x20 || aChar > 0x7f) {
|
||||
return false;
|
||||
}
|
||||
return !!kNonBreakableASCII[aChar - 0x20];
|
||||
}
|
||||
|
||||
nsLineBreaker::nsLineBreaker()
|
||||
: mCurrentWordLanguage(nullptr),
|
||||
mCurrentWordContainsMixedLang(false),
|
||||
mCurrentWordContainsComplexChar(false),
|
||||
mScriptIsChineseOrJapanese(false),
|
||||
mAfterBreakableSpace(false),
|
||||
mBreakHere(false),
|
||||
@ -113,7 +142,7 @@ nsresult nsLineBreaker::FlushCurrentWord() {
|
||||
memset(breakState.Elements(),
|
||||
gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
|
||||
length * sizeof(uint8_t));
|
||||
} else if (!mCurrentWordContainsComplexChar) {
|
||||
} else if (!mCurrentWordMightBeBreakable) {
|
||||
// For break-strict set everything internal to "break", otherwise
|
||||
// to "no break"!
|
||||
memset(breakState.Elements(),
|
||||
@ -188,7 +217,7 @@ nsresult nsLineBreaker::FlushCurrentWord() {
|
||||
|
||||
mCurrentWord.Clear();
|
||||
mTextItems.Clear();
|
||||
mCurrentWordContainsComplexChar = false;
|
||||
mCurrentWordMightBeBreakable = false;
|
||||
mCurrentWordContainsMixedLang = false;
|
||||
mCurrentWordLanguage = nullptr;
|
||||
mWordContinuation = false;
|
||||
@ -216,8 +245,9 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
|
||||
while (offset < aLength && !IsSpace(aText[offset])) {
|
||||
mCurrentWord.AppendElement(aText[offset]);
|
||||
if (!mCurrentWordContainsComplexChar && IsComplexChar(aText[offset])) {
|
||||
mCurrentWordContainsComplexChar = true;
|
||||
if (!mCurrentWordMightBeBreakable &&
|
||||
!IsNonBreakableChar<char16_t>(aText[offset])) {
|
||||
mCurrentWordMightBeBreakable = true;
|
||||
}
|
||||
UpdateCurrentWordLanguage(aHyphenationLanguage);
|
||||
++offset;
|
||||
@ -273,7 +303,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
}
|
||||
}
|
||||
uint32_t wordStart = offset;
|
||||
bool wordHasComplexChar = false;
|
||||
bool wordMightBeBreakable = false;
|
||||
|
||||
RefPtr<nsHyphenator> hyphenator;
|
||||
if ((aFlags & BREAK_USE_AUTO_HYPHENATION) &&
|
||||
@ -305,7 +335,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
memset(breakState.Elements() + wordStart,
|
||||
gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
|
||||
offset - wordStart);
|
||||
} else if (wordHasComplexChar) {
|
||||
} else if (wordMightBeBreakable) {
|
||||
// Save current start-of-word state because ComputeBreakPositions()
|
||||
// will set it to false.
|
||||
AutoRestore<uint8_t> saveWordStartBreakState(breakState[wordStart]);
|
||||
@ -323,33 +353,34 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
capitalizationState.Elements() + wordStart);
|
||||
}
|
||||
}
|
||||
wordHasComplexChar = false;
|
||||
wordMightBeBreakable = false;
|
||||
mWordContinuation = false;
|
||||
++offset;
|
||||
if (offset >= aLength) {
|
||||
break;
|
||||
}
|
||||
wordStart = offset;
|
||||
} else {
|
||||
if (!wordHasComplexChar && IsComplexChar(ch)) {
|
||||
wordHasComplexChar = true;
|
||||
}
|
||||
++offset;
|
||||
if (offset >= aLength) {
|
||||
// Save this word
|
||||
mCurrentWordContainsComplexChar = wordHasComplexChar;
|
||||
uint32_t len = offset - wordStart;
|
||||
char16_t* elems = mCurrentWord.AppendElements(len);
|
||||
if (!elems) {
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
memcpy(elems, aText + wordStart, sizeof(char16_t) * len);
|
||||
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
|
||||
// Ensure that the break-before for this word is written out
|
||||
offset = wordStart + 1;
|
||||
UpdateCurrentWordLanguage(aHyphenationLanguage);
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!wordMightBeBreakable && !IsNonBreakableChar<char16_t>(ch)) {
|
||||
wordMightBeBreakable = true;
|
||||
}
|
||||
++offset;
|
||||
if (offset >= aLength) {
|
||||
// Save this word
|
||||
mCurrentWordMightBeBreakable = wordMightBeBreakable;
|
||||
uint32_t len = offset - wordStart;
|
||||
char16_t* elems = mCurrentWord.AppendElements(len, mozilla::fallible);
|
||||
if (!elems) {
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
memcpy(elems, aText + wordStart, sizeof(char16_t) * len);
|
||||
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
|
||||
// Ensure that the break-before for this word is written out
|
||||
offset = wordStart + 1;
|
||||
UpdateCurrentWordLanguage(aHyphenationLanguage);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -403,9 +434,9 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
|
||||
while (offset < aLength && !IsSpace(aText[offset])) {
|
||||
mCurrentWord.AppendElement(aText[offset]);
|
||||
if (!mCurrentWordContainsComplexChar &&
|
||||
IsComplexASCIIChar(aText[offset])) {
|
||||
mCurrentWordContainsComplexChar = true;
|
||||
if (!mCurrentWordMightBeBreakable &&
|
||||
!IsNonBreakableChar<uint8_t>(aText[offset])) {
|
||||
mCurrentWordMightBeBreakable = true;
|
||||
}
|
||||
++offset;
|
||||
}
|
||||
@ -451,7 +482,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
}
|
||||
}
|
||||
uint32_t wordStart = offset;
|
||||
bool wordHasComplexChar = false;
|
||||
bool wordMightBeBreakable = false;
|
||||
|
||||
for (;;) {
|
||||
uint8_t ch = aText[offset];
|
||||
@ -477,7 +508,7 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
memset(breakState.Elements() + wordStart,
|
||||
gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL,
|
||||
offset - wordStart);
|
||||
} else if (wordHasComplexChar) {
|
||||
} else if (wordMightBeBreakable) {
|
||||
// Save current start-of-word state because ComputeBreakPositions()
|
||||
// will set it to false.
|
||||
AutoRestore<uint8_t> saveWordStartBreakState(breakState[wordStart]);
|
||||
@ -487,35 +518,36 @@ nsresult nsLineBreaker::AppendText(nsAtom* aHyphenationLanguage,
|
||||
}
|
||||
}
|
||||
|
||||
wordHasComplexChar = false;
|
||||
wordMightBeBreakable = false;
|
||||
mWordContinuation = false;
|
||||
++offset;
|
||||
if (offset >= aLength) {
|
||||
break;
|
||||
}
|
||||
wordStart = offset;
|
||||
} else {
|
||||
if (!wordHasComplexChar && IsComplexASCIIChar(ch)) {
|
||||
wordHasComplexChar = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!wordMightBeBreakable && !IsNonBreakableChar<uint8_t>(ch)) {
|
||||
wordMightBeBreakable = true;
|
||||
}
|
||||
++offset;
|
||||
if (offset >= aLength) {
|
||||
// Save this word
|
||||
mCurrentWordMightBeBreakable = wordMightBeBreakable;
|
||||
uint32_t len = offset - wordStart;
|
||||
char16_t* elems = mCurrentWord.AppendElements(len, mozilla::fallible);
|
||||
if (!elems) {
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
++offset;
|
||||
if (offset >= aLength) {
|
||||
// Save this word
|
||||
mCurrentWordContainsComplexChar = wordHasComplexChar;
|
||||
uint32_t len = offset - wordStart;
|
||||
char16_t* elems = mCurrentWord.AppendElements(len);
|
||||
if (!elems) {
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
uint32_t i;
|
||||
for (i = wordStart; i < offset; ++i) {
|
||||
elems[i - wordStart] = aText[i];
|
||||
}
|
||||
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
|
||||
// Ensure that the break-before for this word is written out
|
||||
offset = wordStart + 1;
|
||||
break;
|
||||
uint32_t i;
|
||||
for (i = wordStart; i < offset; ++i) {
|
||||
elems[i - wordStart] = aText[i];
|
||||
}
|
||||
mTextItems.AppendElement(TextItem(aSink, wordStart, len, aFlags));
|
||||
// Ensure that the break-before for this word is written out
|
||||
offset = wordStart + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -57,8 +57,7 @@ class nsILineBreakSink {
|
||||
*
|
||||
* The current strategy is that we break the overall text into
|
||||
* whitespace-delimited "words". Then those words are passed to the LineBreaker
|
||||
* service for deeper analysis if they contain a "complex" character as
|
||||
* described below.
|
||||
* for deeper analysis if they might contain breakable characters.
|
||||
*
|
||||
* This class also handles detection of which characters should be capitalized
|
||||
* for text-transform:capitalize. This is a good place to handle that because
|
||||
@ -73,27 +72,12 @@ class nsLineBreaker {
|
||||
return mozilla::intl::NS_IsSpace(u);
|
||||
}
|
||||
|
||||
// Returns false only for ASCII digits (U+0030..U+0039), ASCII uppercase
// letters (U+0041..U+005A), ASCII lowercase letters (U+0061..U+007A) and
// U+000A; every other code unit is reported as "complex".
static inline bool IsComplexASCIIChar(char16_t u) {
  const bool isDigit = u >= 0x0030 && u <= 0x0039;
  const bool isUpper = u >= 0x0041 && u <= 0x005A;
  const bool isLower = u >= 0x0061 && u <= 0x007A;
  const bool isLineFeed = u == 0x000a;
  if (isDigit || isUpper || isLower || isLineFeed) {
    return false;
  }
  return true;
}
|
||||
|
||||
static inline bool IsComplexChar(char16_t u) {
|
||||
return IsComplexASCIIChar(u) ||
|
||||
mozilla::intl::NS_NeedsPlatformNativeHandling(u) ||
|
||||
(0x1100 <= u && u <= 0x11ff) || // Hangul Jamo
|
||||
(0x2000 <= u && u <= 0x21ff) || // Punctuations and Symbols
|
||||
(0x2e80 <= u && u <= 0xd7ff) || // several CJK blocks
|
||||
(0xf900 <= u && u <= 0xfaff) || // CJK Compatibility Idographs
|
||||
(0xff00 <= u && u <= 0xffef); // Halfwidth and Fullwidth Forms
|
||||
}
|
||||
|
||||
// Break opportunities exist at the end of each run of breakable whitespace
|
||||
// (see IsSpace above). Break opportunities can also exist between pairs of
|
||||
// non-whitespace characters, as determined by mozilla::intl::LineBreaker.
|
||||
// We pass a whitespace-
|
||||
// delimited word to LineBreaker if it contains at least one character
|
||||
// matching IsComplexChar.
|
||||
// that has breakable line breaking classes.
|
||||
// We provide flags to control on a per-chunk basis where breaks are allowed.
|
||||
// At any character boundary, exactly one text chunk governs whether a
|
||||
// break is allowed at that boundary.
|
||||
@ -263,7 +247,7 @@ class nsLineBreaker {
|
||||
AutoTArray<TextItem, 2> mTextItems;
|
||||
nsAtom* mCurrentWordLanguage;
|
||||
bool mCurrentWordContainsMixedLang;
|
||||
bool mCurrentWordContainsComplexChar;
|
||||
bool mCurrentWordMightBeBreakable = false;
|
||||
bool mScriptIsChineseOrJapanese;
|
||||
|
||||
// True if the previous character was breakable whitespace
|
||||
|
Loading…
Reference in New Issue
Block a user