mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-12-18 04:44:17 +00:00
653f4b3694
This is the main patch for the bug. It aims to change the grapheme cluster break's `Next()` API by implementing SegmentIteratorUtf16 interface, and adapt the callers. It shouldn't change the behavior. While rewriting the caller, one caveat worth mentioning is the loop termination condition. If the old code relies on `!AtEnd()` as the loop termination condition, and it advances the iterator at the end of the loop, it meant to *skip* its logic when the break position is at the end of the string. For example, see the `mozTXTToHTMLConv::NumberOfMatches`. This patch also hooks grapheme cluster break iterator into Segmenter::TryCreate() interface. Existing test coverage for the file changed: - netwerk/test/unit/test_mozTXTToHTMLConv.js - layout/reftests/forms/input/file/dynamic-max-width.html Differential Revision: https://phabricator.services.mozilla.com/D135643
234 lines
6.8 KiB
C++
234 lines
6.8 KiB
C++
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
|
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
/* Classes to iterate over grapheme, word, sentence, or line. */
|
|
|
|
#include "mozilla/intl/Segmenter.h"
|
|
|
|
#include "mozilla/intl/LineBreaker.h"
|
|
#include "mozilla/intl/WordBreaker.h"
|
|
#include "mozilla/intl/UnicodeProperties.h"
|
|
#include "nsUnicodeProperties.h"
|
|
#include "nsCharTraits.h"
|
|
|
|
using namespace mozilla::unicode;
|
|
|
|
namespace mozilla::intl {
|
|
|
|
SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
|
|
: mText(aText) {}
|
|
|
|
Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
|
|
if (mPos < aPos) {
|
|
mPos = aPos;
|
|
}
|
|
return Next();
|
|
}
|
|
|
|
LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
|
|
const LineBreakOptions& aOptions)
|
|
: SegmentIteratorUtf16(aText), mOptions(aOptions) {}
|
|
|
|
Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
|
|
const int32_t nextPos =
|
|
LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
|
if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
|
|
return Nothing();
|
|
}
|
|
mPos = nextPos;
|
|
return Some(mPos);
|
|
}
|
|
|
|
WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
|
|
: SegmentIteratorUtf16(aText) {}
|
|
|
|
Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
|
const int32_t nextPos =
|
|
WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
|
if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
|
|
return Nothing();
|
|
}
|
|
mPos = nextPos;
|
|
return Some(mPos);
|
|
}
|
|
|
|
GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
|
|
Span<const char16_t> aText)
|
|
: SegmentIteratorUtf16(aText) {}
|
|
|
|
enum HSType {
|
|
HST_NONE = U_HST_NOT_APPLICABLE,
|
|
HST_L = U_HST_LEADING_JAMO,
|
|
HST_V = U_HST_VOWEL_JAMO,
|
|
HST_T = U_HST_TRAILING_JAMO,
|
|
HST_LV = U_HST_LV_SYLLABLE,
|
|
HST_LVT = U_HST_LVT_SYLLABLE
|
|
};
|
|
|
|
static HSType GetHangulSyllableType(uint32_t aCh) {
|
|
return HSType(UnicodeProperties::GetIntPropertyValue(
|
|
aCh, UnicodeProperties::IntProperty::HangulSyllableType));
|
|
}
|
|
|
|
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
|
|
const auto len = mText.Length();
|
|
if (mPos >= len) {
|
|
// The iterator has already reached the end.
|
|
return Nothing();
|
|
}
|
|
|
|
uint32_t ch = mText[mPos++];
|
|
|
|
if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) {
|
|
ch = SURROGATE_TO_UCS4(ch, mText[mPos++]);
|
|
} else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
|
|
(ch >= 0xac00 && ch <= 0xd7ff)) {
|
|
// Handle conjoining Jamo that make Hangul syllables
|
|
HSType hangulState = GetHangulSyllableType(ch);
|
|
while (mPos < len) {
|
|
ch = mText[mPos];
|
|
HSType hangulType = GetHangulSyllableType(ch);
|
|
switch (hangulType) {
|
|
case HST_L:
|
|
case HST_LV:
|
|
case HST_LVT:
|
|
if (hangulState == HST_L) {
|
|
hangulState = hangulType;
|
|
mPos++;
|
|
continue;
|
|
}
|
|
break;
|
|
case HST_V:
|
|
if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
|
|
(hangulState != HST_LVT)) {
|
|
hangulState = hangulType;
|
|
mPos++;
|
|
continue;
|
|
}
|
|
break;
|
|
case HST_T:
|
|
if (hangulState != HST_NONE && hangulState != HST_L) {
|
|
hangulState = hangulType;
|
|
mPos++;
|
|
continue;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
const uint32_t kVS16 = 0xfe0f;
|
|
const uint32_t kZWJ = 0x200d;
|
|
// UTF-16 surrogate values for Fitzpatrick type modifiers
|
|
const uint32_t kFitzpatrickHigh = 0xD83C;
|
|
const uint32_t kFitzpatrickLowFirst = 0xDFFB;
|
|
const uint32_t kFitzpatrickLowLast = 0xDFFF;
|
|
|
|
bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
|
|
(GetEmojiPresentation(ch) == TextDefault &&
|
|
((mPos < len && mText[mPos] == kVS16) ||
|
|
(mPos + 1 < len && mText[mPos] == kFitzpatrickHigh &&
|
|
mText[mPos + 1] >= kFitzpatrickLowFirst &&
|
|
mText[mPos + 1] <= kFitzpatrickLowLast)));
|
|
bool prevWasZwj = false;
|
|
|
|
while (mPos < len) {
|
|
ch = mText[mPos];
|
|
size_t chLen = 1;
|
|
|
|
// Check for surrogate pairs; note that isolated surrogates will just
|
|
// be treated as generic (non-cluster-extending) characters here,
|
|
// which is fine for cluster-iterating purposes
|
|
if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) {
|
|
ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]);
|
|
chLen = 2;
|
|
}
|
|
|
|
bool extendCluster =
|
|
IsClusterExtender(ch) ||
|
|
(baseIsEmoji && prevWasZwj &&
|
|
((GetEmojiPresentation(ch) == EmojiDefault) ||
|
|
(GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len &&
|
|
mText[mPos + chLen] == kVS16)));
|
|
if (!extendCluster) {
|
|
break;
|
|
}
|
|
|
|
prevWasZwj = (ch == kZWJ);
|
|
mPos += chLen;
|
|
}
|
|
|
|
MOZ_ASSERT(mPos <= len, "Next() has overshot the string!");
|
|
return Some(mPos);
|
|
}
|
|
|
|
GraphemeClusterBreakReverseIteratorUtf16::
|
|
GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
|
|
: SegmentIteratorUtf16(aText) {
|
|
mPos = mText.Length();
|
|
}
|
|
|
|
Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
|
|
if (mPos == 0) {
|
|
return Nothing();
|
|
}
|
|
|
|
uint32_t ch;
|
|
do {
|
|
ch = mText[--mPos];
|
|
|
|
if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) {
|
|
ch = SURROGATE_TO_UCS4(mText[--mPos], ch);
|
|
}
|
|
|
|
if (!IsClusterExtender(ch)) {
|
|
break;
|
|
}
|
|
} while (mPos > 0);
|
|
|
|
// XXX May need to handle conjoining Jamo
|
|
|
|
return Some(mPos);
|
|
}
|
|
|
|
Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
|
|
if (mPos > aPos) {
|
|
mPos = aPos;
|
|
}
|
|
return Next();
|
|
}
|
|
|
|
Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
|
|
Span<const char> aLocale, const SegmenterOptions& aOptions) {
|
|
if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
|
|
// Grapheme and Sentence iterator are not yet implemented.
|
|
return Err(ICUError::InternalError);
|
|
}
|
|
return MakeUnique<Segmenter>(aLocale, aOptions);
|
|
}
|
|
|
|
UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
|
|
Span<const char16_t> aText) const {
|
|
switch (mOptions.mGranularity) {
|
|
case SegmenterGranularity::Grapheme:
|
|
return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
|
|
case SegmenterGranularity::Sentence:
|
|
MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
|
|
return nullptr;
|
|
case SegmenterGranularity::Word:
|
|
return MakeUnique<WordBreakIteratorUtf16>(aText);
|
|
case SegmenterGranularity::Line:
|
|
return MakeUnique<LineBreakIteratorUtf16>(aText);
|
|
}
|
|
MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
|
|
return nullptr;
|
|
}
|
|
|
|
} // namespace mozilla::intl
|