mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-23 21:01:08 +00:00
Bug 1719535 - Part 5. Add ICU4X based segmenter modules. r=TYLin,jfkthame
Depends on D167673 Differential Revision: https://phabricator.services.mozilla.com/D167675
This commit is contained in:
parent
696dad9f78
commit
791b803c53
@ -13,6 +13,20 @@
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
#include "mozilla/intl/UnicodeProperties.h"
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
# include "ICU4XDataProvider.h"
|
||||
# include "ICU4XLineBreakIteratorLatin1.hpp"
|
||||
# include "ICU4XLineBreakIteratorUtf16.hpp"
|
||||
# include "ICU4XLineSegmenter.h"
|
||||
# include "mozilla/CheckedInt.h"
|
||||
# include "mozilla/ClearOnShutdown.h"
|
||||
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
|
||||
# include "mozilla/StaticPrefs_intl.h"
|
||||
# include "nsThreadUtils.h"
|
||||
|
||||
# include <mutex>
|
||||
#endif
|
||||
|
||||
using namespace mozilla::unicode;
|
||||
using namespace mozilla::intl;
|
||||
|
||||
@ -978,9 +992,136 @@ static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) {
|
||||
affectedByKeepAll(GetLineBreakClass(aCh));
|
||||
}
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
// Maps a Gecko LineBreakRule onto the corresponding ICU4X line-break
// strictness constant. Auto is mapped to the same value as Strict.
static capi::ICU4XLineBreakStrictness ConvertLineBreakRuleToICU4X(
    LineBreakRule aLevel) {
  switch (aLevel) {
    case LineBreakRule::Auto:
    case LineBreakRule::Strict:
      return capi::ICU4XLineBreakStrictness_Strict;
    case LineBreakRule::Normal:
      return capi::ICU4XLineBreakStrictness_Normal;
    case LineBreakRule::Loose:
      return capi::ICU4XLineBreakStrictness_Loose;
    case LineBreakRule::Anywhere:
      return capi::ICU4XLineBreakStrictness_Anywhere;
  }

  // Only reachable if aLevel holds a value outside the enumerators above.
  MOZ_ASSERT_UNREACHABLE("should have been handled already");
  return capi::ICU4XLineBreakStrictness_Normal;
}
|
||||
|
||||
// Maps a Gecko WordBreakRule onto the corresponding ICU4X word option
// constant used when configuring a line segmenter.
static capi::ICU4XLineBreakWordOption ConvertWordBreakRuleToICU4X(
    WordBreakRule aWordBreak) {
  switch (aWordBreak) {
    case WordBreakRule::KeepAll:
      return capi::ICU4XLineBreakWordOption_KeepAll;
    case WordBreakRule::BreakAll:
      return capi::ICU4XLineBreakWordOption_BreakAll;
    case WordBreakRule::Normal:
      return capi::ICU4XLineBreakWordOption_Normal;
  }

  // Only reachable if aWordBreak holds a value outside the enumerators above.
  MOZ_ASSERT_UNREACHABLE("should have been handled already");
  return capi::ICU4XLineBreakWordOption_Normal;
}
|
||||
|
||||
static capi::ICU4XLineSegmenter* sLineSegmenter = nullptr;
|
||||
|
||||
static capi::ICU4XLineSegmenter* GetDefaultLineSegmenter() {
|
||||
static std::once_flag sOnce;
|
||||
|
||||
std::call_once(sOnce, [] {
|
||||
auto result = capi::ICU4XLineSegmenter_create_auto(GetDataProvider());
|
||||
MOZ_ASSERT(result.is_ok);
|
||||
sLineSegmenter = result.ok;
|
||||
|
||||
if (NS_IsMainThread()) {
|
||||
mozilla::RunOnShutdown([] {
|
||||
if (sLineSegmenter) {
|
||||
capi::ICU4XLineSegmenter_destroy(sLineSegmenter);
|
||||
}
|
||||
sLineSegmenter = nullptr;
|
||||
});
|
||||
return;
|
||||
}
|
||||
NS_DispatchToMainThread(
|
||||
NS_NewRunnableFunction("GetDefaultLineSegmenter", [] {
|
||||
mozilla::RunOnShutdown([] {
|
||||
if (sLineSegmenter) {
|
||||
capi::ICU4XLineSegmenter_destroy(sLineSegmenter);
|
||||
}
|
||||
sLineSegmenter = nullptr;
|
||||
});
|
||||
}));
|
||||
});
|
||||
|
||||
return sLineSegmenter;
|
||||
}
|
||||
|
||||
// Returns true when the shared default line segmenter can serve the request:
// word-break is Normal, line-break is Strict or Auto, and the text is not
// flagged as Chinese/Japanese.
static bool UseDefaultLineSegmenter(WordBreakRule aWordBreak,
                                    LineBreakRule aLevel,
                                    bool aIsChineseOrJapanese) {
  if (aWordBreak != WordBreakRule::Normal) {
    return false;
  }
  if (aLevel != LineBreakRule::Strict && aLevel != LineBreakRule::Auto) {
    return false;
  }
  return !aIsChineseOrJapanese;
}
|
||||
|
||||
// Returns the line segmenter to use for the given break settings.
//
// When aUseDefault is true, the shared segmenter from
// GetDefaultLineSegmenter() is returned. Otherwise a new segmenter configured
// from aWordBreak, aLevel and aIsChineseOrJapanese is created on every call;
// ComputeBreakPositions releases such a segmenter with
// ICU4XLineSegmenter_destroy once it has finished iterating.
static capi::ICU4XLineSegmenter* GetLineSegmenter(bool aUseDefault,
                                                  WordBreakRule aWordBreak,
                                                  LineBreakRule aLevel,
                                                  bool aIsChineseOrJapanese) {
  if (aUseDefault) {
    MOZ_ASSERT(
        UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese));
    return GetDefaultLineSegmenter();
  }

  // Value-initialize so no field is ever passed across the FFI boundary
  // uninitialized.
  capi::ICU4XLineBreakOptionsV1 options{};
  options.word_option = ConvertWordBreakRuleToICU4X(aWordBreak);
  options.strictness = ConvertLineBreakRuleToICU4X(aLevel);
  options.ja_zh = aIsChineseOrJapanese;

  auto result = capi::ICU4XLineSegmenter_create_lstm_with_options_v1(
      GetDataProvider(), options);
  // `result.ok` is only meaningful when `is_ok` is set. Check in release
  // builds too (as Segmenter.cpp does) instead of returning an invalid
  // pointer.
  MOZ_RELEASE_ASSERT(result.is_ok);
  return result.ok;
}
|
||||
#endif
|
||||
|
||||
void LineBreaker::ComputeBreakPositions(
|
||||
const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak,
|
||||
LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) {
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||
memset(aBreakBefore, 0, aLength);
|
||||
|
||||
CheckedInt<int32_t> length = aLength;
|
||||
if (!length.isValid()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const bool useDefault =
|
||||
UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||
capi::ICU4XLineSegmenter* lineSegmenter =
|
||||
GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||
ICU4XLineBreakIteratorUtf16 iterator(capi::ICU4XLineSegmenter_segment_utf16(
|
||||
lineSegmenter, (const uint16_t*)aChars, aLength));
|
||||
|
||||
while (true) {
|
||||
const int32_t nextPos = iterator.next();
|
||||
if (nextPos < 0 || nextPos >= length.value()) {
|
||||
break;
|
||||
}
|
||||
aBreakBefore[nextPos] = 1;
|
||||
}
|
||||
|
||||
if (!useDefault) {
|
||||
capi::ICU4XLineSegmenter_destroy(lineSegmenter);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t cur;
|
||||
int8_t lastClass = CLASS_NONE;
|
||||
ContextState state(aChars, aLength);
|
||||
@ -1110,6 +1251,38 @@ void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength,
|
||||
LineBreakRule aLevel,
|
||||
bool aIsChineseOrJapanese,
|
||||
uint8_t* aBreakBefore) {
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||
memset(aBreakBefore, 0, aLength);
|
||||
|
||||
CheckedInt<int32_t> length = aLength;
|
||||
if (!length.isValid()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const bool useDefault =
|
||||
UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||
capi::ICU4XLineSegmenter* lineSegmenter =
|
||||
GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese);
|
||||
ICU4XLineBreakIteratorLatin1 iterator(
|
||||
capi::ICU4XLineSegmenter_segment_latin1(
|
||||
lineSegmenter, (const uint8_t*)aChars, aLength));
|
||||
|
||||
while (true) {
|
||||
const int32_t nextPos = iterator.next();
|
||||
if (nextPos < 0 || nextPos >= length.value()) {
|
||||
break;
|
||||
}
|
||||
aBreakBefore[nextPos] = 1;
|
||||
}
|
||||
|
||||
if (!useDefault) {
|
||||
capi::ICU4XLineSegmenter_destroy(lineSegmenter);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t cur;
|
||||
int8_t lastClass = CLASS_NONE;
|
||||
ContextState state(aChars, aLength);
|
||||
|
@ -11,9 +11,19 @@
|
||||
#include "mozilla/intl/LineBreaker.h"
|
||||
#include "mozilla/intl/WordBreaker.h"
|
||||
#include "mozilla/intl/UnicodeProperties.h"
|
||||
#include "mozilla/StaticPrefs_intl.h"
|
||||
#include "nsUnicodeProperties.h"
|
||||
#include "nsCharTraits.h"
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
# include "ICU4XDataProvider.h"
|
||||
# include "ICU4XGraphemeClusterSegmenter.h"
|
||||
# include "ICU4XLineSegmenter.h"
|
||||
# include "ICU4XSentenceSegmenter.h"
|
||||
# include "ICU4XWordSegmenter.h"
|
||||
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
|
||||
#endif
|
||||
|
||||
using namespace mozilla::unicode;
|
||||
|
||||
namespace mozilla::intl {
|
||||
@ -30,9 +40,45 @@ Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
|
||||
|
||||
// Builds a line-break iterator over aText.
//
// When ICU4X is compiled in and the intl.icu4x.segmenter.enabled pref is set,
// an ICU4X line segmenter and iterator are created here and owned by this
// object; otherwise both stay null and Next()/Seek() use the legacy path.
//
// NOTE(review): the ICU4X segmenter is created with create_auto() and does
// not consult mOptions - confirm whether that is intended.
LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
                                               const LineBreakOptions& aOptions)
    : SegmentIteratorUtf16(aText), mOptions(aOptions) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
    return;
  }
  auto result =
      capi::ICU4XLineSegmenter_create_auto(mozilla::intl::GetDataProvider());
  MOZ_RELEASE_ASSERT(result.is_ok);
  mSegmenter = result.ok;
  mIterator = capi::ICU4XLineSegmenter_segment_utf16(
      mSegmenter, reinterpret_cast<const uint16_t*>(mText.Elements()),
      mText.Length());
#endif
}
|
||||
|
||||
// Releases the ICU4X iterator and segmenter if the constructor created them.
LineBreakIteratorUtf16::~LineBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  // The iterator was obtained from the segmenter, so it is released first.
  if (mIterator != nullptr) {
    capi::ICU4XLineBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XLineSegmenter_destroy(mSegmenter);
  }
#endif
}
|
||||
|
||||
Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
if (mIterator) {
|
||||
const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator);
|
||||
if (nextPos < 0) {
|
||||
return Nothing();
|
||||
}
|
||||
if (!nextPos) {
|
||||
return Next();
|
||||
}
|
||||
mPos = nextPos;
|
||||
return Some(mPos);
|
||||
}
|
||||
#endif
|
||||
const int32_t nextPos =
|
||||
LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
||||
if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
|
||||
@ -42,10 +88,71 @@ Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
|
||||
return Some(mPos);
|
||||
}
|
||||
|
||||
// Advances the iterator relative to aPos and returns the resulting break
// position, or Nothing() when the ICU4X iterator is exhausted.
//
// With an ICU4X iterator: if the current position is already at or past aPos
// this behaves like Next(); otherwise boundaries are consumed until one at or
// beyond aPos is found. Without an ICU4X iterator the base-class Seek() runs.
Maybe<uint32_t> LineBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    if (aPos <= mPos) {
      return Next();
    }

    // mPos < aPos here, so at least one boundary has to be consumed.
    do {
      const int32_t boundary = capi::ICU4XLineBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A boundary exactly at aPos does not count; step to the one after it.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
|
||||
|
||||
// Builds a word-break iterator over aText.
//
// When ICU4X is compiled in and the intl.icu4x.segmenter.enabled pref is set,
// an ICU4X word segmenter and iterator are created here and owned by this
// object; otherwise both stay null and Next()/Seek() use the legacy path.
WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
    return;
  }
  auto result =
      capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider());
  MOZ_RELEASE_ASSERT(result.is_ok);
  mSegmenter = result.ok;
  mIterator = capi::ICU4XWordSegmenter_segment_utf16(
      mSegmenter, reinterpret_cast<const uint16_t*>(mText.Elements()),
      mText.Length());
#endif
}
|
||||
|
||||
// Releases the ICU4X iterator and segmenter if the constructor created them.
WordBreakIteratorUtf16::~WordBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  // The iterator was obtained from the segmenter, so it is released first.
  if (mIterator != nullptr) {
    capi::ICU4XWordBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XWordSegmenter_destroy(mSegmenter);
  }
#endif
}
|
||||
|
||||
Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
if (mIterator) {
|
||||
const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator);
|
||||
if (nextPos < 0) {
|
||||
return Nothing();
|
||||
}
|
||||
if (!nextPos) {
|
||||
return Next();
|
||||
}
|
||||
mPos = nextPos;
|
||||
return Some(mPos);
|
||||
}
|
||||
#endif
|
||||
const int32_t nextPos =
|
||||
WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
||||
if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
|
||||
@ -55,9 +162,57 @@ Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
||||
return Some(mPos);
|
||||
}
|
||||
|
||||
// Advances the iterator relative to aPos and returns the resulting break
// position, or Nothing() when the ICU4X iterator is exhausted.
//
// With an ICU4X iterator: if the current position is already at or past aPos
// this behaves like Next(); otherwise boundaries are consumed until one at or
// beyond aPos is found. Without an ICU4X iterator the base-class Seek() runs.
Maybe<uint32_t> WordBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    if (aPos <= mPos) {
      return Next();
    }

    // mPos < aPos here, so at least one boundary has to be consumed.
    do {
      const int32_t boundary = capi::ICU4XWordBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A boundary exactly at aPos does not count; step to the one after it.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
|
||||
|
||||
// Builds a grapheme-cluster break iterator over aText.
//
// When ICU4X is compiled in and the intl.icu4x.segmenter.enabled pref is set,
// an ICU4X grapheme cluster segmenter and iterator are created here and owned
// by this object; otherwise both stay null and Next()/Seek() use the legacy
// path.
GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
    Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
    return;
  }
  auto result = capi::ICU4XGraphemeClusterSegmenter_create(
      mozilla::intl::GetDataProvider());
  MOZ_RELEASE_ASSERT(result.is_ok);
  mSegmenter = result.ok;
  mIterator = capi::ICU4XGraphemeClusterSegmenter_segment_utf16(
      mSegmenter, reinterpret_cast<const uint16_t*>(mText.Elements()),
      mText.Length());
#endif
}
|
||||
|
||||
// Releases the ICU4X iterator and segmenter if the constructor created them.
GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  // The iterator was obtained from the segmenter, so it is released first.
  if (mIterator != nullptr) {
    capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XGraphemeClusterSegmenter_destroy(mSegmenter);
  }
#endif
}
|
||||
|
||||
enum HSType {
|
||||
HST_NONE = U_HST_NOT_APPLICABLE,
|
||||
@ -75,6 +230,20 @@ static HSType GetHangulSyllableType(uint32_t aCh) {
|
||||
|
||||
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
|
||||
const auto len = mText.Length();
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
if (mIterator) {
|
||||
const int32_t nextPos =
|
||||
capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator);
|
||||
if (nextPos < 0) {
|
||||
return Nothing();
|
||||
}
|
||||
if (!nextPos) {
|
||||
return Next();
|
||||
}
|
||||
mPos = nextPos;
|
||||
return Some(mPos);
|
||||
}
|
||||
#endif
|
||||
if (mPos >= len) {
|
||||
// The iterator has already reached the end.
|
||||
return Nothing();
|
||||
@ -195,6 +364,32 @@ Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
|
||||
return Some(mPos);
|
||||
}
|
||||
|
||||
// Advances the iterator relative to aPos and returns the resulting break
// position, or Nothing() when the ICU4X iterator is exhausted.
//
// With an ICU4X iterator: if the current position is already at or past aPos
// this behaves like Next(); otherwise boundaries are consumed until one at or
// beyond aPos is found. Without an ICU4X iterator the base-class Seek() runs.
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    if (aPos <= mPos) {
      return Next();
    }

    // mPos < aPos here, so at least one boundary has to be consumed.
    do {
      const int32_t boundary =
          capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A boundary exactly at aPos does not count; step to the one after it.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
|
||||
|
||||
GraphemeClusterBreakReverseIteratorUtf16::
|
||||
GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
|
||||
: SegmentIteratorUtf16(aText) {
|
||||
@ -231,12 +426,77 @@ Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
|
||||
return Next();
|
||||
}
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
// Builds a sentence-break iterator over aText, creating the ICU4X sentence
// segmenter and its UTF-16 iterator. Both are owned by this object and
// released in the destructor. Segmenter creation failure is fatal.
SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16(
    Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
  auto created =
      capi::ICU4XSentenceSegmenter_create(mozilla::intl::GetDataProvider());
  MOZ_RELEASE_ASSERT(created.is_ok);
  mSegmenter = created.ok;

  const auto* utf16 = (const uint16_t*)mText.Elements();
  mIterator = capi::ICU4XSentenceSegmenter_segment_utf16(mSegmenter, utf16,
                                                         mText.Length());
}
|
||||
|
||||
// Releases the ICU4X iterator and segmenter created by the constructor.
SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() {
  // The iterator was obtained from the segmenter, so it is released first.
  if (mIterator != nullptr) {
    capi::ICU4XSentenceBreakIteratorUtf16_destroy(mIterator);
  }
  if (mSegmenter != nullptr) {
    capi::ICU4XSentenceSegmenter_destroy(mSegmenter);
  }
}
|
||||
|
||||
// Advances the iterator relative to aPos and returns the resulting break
// position, or Nothing() when there is no iterator or it is exhausted.
//
// If the current position is already at or past aPos this behaves like
// Next(); otherwise boundaries are consumed until one at or beyond aPos is
// found.
Maybe<uint32_t> SentenceBreakIteratorUtf16::Seek(uint32_t aPos) {
  if (!mIterator) {
    return Nothing();
  }
  if (aPos <= mPos) {
    return Next();
  }

  // mPos < aPos here, so at least one boundary has to be consumed.
  do {
    const int32_t boundary =
        capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator);
    if (boundary < 0) {
      return Nothing();
    }
    mPos = static_cast<uint32_t>(boundary);
  } while (mPos < aPos);

  // A boundary exactly at aPos does not count; step to the one after it.
  if (mPos == aPos) {
    return Next();
  }
  return Some(mPos);
}
|
||||
|
||||
// Returns the next sentence break position, or Nothing() when there is no
// iterator or ICU4X reports a negative position. A boundary at offset 0 is
// skipped, so the first value returned is always greater than zero.
Maybe<uint32_t> SentenceBreakIteratorUtf16::Next() {
  if (!mIterator) {
    return Nothing();
  }

  int32_t boundary = 0;
  do {
    boundary = capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator);
    if (boundary < 0) {
      return Nothing();
    }
  } while (boundary == 0);

  mPos = boundary;
  return Some(mPos);
}
|
||||
#endif
|
||||
|
||||
// Creates a Segmenter for aLocale with the given options.
//
// Returns ICUError::InternalError for SegmenterGranularity::Sentence when no
// sentence iterator is available: either ICU4X is not compiled in, or the
// intl.icu4x.segmenter.enabled pref is off. Segment() can only build a
// sentence iterator when both hold, so rejecting the request here keeps
// callers from receiving a Segmenter whose Segment() would return nullptr.
Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
    Span<const char> aLocale, const SegmenterOptions& aOptions) {
  if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
    const bool sentenceSupported = StaticPrefs::intl_icu4x_segmenter_enabled();
#else
    // The sentence iterator is only implemented on top of ICU4X.
    const bool sentenceSupported = false;
#endif
    if (!sentenceSupported) {
      return Err(ICUError::InternalError);
    }
  }
  return MakeUnique<Segmenter>(aLocale, aOptions);
}
|
||||
|
||||
@ -246,6 +506,11 @@ UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
|
||||
case SegmenterGranularity::Grapheme:
|
||||
return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
|
||||
case SegmenterGranularity::Sentence:
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||
return MakeUnique<SentenceBreakIteratorUtf16>(aText);
|
||||
}
|
||||
#endif
|
||||
MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
|
||||
return nullptr;
|
||||
case SegmenterGranularity::Word:
|
||||
|
@ -15,6 +15,19 @@
|
||||
#include "mozilla/Span.h"
|
||||
#include "mozilla/UniquePtr.h"
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
// Forward declarations of the opaque ICU4X C API types referenced by the
// iterator classes in this header, so the ICU4X headers need not be included
// here.
namespace capi {
// Segmenters.
struct ICU4XLineSegmenter;
struct ICU4XWordSegmenter;
struct ICU4XGraphemeClusterSegmenter;
struct ICU4XSentenceSegmenter;
// UTF-16 break iterators produced by the segmenters above.
struct ICU4XLineBreakIteratorUtf16;
struct ICU4XWordBreakIteratorUtf16;
struct ICU4XGraphemeClusterBreakIteratorUtf16;
struct ICU4XSentenceBreakIteratorUtf16;
}  // namespace capi
|
||||
#endif
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
enum class SegmenterGranularity : uint8_t {
|
||||
@ -104,11 +117,18 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
public:
|
||||
explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
|
||||
const LineBreakOptions& aOptions = {});
|
||||
~LineBreakIteratorUtf16() override;
|
||||
|
||||
Maybe<uint32_t> Next() override;
|
||||
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||
|
||||
private:
|
||||
LineBreakOptions mOptions;
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  // ICU4X state. Guarded by the same condition as the capi forward
  // declarations and the code that uses these members. Both stay null unless
  // the constructor creates them, and the destructor releases them.
  capi::ICU4XLineSegmenter* mSegmenter = nullptr;
  capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr;
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
@ -117,8 +137,16 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
public:
|
||||
explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
|
||||
~WordBreakIteratorUtf16() override;
|
||||
|
||||
Maybe<uint32_t> Next() override;
|
||||
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
private:
|
||||
capi::ICU4XWordSegmenter* mSegmenter = nullptr;
|
||||
capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr;
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
@ -127,8 +155,16 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
public:
|
||||
explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
|
||||
~GraphemeClusterBreakIteratorUtf16() override;
|
||||
|
||||
Maybe<uint32_t> Next() override;
|
||||
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
private:
|
||||
capi::ICU4XGraphemeClusterSegmenter* mSegmenter = nullptr;
|
||||
capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr;
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
@ -146,6 +182,24 @@ class GraphemeClusterBreakReverseIteratorUtf16 final
|
||||
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||
};
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
/**
|
||||
* Sentence break iterator for UTF-16 text.
|
||||
*/
|
||||
class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
public:
|
||||
explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
|
||||
~SentenceBreakIteratorUtf16() override;
|
||||
|
||||
Maybe<uint32_t> Next() override;
|
||||
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
||||
|
||||
private:
|
||||
capi::ICU4XSentenceSegmenter* mSegmenter = nullptr;
|
||||
capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr;
|
||||
};
|
||||
#endif
|
||||
|
||||
/**
|
||||
* This component is a Mozilla-focused API for working with segmenters in
|
||||
* internationalization code.
|
||||
|
@ -10,6 +10,14 @@
|
||||
#include "nsTArray.h"
|
||||
#include "nsUnicodeProperties.h"
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
# include "ICU4XDataProvider.h"
|
||||
# include "ICU4XWordBreakIteratorUtf16.hpp"
|
||||
# include "ICU4XWordSegmenter.hpp"
|
||||
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
|
||||
# include "mozilla/StaticPrefs_intl.h"
|
||||
#endif
|
||||
|
||||
using mozilla::intl::Script;
|
||||
using mozilla::intl::UnicodeProperties;
|
||||
using mozilla::intl::WordBreaker;
|
||||
@ -102,9 +110,34 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aLen,
|
||||
return {aLen, aLen};
|
||||
}
|
||||
|
||||
WordBreakClass c = GetClass(aText[aPos]);
|
||||
WordRange range{0, aLen};
|
||||
|
||||
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
|
||||
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
|
||||
auto result =
|
||||
capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider());
|
||||
MOZ_ASSERT(result.is_ok);
|
||||
ICU4XWordSegmenter segmenter(result.ok);
|
||||
ICU4XWordBreakIteratorUtf16 iterator =
|
||||
segmenter.segment_utf16(diplomat::span((const uint16_t*)aText, aLen));
|
||||
|
||||
uint32_t previousPos = 0;
|
||||
while (true) {
|
||||
const int32_t nextPos = iterator.next();
|
||||
if (nextPos < 0) {
|
||||
return {previousPos, aLen};
|
||||
}
|
||||
if ((uint32_t)nextPos > aPos) {
|
||||
return {previousPos, (uint32_t)nextPos};
|
||||
}
|
||||
|
||||
previousPos = nextPos;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
WordBreakClass c = GetClass(aText[aPos]);
|
||||
|
||||
// Scan forward
|
||||
for (uint32_t i = aPos + 1; i <= aLen; i++) {
|
||||
if (c != GetClass(aText[i])) {
|
||||
|
@ -7,11 +7,15 @@
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
#include "mozilla/Preferences.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
|
||||
TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld)
|
||||
{
|
||||
nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false);
|
||||
EXPECT_TRUE(rv == NS_OK);
|
||||
|
||||
const SegmenterOptions options{SegmenterGranularity::Line};
|
||||
auto result = Segmenter::TryCreate("en", options);
|
||||
ASSERT_TRUE(result.isOk());
|
||||
@ -30,7 +34,50 @@ TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
|
||||
ASSERT_EQ(segIter->Seek(0u), Nothing());
|
||||
}
|
||||
|
||||
TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
|
||||
TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek)
{
  // Exercise the ICU4X-backed line break iterator.
  nsresult status = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
  EXPECT_TRUE(status == NS_OK);

  const SegmenterOptions lineOptions{SegmenterGranularity::Line};
  auto created = Segmenter::TryCreate("en", lineOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  // Seeking to the space between "hello" and "world" yields the break that
  // the UAX#14 rules place just before "w".
  ASSERT_EQ(iter->Seek(5u), Some(6u));

  // The remaining break is at the end of the text.
  ASSERT_EQ(iter->Next(), Some(11u));

  // The iterator is now exhausted.
  ASSERT_EQ(iter->Next(), Nothing());

  // Seeking backwards behaves the same as calling Next().
  ASSERT_EQ(iter->Seek(0u), Nothing());
}
|
||||
|
||||
TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple)
{
  const SegmenterOptions wordOptions{SegmenterGranularity::Word};
  auto created = Segmenter::TryCreate("en", wordOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  // Breaks after "hello", after the space, and at the end of the text.
  ASSERT_EQ(iter->Next(), Some(5u));
  ASSERT_EQ(iter->Next(), Some(6u));
  ASSERT_EQ(iter->Next(), Some(11u));

  // The iterator is now exhausted.
  ASSERT_EQ(iter->Next(), Nothing());
}
|
||||
|
||||
TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek)
|
||||
{
|
||||
const SegmenterOptions options{SegmenterGranularity::Word};
|
||||
auto result = Segmenter::TryCreate("en", options);
|
||||
@ -51,7 +98,32 @@ TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
|
||||
ASSERT_EQ(segIter->Seek(0u), Nothing());
|
||||
}
|
||||
|
||||
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16)
|
||||
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple)
{
  SegmenterOptions graphemeOptions{SegmenterGranularity::Grapheme};
  auto created = Segmenter::TryCreate("en", graphemeOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  // Every character of the 11-character text is its own grapheme cluster, so
  // the breaks are at offsets 1 through 11.
  for (uint32_t expected = 1u; expected <= 11u; ++expected) {
    ASSERT_EQ(iter->Next(), Some(expected));
  }

  // The iterator is now exhausted.
  ASSERT_EQ(iter->Next(), Nothing());
}
|
||||
|
||||
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek)
|
||||
{
|
||||
SegmenterOptions options{SegmenterGranularity::Grapheme};
|
||||
auto result = Segmenter::TryCreate("en", options);
|
||||
@ -97,9 +169,41 @@ TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16)
|
||||
|
||||
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16)
{
  // Sentence segmentation is only available through the ICU4X segmenter.
  nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
  EXPECT_TRUE(rv == NS_OK);

  SegmenterOptions options{SegmenterGranularity::Sentence};
  auto result = Segmenter::TryCreate("en", options);
  // Creation is expected to succeed; the earlier isErr() expectation
  // contradicted this assertion and has been dropped.
  ASSERT_TRUE(result.isOk());
  auto sentenceSegmenter = result.unwrap();

  const char16_t text[] = u"Hello world. Hello world.";
  UniquePtr<SegmentIteratorUtf16> segIter =
      sentenceSegmenter->Segment(MakeStringSpan(text));

  // One break after the first sentence (including its trailing space) and one
  // at the end of the text.
  ASSERT_EQ(segIter->Next(), Some(13u));
  ASSERT_EQ(segIter->Next(), Some(25u));
  ASSERT_EQ(segIter->Next(), Nothing());

  // Same as calling Next().
  ASSERT_EQ(segIter->Seek(0u), Nothing());
}
|
||||
|
||||
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek)
{
  // Sentence segmentation is only available through the ICU4X segmenter.
  nsresult status = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
  EXPECT_TRUE(status == NS_OK);

  SegmenterOptions sentenceOptions{SegmenterGranularity::Sentence};
  auto created = Segmenter::TryCreate("en", sentenceOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"Hello world. Hello world.";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  // Seeking into the middle of the first sentence lands on the break that
  // follows it.
  ASSERT_EQ(iter->Seek(5u), Some(13u));
}
|
||||
|
||||
} // namespace mozilla::intl
|
||||
|
@ -42,4 +42,16 @@ else:
|
||||
"rulebrk.c",
|
||||
]
|
||||
|
||||
# The ICU4X C/C++ bindings are only used when both the JS Intl API and ICU4X
# are enabled in this build.
if CONFIG["JS_HAS_INTL_API"] and CONFIG["MOZ_ICU4X"]:
    LOCAL_INCLUDES += ["/third_party/rust/icu_capi/cpp/include"]

    # Disable warnings when including C++ headers of ICU4X.
    # - https://github.com/rust-diplomat/diplomat/issues/277
    # - https://github.com/rust-diplomat/diplomat/issues/335
    CXXFLAGS += ["-Wno-mismatched-tags", "-Wno-pessimizing-move"]
|
||||
|
||||
FINAL_LIBRARY = "xul"
|
||||
|
@ -7292,6 +7292,12 @@
|
||||
mirror: always
|
||||
#endif
|
||||
|
||||
# If true, text segmentation uses ICU4X's UAX #14 / UAX #29 compatible rules.
|
||||
- name: intl.icu4x.segmenter.enabled
|
||||
type: RelaxedAtomicBool
|
||||
value: false
|
||||
mirror: always
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Prefs starting with "javascript."
|
||||
#
|
||||
|
Loading…
Reference in New Issue
Block a user