From 791b803c53b5613ae9d61065eb8eeaacd44f82c8 Mon Sep 17 00:00:00 2001 From: Makoto Kato Date: Wed, 2 Aug 2023 10:32:50 +0000 Subject: [PATCH] Bug 1719535 - Part 5. Add ICU4X based segmenter modules. r=TYLin,jfkthame Depends on D167673 Differential Revision: https://phabricator.services.mozilla.com/D167675 --- intl/lwbrk/LineBreaker.cpp | 173 +++++++++++++++ intl/lwbrk/Segmenter.cpp | 271 ++++++++++++++++++++++- intl/lwbrk/Segmenter.h | 54 +++++ intl/lwbrk/WordBreaker.cpp | 35 ++- intl/lwbrk/gtest/TestSegmenter.cpp | 112 +++++++++- intl/lwbrk/moz.build | 12 + modules/libpref/init/StaticPrefList.yaml | 6 + 7 files changed, 655 insertions(+), 8 deletions(-) diff --git a/intl/lwbrk/LineBreaker.cpp b/intl/lwbrk/LineBreaker.cpp index 2784b0d3023b..440f219ab5fb 100644 --- a/intl/lwbrk/LineBreaker.cpp +++ b/intl/lwbrk/LineBreaker.cpp @@ -13,6 +13,20 @@ #include "mozilla/intl/Segmenter.h" #include "mozilla/intl/UnicodeProperties.h" +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +# include "ICU4XDataProvider.h" +# include "ICU4XLineBreakIteratorLatin1.hpp" +# include "ICU4XLineBreakIteratorUtf16.hpp" +# include "ICU4XLineSegmenter.h" +# include "mozilla/CheckedInt.h" +# include "mozilla/ClearOnShutdown.h" +# include "mozilla/intl/ICU4XGeckoDataProvider.h" +# include "mozilla/StaticPrefs_intl.h" +# include "nsThreadUtils.h" + +# include +#endif + using namespace mozilla::unicode; using namespace mozilla::intl; @@ -978,9 +992,136 @@ static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) { affectedByKeepAll(GetLineBreakClass(aCh)); } +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +static capi::ICU4XLineBreakStrictness ConvertLineBreakRuleToICU4X( + LineBreakRule aLevel) { + switch (aLevel) { + case LineBreakRule::Auto: + return capi::ICU4XLineBreakStrictness_Strict; + case LineBreakRule::Strict: + return capi::ICU4XLineBreakStrictness_Strict; + case LineBreakRule::Loose: + return capi::ICU4XLineBreakStrictness_Loose; + case LineBreakRule::Normal: + return capi::ICU4XLineBreakStrictness_Normal; + case LineBreakRule::Anywhere: + return capi::ICU4XLineBreakStrictness_Anywhere; + } + MOZ_ASSERT_UNREACHABLE("should have been handled already"); + return capi::ICU4XLineBreakStrictness_Normal; +} + +static capi::ICU4XLineBreakWordOption ConvertWordBreakRuleToICU4X( + WordBreakRule aWordBreak) { + switch (aWordBreak) { + case WordBreakRule::Normal: + return capi::ICU4XLineBreakWordOption_Normal; + case WordBreakRule::BreakAll: + return capi::ICU4XLineBreakWordOption_BreakAll; + case WordBreakRule::KeepAll: + return capi::ICU4XLineBreakWordOption_KeepAll; + } + MOZ_ASSERT_UNREACHABLE("should have been handled already"); + return capi::ICU4XLineBreakWordOption_Normal; +} + +static capi::ICU4XLineSegmenter* sLineSegmenter = nullptr; + +static capi::ICU4XLineSegmenter* GetDefaultLineSegmenter() { + static std::once_flag sOnce; + + std::call_once(sOnce, [] { + auto result = capi::ICU4XLineSegmenter_create_auto(GetDataProvider()); + MOZ_ASSERT(result.is_ok); + sLineSegmenter = result.ok; + + if (NS_IsMainThread()) { + mozilla::RunOnShutdown([] { + if (sLineSegmenter) { + capi::ICU4XLineSegmenter_destroy(sLineSegmenter); + } + sLineSegmenter = nullptr; + }); + return; + } + NS_DispatchToMainThread( + NS_NewRunnableFunction("GetDefaultLineSegmenter", [] { + mozilla::RunOnShutdown([] { + if (sLineSegmenter) { + capi::ICU4XLineSegmenter_destroy(sLineSegmenter); + } + sLineSegmenter = nullptr; + }); + })); + }); + + return sLineSegmenter; +} + +static bool UseDefaultLineSegmenter(WordBreakRule aWordBreak, + LineBreakRule aLevel, + bool aIsChineseOrJapanese) { + return aWordBreak == WordBreakRule::Normal && + (aLevel == LineBreakRule::Strict || aLevel == LineBreakRule::Auto) && + !aIsChineseOrJapanese; +} + +static capi::ICU4XLineSegmenter* GetLineSegmenter(bool aUseDefault, + WordBreakRule aWordBreak, + LineBreakRule aLevel, + bool aIsChineseOrJapanese) { + if (aUseDefault) { + MOZ_ASSERT( + UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese)); + return GetDefaultLineSegmenter(); + } + + capi::ICU4XLineBreakOptionsV1 options; + options.word_option = ConvertWordBreakRuleToICU4X(aWordBreak); + options.strictness = ConvertLineBreakRuleToICU4X(aLevel); + options.ja_zh = aIsChineseOrJapanese; + + auto result = capi::ICU4XLineSegmenter_create_lstm_with_options_v1( + GetDataProvider(), options); + MOZ_ASSERT(result.is_ok); + return result.ok; +} +#endif + void LineBreaker::ComputeBreakPositions( const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak, LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + memset(aBreakBefore, 0, aLength); + + CheckedInt length = aLength; + if (!length.isValid()) { + return; + } + + const bool useDefault = + UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); + capi::ICU4XLineSegmenter* lineSegmenter = + GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese); + ICU4XLineBreakIteratorUtf16 iterator(capi::ICU4XLineSegmenter_segment_utf16( + lineSegmenter, (const uint16_t*)aChars, aLength)); + + while (true) { + const int32_t nextPos = iterator.next(); + if (nextPos < 0 || nextPos >= length.value()) { + break; + } + aBreakBefore[nextPos] = 1; + } + + if (!useDefault) { + capi::ICU4XLineSegmenter_destroy(lineSegmenter); + } + return; + } +#endif + uint32_t cur; int8_t lastClass = CLASS_NONE; ContextState state(aChars, aLength); @@ -1110,6 +1251,38 @@ void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength, LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + memset(aBreakBefore, 0, aLength); + + CheckedInt length = aLength; + if (!length.isValid()) { + return; + } + + const bool useDefault = + UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); + capi::ICU4XLineSegmenter* lineSegmenter = + GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese); + ICU4XLineBreakIteratorLatin1 iterator( + capi::ICU4XLineSegmenter_segment_latin1( + lineSegmenter, (const uint8_t*)aChars, aLength)); + + while (true) { + const int32_t nextPos = iterator.next(); + if (nextPos < 0 || nextPos >= length.value()) { + break; + } + aBreakBefore[nextPos] = 1; + } + + if (!useDefault) { + capi::ICU4XLineSegmenter_destroy(lineSegmenter); + } + return; + } +#endif + uint32_t cur; int8_t lastClass = CLASS_NONE; ContextState state(aChars, aLength); diff --git a/intl/lwbrk/Segmenter.cpp b/intl/lwbrk/Segmenter.cpp index 53d87336a376..5869669eeb21 100644 --- a/intl/lwbrk/Segmenter.cpp +++ b/intl/lwbrk/Segmenter.cpp @@ -11,9 +11,19 @@ #include "mozilla/intl/LineBreaker.h" #include "mozilla/intl/WordBreaker.h" #include "mozilla/intl/UnicodeProperties.h" +#include "mozilla/StaticPrefs_intl.h" #include "nsUnicodeProperties.h" #include "nsCharTraits.h" +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +# include "ICU4XDataProvider.h" +# include "ICU4XGraphemeClusterSegmenter.h" +# include "ICU4XLineSegmenter.h" +# include "ICU4XSentenceSegmenter.h" +# include "ICU4XWordSegmenter.h" +# include "mozilla/intl/ICU4XGeckoDataProvider.h" +#endif + using namespace mozilla::unicode; namespace mozilla::intl { @@ -30,9 +40,45 @@ Maybe SegmentIteratorUtf16::Seek(uint32_t aPos) { LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span aText, const LineBreakOptions& aOptions) - : SegmentIteratorUtf16(aText), mOptions(aOptions) {} + : SegmentIteratorUtf16(aText), mOptions(aOptions) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + auto result = + capi::ICU4XLineSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XLineSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +LineBreakIteratorUtf16::~LineBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XLineBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XLineSegmenter_destroy(mSegmenter); + } +#endif +} Maybe LineBreakIteratorUtf16::Next() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif const int32_t nextPos = LineBreaker::Next(mText.Elements(), mText.Length(), mPos); if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) { @@ -42,10 +88,71 @@ Maybe LineBreakIteratorUtf16::Next() { return Some(mPos); } +Maybe LineBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span aText) - : SegmentIteratorUtf16(aText) {} + : SegmentIteratorUtf16(aText) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + auto result = + capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XWordSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +WordBreakIteratorUtf16::~WordBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XWordBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XWordSegmenter_destroy(mSegmenter); + } +#endif +} Maybe WordBreakIteratorUtf16::Next() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif const int32_t nextPos = WordBreaker::Next(mText.Elements(), mText.Length(), mPos); if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) { @@ -55,9 +162,57 @@ Maybe WordBreakIteratorUtf16::Next() { return Some(mPos); } +Maybe WordBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16( Span aText) - : SegmentIteratorUtf16(aText) {} + : SegmentIteratorUtf16(aText) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + auto result = capi::ICU4XGraphemeClusterSegmenter_create( + mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XGraphemeClusterSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XGraphemeClusterSegmenter_destroy(mSegmenter); + } +#endif +} enum HSType { HST_NONE = U_HST_NOT_APPLICABLE, @@ -75,6 +230,20 @@ static HSType GetHangulSyllableType(uint32_t aCh) { Maybe GraphemeClusterBreakIteratorUtf16::Next() { const auto len = mText.Length(); +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = + capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif if (mPos >= len) { // The iterator has already reached the end. return Nothing(); @@ -195,6 +364,32 @@ Maybe GraphemeClusterBreakIteratorUtf16::Next() { return Some(mPos); } +Maybe GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = + capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + GraphemeClusterBreakReverseIteratorUtf16:: GraphemeClusterBreakReverseIteratorUtf16(Span aText) : SegmentIteratorUtf16(aText) { @@ -231,12 +426,77 @@ Maybe GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) { return Next(); } +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16( + Span aText) + : SegmentIteratorUtf16(aText) { + auto result = + capi::ICU4XSentenceSegmenter_create(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XSentenceSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +} + +SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() { + if (mIterator) { + capi::ICU4XSentenceBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XSentenceSegmenter_destroy(mSegmenter); + } +} + +Maybe SentenceBreakIteratorUtf16::Seek(uint32_t aPos) { + if (!mIterator) { + return Nothing(); + } + + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = + capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); +} + +Maybe SentenceBreakIteratorUtf16::Next() { + if (!mIterator) { + return Nothing(); + } + + const int32_t nextPos = capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); +} +#endif + Result, ICUError> Segmenter::TryCreate( Span aLocale, const SegmenterOptions& aOptions) { +#if !defined(MOZ_ICU4X) || !defined(JS_HAS_INTL_API) if (aOptions.mGranularity == SegmenterGranularity::Sentence) { // Grapheme and Sentence iterator are not yet implemented. return Err(ICUError::InternalError); } +#endif return MakeUnique(aLocale, aOptions); } @@ -246,6 +506,11 @@ UniquePtr Segmenter::Segment( case SegmenterGranularity::Grapheme: return MakeUnique(aText); case SegmenterGranularity::Sentence: +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + return MakeUnique(aText); + } +#endif MOZ_ASSERT_UNREACHABLE("Unimplemented yet!"); return nullptr; case SegmenterGranularity::Word: diff --git a/intl/lwbrk/Segmenter.h b/intl/lwbrk/Segmenter.h index 647adb6faba4..26ea83acf02f 100644 --- a/intl/lwbrk/Segmenter.h +++ b/intl/lwbrk/Segmenter.h @@ -15,6 +15,19 @@ #include "mozilla/Span.h" #include "mozilla/UniquePtr.h" +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +namespace capi { +struct ICU4XLineSegmenter; +struct ICU4XLineBreakIteratorUtf16; +struct ICU4XWordSegmenter; +struct ICU4XWordBreakIteratorUtf16; +struct ICU4XGraphemeClusterSegmenter; +struct ICU4XGraphemeClusterBreakIteratorUtf16; +struct ICU4XSentenceSegmenter; +struct ICU4XSentenceBreakIteratorUtf16; +} // namespace capi +#endif + namespace mozilla::intl { enum class SegmenterGranularity : uint8_t { @@ -104,11 +117,18 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit LineBreakIteratorUtf16(Span aText, const LineBreakOptions& aOptions = {}); + ~LineBreakIteratorUtf16() override; Maybe Next() override; + Maybe Seek(uint32_t aPos) override; private: LineBreakOptions mOptions; + +#ifdef MOZ_ICU4X + capi::ICU4XLineSegmenter* mSegmenter = nullptr; + capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr; +#endif }; /** @@ -117,8 +137,16 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit WordBreakIteratorUtf16(Span aText); + ~WordBreakIteratorUtf16() override; Maybe Next() override; + Maybe Seek(uint32_t aPos) override; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + private: + capi::ICU4XWordSegmenter* mSegmenter = nullptr; + capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr; +#endif }; /** @@ -127,8 +155,16 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { public: explicit GraphemeClusterBreakIteratorUtf16(Span aText); + ~GraphemeClusterBreakIteratorUtf16() override; Maybe Next() override; + Maybe Seek(uint32_t aPos) override; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + private: + capi::ICU4XGraphemeClusterSegmenter* mSegmenter = nullptr; + capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr; +#endif }; /** @@ -146,6 +182,24 @@ class GraphemeClusterBreakReverseIteratorUtf16 final Maybe Seek(uint32_t aPos) override; }; +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +/** + * Sentence break iterator for UTF-16 text. + */ +class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit SentenceBreakIteratorUtf16(Span aText); + ~SentenceBreakIteratorUtf16() override; + + Maybe Next() override; + Maybe Seek(uint32_t aPos) override; + + private: + capi::ICU4XSentenceSegmenter* mSegmenter = nullptr; + capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr; +}; +#endif + /** * This component is a Mozilla-focused API for working with segmenters in * internationalization code. diff --git a/intl/lwbrk/WordBreaker.cpp b/intl/lwbrk/WordBreaker.cpp index f7fc10a7a1c7..b5a304d05964 100644 --- a/intl/lwbrk/WordBreaker.cpp +++ b/intl/lwbrk/WordBreaker.cpp @@ -10,6 +10,14 @@ #include "nsTArray.h" #include "nsUnicodeProperties.h" +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +# include "ICU4XDataProvider.h" +# include "ICU4XWordBreakIteratorUtf16.hpp" +# include "ICU4XWordSegmenter.hpp" +# include "mozilla/intl/ICU4XGeckoDataProvider.h" +# include "mozilla/StaticPrefs_intl.h" +#endif + using mozilla::intl::Script; using mozilla::intl::UnicodeProperties; using mozilla::intl::WordBreaker; @@ -102,9 +110,34 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aLen, return {aLen, aLen}; } - WordBreakClass c = GetClass(aText[aPos]); WordRange range{0, aLen}; +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + auto result = + capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_ASSERT(result.is_ok); + ICU4XWordSegmenter segmenter(result.ok); + ICU4XWordBreakIteratorUtf16 iterator = + segmenter.segment_utf16(diplomat::span((const uint16_t*)aText, aLen)); + + uint32_t previousPos = 0; + while (true) { + const int32_t nextPos = iterator.next(); + if (nextPos < 0) { + return {previousPos, aLen}; + } + if ((uint32_t)nextPos > aPos) { + return {previousPos, (uint32_t)nextPos}; + } + + previousPos = nextPos; + } + } +#endif + + WordBreakClass c = GetClass(aText[aPos]); + // Scan forward for (uint32_t i = aPos + 1; i <= aLen; i++) { if (c != GetClass(aText[i])) { diff --git a/intl/lwbrk/gtest/TestSegmenter.cpp b/intl/lwbrk/gtest/TestSegmenter.cpp index 21c44a078f92..42d04b8e0306 100644 --- a/intl/lwbrk/gtest/TestSegmenter.cpp +++ b/intl/lwbrk/gtest/TestSegmenter.cpp @@ -7,11 +7,15 @@ #include "gtest/gtest.h" #include "mozilla/intl/Segmenter.h" +#include "mozilla/Preferences.h" namespace mozilla::intl { -TEST(IntlSegmenter, TestLineBreakIteratorUtf16) +TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld) { + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false); + EXPECT_TRUE(rv == NS_OK); + const SegmenterOptions options{SegmenterGranularity::Line}; auto result = Segmenter::TryCreate("en", options); ASSERT_TRUE(result.isOk()); @@ -30,7 +34,50 @@ TEST(IntlSegmenter, TestLineBreakIteratorUtf16) ASSERT_EQ(segIter->Seek(0u), Nothing()); } -TEST(IntlSegmenter, TestWordBreakIteratorUtf16) +TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + const SegmenterOptions options{SegmenterGranularity::Line}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto lineSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + lineSegmenter->Segment(MakeStringSpan(text)); + + // Seek to space between "hello" and "world". + // UAX#14 rule returns before "w". + ASSERT_EQ(segIter->Seek(5u), Some(6u)); + + ASSERT_EQ(segIter->Next(), Some(11u)); + + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple) +{ + const SegmenterOptions options{SegmenterGranularity::Word}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto wordSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + wordSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(5u)); + ASSERT_EQ(segIter->Next(), Some(6u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); +} + +TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek) { const SegmenterOptions options{SegmenterGranularity::Word}; auto result = Segmenter::TryCreate("en", options); @@ -51,7 +98,32 @@ TEST(IntlSegmenter, TestWordBreakIteratorUtf16) ASSERT_EQ(segIter->Seek(0u), Nothing()); } -TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16) +TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple) +{ + SegmenterOptions options{SegmenterGranularity::Grapheme}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto graphemeClusterSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + graphemeClusterSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(1u)); + ASSERT_EQ(segIter->Next(), Some(2u)); + ASSERT_EQ(segIter->Next(), Some(3u)); + ASSERT_EQ(segIter->Next(), Some(4u)); + ASSERT_EQ(segIter->Next(), Some(5u)); + ASSERT_EQ(segIter->Next(), Some(6u)); + ASSERT_EQ(segIter->Next(), Some(7u)); + ASSERT_EQ(segIter->Next(), Some(8u)); + ASSERT_EQ(segIter->Next(), Some(9u)); + ASSERT_EQ(segIter->Next(), Some(10u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); +} + +TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek) { SegmenterOptions options{SegmenterGranularity::Grapheme}; auto result = Segmenter::TryCreate("en", options); @@ -97,9 +169,41 @@ TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16) TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16) { + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + SegmenterOptions options{SegmenterGranularity::Sentence}; auto result = Segmenter::TryCreate("en", options); - ASSERT_TRUE(result.isErr()); + ASSERT_TRUE(result.isOk()); + auto sentenceSegmenter = result.unwrap(); + + const char16_t text[] = u"Hello world. Hello world."; + UniquePtr segIter = + sentenceSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(13u)); + ASSERT_EQ(segIter->Next(), Some(25u)); + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + SegmenterOptions options{SegmenterGranularity::Sentence}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto sentenceSegmenter = result.unwrap(); + + const char16_t text[] = u"Hello world. Hello world."; + UniquePtr segIter = + sentenceSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Seek(5u), Some(13u)); } } // namespace mozilla::intl diff --git a/intl/lwbrk/moz.build b/intl/lwbrk/moz.build index 0699ff63a51a..1f87ebf99e62 100644 --- a/intl/lwbrk/moz.build +++ b/intl/lwbrk/moz.build @@ -42,4 +42,16 @@ else: "rulebrk.c", ] +if CONFIG["JS_HAS_INTL_API"] and CONFIG["MOZ_ICU4X"]: + LOCAL_INCLUDES += [ + "/third_party/rust/icu_capi/cpp/include", + ] + # Disable warnings when including C++ headers of ICU4X. + # - https://github.com/rust-diplomat/diplomat/issues/277 + # - https://github.com/rust-diplomat/diplomat/issues/335 + CXXFLAGS += [ + "-Wno-mismatched-tags", + "-Wno-pessimizing-move", + ] + FINAL_LIBRARY = "xul" diff --git a/modules/libpref/init/StaticPrefList.yaml b/modules/libpref/init/StaticPrefList.yaml index 059a6cf3d4ff..f498909146c7 100644 --- a/modules/libpref/init/StaticPrefList.yaml +++ b/modules/libpref/init/StaticPrefList.yaml @@ -7292,6 +7292,12 @@ mirror: always #endif +# If true, we use UAX14/29 compatible segmenter rules using ICU4X +- name: intl.icu4x.segmenter.enabled + type: RelaxedAtomicBool + value: false + mirror: always + #--------------------------------------------------------------------------- # Prefs starting with "javascript." #