Bug 1719535 - Part 5. Add ICU4X based segmenter modules. r=TYLin,jfkthame

Depends on D167673

Differential Revision: https://phabricator.services.mozilla.com/D167675
This commit is contained in:
Makoto Kato 2023-08-02 10:32:50 +00:00
parent 696dad9f78
commit 791b803c53
7 changed files with 655 additions and 8 deletions

View File

@ -13,6 +13,20 @@
#include "mozilla/intl/Segmenter.h"
#include "mozilla/intl/UnicodeProperties.h"
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
# include "ICU4XDataProvider.h"
# include "ICU4XLineBreakIteratorLatin1.hpp"
# include "ICU4XLineBreakIteratorUtf16.hpp"
# include "ICU4XLineSegmenter.h"
# include "mozilla/CheckedInt.h"
# include "mozilla/ClearOnShutdown.h"
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
# include "mozilla/StaticPrefs_intl.h"
# include "nsThreadUtils.h"
# include <mutex>
#endif
using namespace mozilla::unicode;
using namespace mozilla::intl;
@ -978,9 +992,136 @@ static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) {
affectedByKeepAll(GetLineBreakClass(aCh));
}
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
// Translates a Gecko LineBreakRule into the corresponding ICU4X line-break
// strictness. LineBreakRule::Auto is mapped to the same value as Strict.
static capi::ICU4XLineBreakStrictness ConvertLineBreakRuleToICU4X(
    LineBreakRule aLevel) {
  switch (aLevel) {
    case LineBreakRule::Anywhere:
      return capi::ICU4XLineBreakStrictness_Anywhere;
    case LineBreakRule::Loose:
      return capi::ICU4XLineBreakStrictness_Loose;
    case LineBreakRule::Normal:
      return capi::ICU4XLineBreakStrictness_Normal;
    case LineBreakRule::Auto:
    case LineBreakRule::Strict:
      return capi::ICU4XLineBreakStrictness_Strict;
  }
  // Only reachable if aLevel holds a value outside the enum.
  MOZ_ASSERT_UNREACHABLE("should have been handled already");
  return capi::ICU4XLineBreakStrictness_Normal;
}
// Translates a Gecko WordBreakRule into the corresponding ICU4X word option
// used when constructing a line segmenter.
static capi::ICU4XLineBreakWordOption ConvertWordBreakRuleToICU4X(
    WordBreakRule aWordBreak) {
  switch (aWordBreak) {
    case WordBreakRule::KeepAll:
      return capi::ICU4XLineBreakWordOption_KeepAll;
    case WordBreakRule::BreakAll:
      return capi::ICU4XLineBreakWordOption_BreakAll;
    case WordBreakRule::Normal:
      return capi::ICU4XLineBreakWordOption_Normal;
  }
  // Only reachable if aWordBreak holds a value outside the enum.
  MOZ_ASSERT_UNREACHABLE("should have been handled already");
  return capi::ICU4XLineBreakWordOption_Normal;
}
static capi::ICU4XLineSegmenter* sLineSegmenter = nullptr;
static capi::ICU4XLineSegmenter* GetDefaultLineSegmenter() {
static std::once_flag sOnce;
std::call_once(sOnce, [] {
auto result = capi::ICU4XLineSegmenter_create_auto(GetDataProvider());
MOZ_ASSERT(result.is_ok);
sLineSegmenter = result.ok;
if (NS_IsMainThread()) {
mozilla::RunOnShutdown([] {
if (sLineSegmenter) {
capi::ICU4XLineSegmenter_destroy(sLineSegmenter);
}
sLineSegmenter = nullptr;
});
return;
}
NS_DispatchToMainThread(
NS_NewRunnableFunction("GetDefaultLineSegmenter", [] {
mozilla::RunOnShutdown([] {
if (sLineSegmenter) {
capi::ICU4XLineSegmenter_destroy(sLineSegmenter);
}
sLineSegmenter = nullptr;
});
}));
});
return sLineSegmenter;
}
// Reports whether the shared default line segmenter can serve this request:
// word-break must be Normal, line-break must be Strict or Auto, and the text
// must not be flagged as Chinese or Japanese.
static bool UseDefaultLineSegmenter(WordBreakRule aWordBreak,
                                    LineBreakRule aLevel,
                                    bool aIsChineseOrJapanese) {
  if (aIsChineseOrJapanese || aWordBreak != WordBreakRule::Normal) {
    return false;
  }
  return aLevel == LineBreakRule::Strict || aLevel == LineBreakRule::Auto;
}
// Returns the ICU4X line segmenter to use for the given break settings.
//
// When aUseDefault is true (which must agree with UseDefaultLineSegmenter()
// for the same arguments) the shared instance from GetDefaultLineSegmenter()
// is returned and the caller must not destroy it. Otherwise a new segmenter
// is created with options derived from the arguments, and the caller is
// responsible for releasing it with ICU4XLineSegmenter_destroy().
static capi::ICU4XLineSegmenter* GetLineSegmenter(bool aUseDefault,
                                                  WordBreakRule aWordBreak,
                                                  LineBreakRule aLevel,
                                                  bool aIsChineseOrJapanese) {
  if (aUseDefault) {
    MOZ_ASSERT(
        UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese));
    return GetDefaultLineSegmenter();
  }

  // Value-initialize so no field is left indeterminate before assignment.
  capi::ICU4XLineBreakOptionsV1 options{};
  options.word_option = ConvertWordBreakRuleToICU4X(aWordBreak);
  options.strictness = ConvertLineBreakRuleToICU4X(aLevel);
  options.ja_zh = aIsChineseOrJapanese;

  auto result = capi::ICU4XLineSegmenter_create_lstm_with_options_v1(
      GetDataProvider(), options);
  // Checked in release builds too (as Segmenter.cpp does): result.ok must not
  // be handed to callers when creation failed.
  MOZ_RELEASE_ASSERT(result.is_ok);
  return result.ok;
}
#endif
void LineBreaker::ComputeBreakPositions(
const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak,
LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
memset(aBreakBefore, 0, aLength);
CheckedInt<int32_t> length = aLength;
if (!length.isValid()) {
return;
}
const bool useDefault =
UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese);
capi::ICU4XLineSegmenter* lineSegmenter =
GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese);
ICU4XLineBreakIteratorUtf16 iterator(capi::ICU4XLineSegmenter_segment_utf16(
lineSegmenter, (const uint16_t*)aChars, aLength));
while (true) {
const int32_t nextPos = iterator.next();
if (nextPos < 0 || nextPos >= length.value()) {
break;
}
aBreakBefore[nextPos] = 1;
}
if (!useDefault) {
capi::ICU4XLineSegmenter_destroy(lineSegmenter);
}
return;
}
#endif
uint32_t cur;
int8_t lastClass = CLASS_NONE;
ContextState state(aChars, aLength);
@ -1110,6 +1251,38 @@ void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength,
LineBreakRule aLevel,
bool aIsChineseOrJapanese,
uint8_t* aBreakBefore) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
memset(aBreakBefore, 0, aLength);
CheckedInt<int32_t> length = aLength;
if (!length.isValid()) {
return;
}
const bool useDefault =
UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese);
capi::ICU4XLineSegmenter* lineSegmenter =
GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese);
ICU4XLineBreakIteratorLatin1 iterator(
capi::ICU4XLineSegmenter_segment_latin1(
lineSegmenter, (const uint8_t*)aChars, aLength));
while (true) {
const int32_t nextPos = iterator.next();
if (nextPos < 0 || nextPos >= length.value()) {
break;
}
aBreakBefore[nextPos] = 1;
}
if (!useDefault) {
capi::ICU4XLineSegmenter_destroy(lineSegmenter);
}
return;
}
#endif
uint32_t cur;
int8_t lastClass = CLASS_NONE;
ContextState state(aChars, aLength);

View File

@ -11,9 +11,19 @@
#include "mozilla/intl/LineBreaker.h"
#include "mozilla/intl/WordBreaker.h"
#include "mozilla/intl/UnicodeProperties.h"
#include "mozilla/StaticPrefs_intl.h"
#include "nsUnicodeProperties.h"
#include "nsCharTraits.h"
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
# include "ICU4XDataProvider.h"
# include "ICU4XGraphemeClusterSegmenter.h"
# include "ICU4XLineSegmenter.h"
# include "ICU4XSentenceSegmenter.h"
# include "ICU4XWordSegmenter.h"
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
#endif
using namespace mozilla::unicode;
namespace mozilla::intl {
@ -30,9 +40,45 @@ Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
// Builds a line-break iterator over aText and stores aOptions.
//
// When ICU4X support is compiled in and the intl.icu4x.segmenter.enabled pref
// is set, an ICU4X line segmenter and a UTF-16 break iterator over mText are
// created here. Otherwise the constructor returns without creating them.
LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
const LineBreakOptions& aOptions)
: SegmentIteratorUtf16(aText), mOptions(aOptions) {}
: SegmentIteratorUtf16(aText), mOptions(aOptions) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
// Pref off: leave the ICU4X members untouched.
if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
return;
}
// NOTE(review): the segmenter is created with create_auto and mOptions is not
// consulted here — confirm this is intended.
auto result =
capi::ICU4XLineSegmenter_create_auto(mozilla::intl::GetDataProvider());
// Failure to create the segmenter is fatal, in release builds as well.
MOZ_RELEASE_ASSERT(result.is_ok);
mSegmenter = result.ok;
// The iterator is created over mText's buffer, viewed as uint16_t units.
mIterator = capi::ICU4XLineSegmenter_segment_utf16(
mSegmenter, (const uint16_t*)mText.Elements(), mText.Length());
#endif
}
// Releases the ICU4X break iterator first, then the line segmenter. Both are
// skipped when they were never created.
LineBreakIteratorUtf16::~LineBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator != nullptr) {
    capi::ICU4XLineBreakIteratorUtf16_destroy(mIterator);
  }

  if (mSegmenter != nullptr) {
    capi::ICU4XLineSegmenter_destroy(mSegmenter);
  }
#endif
}
Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
if (mIterator) {
const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator);
if (nextPos < 0) {
return Nothing();
}
if (!nextPos) {
return Next();
}
mPos = nextPos;
return Some(mPos);
}
#endif
const int32_t nextPos =
LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
@ -42,10 +88,71 @@ Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
return Some(mPos);
}
// Advances to the first line break strictly after aPos and returns it, or
// Nothing() when the ICU4X iterator is exhausted. Without an ICU4X iterator
// the base-class Seek() is used.
Maybe<uint32_t> LineBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    // Already at or past the requested offset: behave like Next().
    if (aPos <= mPos) {
      return Next();
    }

    // Here mPos < aPos, so at least one step is always taken.
    do {
      const int32_t boundary =
          capi::ICU4XLineBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A boundary exactly at aPos is not "after" it; step once more.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
// Builds a word-break iterator over aText.
//
// When ICU4X support is compiled in and the intl.icu4x.segmenter.enabled pref
// is set, an ICU4X word segmenter and a UTF-16 break iterator over mText are
// created here. Otherwise the constructor returns without creating them.
WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
: SegmentIteratorUtf16(aText) {}
: SegmentIteratorUtf16(aText) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
// Pref off: leave the ICU4X members untouched.
if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
return;
}
auto result =
capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider());
// Failure to create the segmenter is fatal, in release builds as well.
MOZ_RELEASE_ASSERT(result.is_ok);
mSegmenter = result.ok;
// The iterator is created over mText's buffer, viewed as uint16_t units.
mIterator = capi::ICU4XWordSegmenter_segment_utf16(
mSegmenter, (const uint16_t*)mText.Elements(), mText.Length());
#endif
}
// Releases the ICU4X break iterator first, then the word segmenter. Both are
// skipped when they were never created.
WordBreakIteratorUtf16::~WordBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator != nullptr) {
    capi::ICU4XWordBreakIteratorUtf16_destroy(mIterator);
  }

  if (mSegmenter != nullptr) {
    capi::ICU4XWordSegmenter_destroy(mSegmenter);
  }
#endif
}
Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
if (mIterator) {
const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator);
if (nextPos < 0) {
return Nothing();
}
if (!nextPos) {
return Next();
}
mPos = nextPos;
return Some(mPos);
}
#endif
const int32_t nextPos =
WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
@ -55,9 +162,57 @@ Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
return Some(mPos);
}
// Advances to the first word boundary strictly after aPos and returns it, or
// Nothing() when the ICU4X iterator is exhausted. Without an ICU4X iterator
// the base-class Seek() is used.
Maybe<uint32_t> WordBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    // Already at or past the requested offset: behave like Next().
    if (aPos <= mPos) {
      return Next();
    }

    // Here mPos < aPos, so at least one step is always taken.
    do {
      const int32_t boundary =
          capi::ICU4XWordBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A boundary exactly at aPos is not "after" it; step once more.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
// Builds a grapheme-cluster break iterator over aText.
//
// When ICU4X support is compiled in and the intl.icu4x.segmenter.enabled pref
// is set, an ICU4X grapheme cluster segmenter and a UTF-16 break iterator over
// mText are created here. Otherwise the constructor returns without creating
// them.
GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
Span<const char16_t> aText)
: SegmentIteratorUtf16(aText) {}
: SegmentIteratorUtf16(aText) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
// Pref off: leave the ICU4X members untouched.
if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
return;
}
auto result = capi::ICU4XGraphemeClusterSegmenter_create(
mozilla::intl::GetDataProvider());
// Failure to create the segmenter is fatal, in release builds as well.
MOZ_RELEASE_ASSERT(result.is_ok);
mSegmenter = result.ok;
// The iterator is created over mText's buffer, viewed as uint16_t units.
mIterator = capi::ICU4XGraphemeClusterSegmenter_segment_utf16(
mSegmenter, (const uint16_t*)mText.Elements(), mText.Length());
#endif
}
// Releases the ICU4X break iterator first, then the grapheme cluster
// segmenter. Both are skipped when they were never created.
GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator != nullptr) {
    capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy(mIterator);
  }

  if (mSegmenter != nullptr) {
    capi::ICU4XGraphemeClusterSegmenter_destroy(mSegmenter);
  }
#endif
}
enum HSType {
HST_NONE = U_HST_NOT_APPLICABLE,
@ -75,6 +230,20 @@ static HSType GetHangulSyllableType(uint32_t aCh) {
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
const auto len = mText.Length();
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
if (mIterator) {
const int32_t nextPos =
capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator);
if (nextPos < 0) {
return Nothing();
}
if (!nextPos) {
return Next();
}
mPos = nextPos;
return Some(mPos);
}
#endif
if (mPos >= len) {
// The iterator has already reached the end.
return Nothing();
@ -195,6 +364,32 @@ Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
return Some(mPos);
}
// Advances to the first grapheme cluster boundary strictly after aPos and
// returns it, or Nothing() when the ICU4X iterator is exhausted. Without an
// ICU4X iterator the base-class Seek() is used.
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  if (mIterator) {
    // Already at or past the requested offset: behave like Next().
    if (aPos <= mPos) {
      return Next();
    }

    // Here mPos < aPos, so at least one step is always taken.
    do {
      const int32_t boundary =
          capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator);
      if (boundary < 0) {
        return Nothing();
      }
      mPos = static_cast<uint32_t>(boundary);
    } while (mPos < aPos);

    // A boundary exactly at aPos is not "after" it; step once more.
    if (mPos == aPos) {
      return Next();
    }
    return Some(mPos);
  }
#endif
  return SegmentIteratorUtf16::Seek(aPos);
}
GraphemeClusterBreakReverseIteratorUtf16::
GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
: SegmentIteratorUtf16(aText) {
@ -231,12 +426,77 @@ Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
return Next();
}
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
// Builds a sentence-break iterator over aText by creating an ICU4X sentence
// segmenter and a UTF-16 break iterator over mText. Creation failure is fatal.
SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16(
    Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
  const auto created =
      capi::ICU4XSentenceSegmenter_create(mozilla::intl::GetDataProvider());
  MOZ_RELEASE_ASSERT(created.is_ok);
  mSegmenter = created.ok;

  // ICU4X takes the text as uint16_t code units.
  const auto* codeUnits = reinterpret_cast<const uint16_t*>(mText.Elements());
  mIterator = capi::ICU4XSentenceSegmenter_segment_utf16(mSegmenter, codeUnits,
                                                         mText.Length());
}
// Releases the ICU4X break iterator first, then the sentence segmenter.
SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() {
  if (mIterator != nullptr) {
    capi::ICU4XSentenceBreakIteratorUtf16_destroy(mIterator);
  }

  if (mSegmenter != nullptr) {
    capi::ICU4XSentenceSegmenter_destroy(mSegmenter);
  }
}
// Advances to the first sentence boundary strictly after aPos and returns it,
// or Nothing() when there is no iterator or it is exhausted.
Maybe<uint32_t> SentenceBreakIteratorUtf16::Seek(uint32_t aPos) {
  if (!mIterator) {
    return Nothing();
  }

  // Already at or past the requested offset: behave like Next().
  if (aPos <= mPos) {
    return Next();
  }

  // Here mPos < aPos, so at least one step is always taken.
  do {
    const int32_t boundary =
        capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator);
    if (boundary < 0) {
      return Nothing();
    }
    mPos = static_cast<uint32_t>(boundary);
  } while (mPos < aPos);

  // A boundary exactly at aPos is not "after" it; step once more.
  if (mPos == aPos) {
    return Next();
  }
  return Some(mPos);
}
// Returns the next sentence boundary and records it in mPos, or Nothing()
// when there is no iterator or it is exhausted. A boundary reported at
// offset 0 is skipped.
Maybe<uint32_t> SentenceBreakIteratorUtf16::Next() {
  if (!mIterator) {
    return Nothing();
  }

  int32_t boundary = 0;
  do {
    boundary = capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator);
    if (boundary < 0) {
      return Nothing();
    }
  } while (boundary == 0);

  mPos = static_cast<uint32_t>(boundary);
  return Some(mPos);
}
#endif
// Creates a Segmenter for aLocale with the given options.
//
// Returns Err(ICUError::InternalError) when sentence granularity is requested
// but cannot be served: sentence segmentation is only implemented on the
// ICU4X path, so it needs both ICU4X support compiled in and the
// intl.icu4x.segmenter.enabled pref turned on. Rejecting the request here
// keeps Segment() from reaching its unimplemented branch and returning
// nullptr.
Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
    Span<const char> aLocale, const SegmenterOptions& aOptions) {
  if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
    if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
      // ICU4X is available but disabled; there is no fallback sentence
      // iterator.
      return Err(ICUError::InternalError);
    }
#else
    // Sentence iterator is not implemented without ICU4X.
    return Err(ICUError::InternalError);
#endif
  }
  return MakeUnique<Segmenter>(aLocale, aOptions);
}
@ -246,6 +506,11 @@ UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
case SegmenterGranularity::Grapheme:
return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
case SegmenterGranularity::Sentence:
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
return MakeUnique<SentenceBreakIteratorUtf16>(aText);
}
#endif
MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
return nullptr;
case SegmenterGranularity::Word:

View File

@ -15,6 +15,19 @@
#include "mozilla/Span.h"
#include "mozilla/UniquePtr.h"
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
// Forward declarations of the ICU4X C API segmenter and break-iterator types.
// The iterator classes in this header only store pointers to them.
namespace capi {
struct ICU4XLineSegmenter;
struct ICU4XLineBreakIteratorUtf16;
struct ICU4XWordSegmenter;
struct ICU4XWordBreakIteratorUtf16;
struct ICU4XGraphemeClusterSegmenter;
struct ICU4XGraphemeClusterBreakIteratorUtf16;
struct ICU4XSentenceSegmenter;
struct ICU4XSentenceBreakIteratorUtf16;
} // namespace capi
#endif
namespace mozilla::intl {
enum class SegmenterGranularity : uint8_t {
@ -104,11 +117,18 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
  // Creates a line-break iterator over aText using aOptions.
  explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
                                  const LineBreakOptions& aOptions = {});
  ~LineBreakIteratorUtf16() override;

  // Returns the next line-break offset, or Nothing() at the end.
  Maybe<uint32_t> Next() override;
  // Moves past aPos and returns the following line-break offset, if any.
  Maybe<uint32_t> Seek(uint32_t aPos) override;

private:
  LineBreakOptions mOptions;

  // This guard must match the one around the capi forward declarations and
  // the one used in the implementation (MOZ_ICU4X && JS_HAS_INTL_API).
  // Guarding on MOZ_ICU4X alone would reference undeclared capi types when
  // JS_HAS_INTL_API is not defined.
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
  // ICU4X line segmenter and the break iterator it produced; both stay null
  // unless the constructor created them.
  capi::ICU4XLineSegmenter* mSegmenter = nullptr;
  capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr;
#endif
};
/**
@ -117,8 +137,16 @@ class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
// Creates a word-break iterator over aText.
explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
~WordBreakIteratorUtf16() override;
// Returns the next word-boundary offset, or Nothing() at the end.
Maybe<uint32_t> Next() override;
// Moves past aPos and returns the following word-boundary offset, if any.
Maybe<uint32_t> Seek(uint32_t aPos) override;
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
private:
// ICU4X word segmenter and the break iterator it produced; both stay null
// unless the constructor created them.
capi::ICU4XWordSegmenter* mSegmenter = nullptr;
capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr;
#endif
};
/**
@ -127,8 +155,16 @@ class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
// Creates a grapheme-cluster break iterator over aText.
explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
~GraphemeClusterBreakIteratorUtf16() override;
// Returns the next grapheme-cluster boundary offset, or Nothing() at the end.
Maybe<uint32_t> Next() override;
// Moves past aPos and returns the following cluster boundary, if any.
Maybe<uint32_t> Seek(uint32_t aPos) override;
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
private:
// ICU4X grapheme cluster segmenter and the break iterator it produced; both
// stay null unless the constructor created them.
capi::ICU4XGraphemeClusterSegmenter* mSegmenter = nullptr;
capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr;
#endif
};
/**
@ -146,6 +182,24 @@ class GraphemeClusterBreakReverseIteratorUtf16 final
Maybe<uint32_t> Seek(uint32_t aPos) override;
};
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
/**
* Sentence break iterator for UTF-16 text.
*
* Only declared when ICU4X support is compiled in; the implementation is
* backed by the ICU4X sentence segmenter.
*/
class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
public:
// Creates a sentence-break iterator over aText.
explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
~SentenceBreakIteratorUtf16() override;
// Returns the next sentence-boundary offset, or Nothing() at the end.
Maybe<uint32_t> Next() override;
// Moves past aPos and returns the following sentence boundary, if any.
Maybe<uint32_t> Seek(uint32_t aPos) override;
private:
// ICU4X sentence segmenter and the break iterator it produced; released in
// the destructor.
capi::ICU4XSentenceSegmenter* mSegmenter = nullptr;
capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr;
};
#endif
/**
* This component is a Mozilla-focused API for working with segmenters in
* internationalization code.

View File

@ -10,6 +10,14 @@
#include "nsTArray.h"
#include "nsUnicodeProperties.h"
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
# include "ICU4XDataProvider.h"
# include "ICU4XWordBreakIteratorUtf16.hpp"
# include "ICU4XWordSegmenter.hpp"
# include "mozilla/intl/ICU4XGeckoDataProvider.h"
# include "mozilla/StaticPrefs_intl.h"
#endif
using mozilla::intl::Script;
using mozilla::intl::UnicodeProperties;
using mozilla::intl::WordBreaker;
@ -102,9 +110,34 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aLen,
return {aLen, aLen};
}
WordBreakClass c = GetClass(aText[aPos]);
WordRange range{0, aLen};
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
if (StaticPrefs::intl_icu4x_segmenter_enabled()) {
auto result =
capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider());
MOZ_ASSERT(result.is_ok);
ICU4XWordSegmenter segmenter(result.ok);
ICU4XWordBreakIteratorUtf16 iterator =
segmenter.segment_utf16(diplomat::span((const uint16_t*)aText, aLen));
uint32_t previousPos = 0;
while (true) {
const int32_t nextPos = iterator.next();
if (nextPos < 0) {
return {previousPos, aLen};
}
if ((uint32_t)nextPos > aPos) {
return {previousPos, (uint32_t)nextPos};
}
previousPos = nextPos;
}
}
#endif
WordBreakClass c = GetClass(aText[aPos]);
// Scan forward
for (uint32_t i = aPos + 1; i <= aLen; i++) {
if (c != GetClass(aText[i])) {

View File

@ -7,11 +7,15 @@
#include "gtest/gtest.h"
#include "mozilla/intl/Segmenter.h"
#include "mozilla/Preferences.h"
namespace mozilla::intl {
TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld)
{
nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false);
EXPECT_TRUE(rv == NS_OK);
const SegmenterOptions options{SegmenterGranularity::Line};
auto result = Segmenter::TryCreate("en", options);
ASSERT_TRUE(result.isOk());
@ -30,7 +34,50 @@ TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
ASSERT_EQ(segIter->Seek(0u), Nothing());
}
TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
// Seek()/Next() on the line iterator with the ICU4X segmenter pref enabled.
TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek)
{
  const nsresult prefResult =
      Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
  EXPECT_TRUE(prefResult == NS_OK);

  const SegmenterOptions lineOptions{SegmenterGranularity::Line};
  auto created = Segmenter::TryCreate("en", lineOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  // Seeking to the space between "hello" and "world" lands on the break
  // opportunity before "w", per the UAX#14 rules.
  ASSERT_EQ(iter->Seek(5u), Some(6u));

  ASSERT_EQ(iter->Next(), Some(11u));
  ASSERT_EQ(iter->Next(), Nothing());

  // Once exhausted, Seek(0) behaves like another Next().
  ASSERT_EQ(iter->Seek(0u), Nothing());
}
// Plain forward iteration over the word boundaries of "hello world".
TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple)
{
  const SegmenterOptions wordOptions{SegmenterGranularity::Word};
  auto created = Segmenter::TryCreate("en", wordOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  // Boundaries: after "hello", after the space, and at the end of the text.
  ASSERT_EQ(iter->Next(), Some(5u));
  ASSERT_EQ(iter->Next(), Some(6u));
  ASSERT_EQ(iter->Next(), Some(11u));
  ASSERT_EQ(iter->Next(), Nothing());
}
TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek)
{
const SegmenterOptions options{SegmenterGranularity::Word};
auto result = Segmenter::TryCreate("en", options);
@ -51,7 +98,32 @@ TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
ASSERT_EQ(segIter->Seek(0u), Nothing());
}
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16)
// Every code unit of the ASCII string "hello world" is its own grapheme
// cluster, so boundaries are expected at offsets 1 through 11.
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple)
{
  SegmenterOptions graphemeOptions{SegmenterGranularity::Grapheme};
  auto created = Segmenter::TryCreate("en", graphemeOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  for (uint32_t expected = 1u; expected <= 11u; ++expected) {
    ASSERT_EQ(iter->Next(), Some(expected));
  }
  ASSERT_EQ(iter->Next(), Nothing());
}
TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek)
{
SegmenterOptions options{SegmenterGranularity::Grapheme};
auto result = Segmenter::TryCreate("en", options);
@ -97,9 +169,41 @@ TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16)
// Forward iteration over the sentence boundaries of a two-sentence string.
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16)
{
// Sentence segmentation is only served through the ICU4X path, so enable it.
nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
EXPECT_TRUE(rv == NS_OK);
SegmenterOptions options{SegmenterGranularity::Sentence};
auto result = Segmenter::TryCreate("en", options);
// NOTE(review): the isErr() expectation below contradicts the isOk() one that
// follows; it looks like the pre-change line left over from the diff — confirm.
ASSERT_TRUE(result.isErr());
ASSERT_TRUE(result.isOk());
auto sentenceSegmenter = result.unwrap();
const char16_t text[] = u"Hello world. Hello world.";
UniquePtr<SegmentIteratorUtf16> segIter =
sentenceSegmenter->Segment(MakeStringSpan(text));
// First boundary follows "Hello world. " (offset 13); the second is the end
// of the text (offset 25).
ASSERT_EQ(segIter->Next(), Some(13u));
ASSERT_EQ(segIter->Next(), Some(25u));
ASSERT_EQ(segIter->Next(), Nothing());
// Same as calling Next().
ASSERT_EQ(segIter->Seek(0u), Nothing());
}
// Seek() on the sentence iterator with the ICU4X segmenter pref enabled.
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek)
{
  const nsresult prefResult =
      Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
  EXPECT_TRUE(prefResult == NS_OK);

  SegmenterOptions sentenceOptions{SegmenterGranularity::Sentence};
  auto created = Segmenter::TryCreate("en", sentenceOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t kText[] = u"Hello world. Hello world.";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(kText));

  // Seeking into the first sentence yields the boundary that ends it.
  ASSERT_EQ(iter->Seek(5u), Some(13u));
}
} // namespace mozilla::intl

View File

@ -42,4 +42,16 @@ else:
"rulebrk.c",
]
if CONFIG["JS_HAS_INTL_API"] and CONFIG["MOZ_ICU4X"]:
LOCAL_INCLUDES += [
"/third_party/rust/icu_capi/cpp/include",
]
# Disable warnings when including C++ headers of ICU4X.
# - https://github.com/rust-diplomat/diplomat/issues/277
# - https://github.com/rust-diplomat/diplomat/issues/335
CXXFLAGS += [
"-Wno-mismatched-tags",
"-Wno-pessimizing-move",
]
FINAL_LIBRARY = "xul"

View File

@ -7292,6 +7292,12 @@
mirror: always
#endif
# If true, use the ICU4X-based segmenters, which follow the UAX #14 and UAX #29 rules.
- name: intl.icu4x.segmenter.enabled
type: RelaxedAtomicBool
value: false
mirror: always
#---------------------------------------------------------------------------
# Prefs starting with "javascript."
#