mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-24 05:11:16 +00:00
653f4b3694
This is the main patch for the bug. It changes the grapheme cluster break iterator's `Next()` API by implementing the SegmentIteratorUtf16 interface, and adapts the callers. It shouldn't change the behavior. While rewriting the callers, one caveat worth mentioning is the loop termination condition. If the old code relies on `!AtEnd()` as the loop termination condition and advances the iterator at the end of the loop, it means to *skip* its logic when the break position is at the end of the string. For example, see `mozTXTToHTMLConv::NumberOfMatches`. This patch also hooks the grapheme cluster break iterator into the Segmenter::TryCreate() interface. Existing test coverage for the files changed: - netwerk/test/unit/test_mozTXTToHTMLConv.js - layout/reftests/forms/input/file/dynamic-max-width.html Differential Revision: https://phabricator.services.mozilla.com/D135643
178 lines
4.9 KiB
C++
178 lines
4.9 KiB
C++
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
|
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
/* Classes to iterate over grapheme, word, sentence, or line. */
|
|
|
|
#ifndef intl_components_Segmenter_h_
|
|
#define intl_components_Segmenter_h_
|
|
|
|
#include "mozilla/intl/ICUError.h"
|
|
#include "mozilla/Maybe.h"
|
|
#include "mozilla/Result.h"
|
|
#include "mozilla/Span.h"
|
|
#include "mozilla/UniquePtr.h"
|
|
|
|
namespace mozilla::intl {
|
|
|
|
// Kind of text unit a Segmenter iterates over: grapheme, word, sentence, or
// line. The underlying type is uint8_t and the enumerators take the implicit
// values 0..3 in declaration order.
enum class SegmenterGranularity : uint8_t {
  Grapheme,
  Word,
  Sentence,
  Line,
};
|
|
|
|
struct SegmenterOptions final {
|
|
SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme;
|
|
};
|
|
|
|
/**
|
|
* Interface of segment iterators. Subclass this class to implement iterator for
|
|
* UTF-16 text.
|
|
*/
|
|
class SegmentIteratorUtf16 {
|
|
public:
|
|
virtual ~SegmentIteratorUtf16() = default;
|
|
|
|
// Disable copy or move semantics. Move semantic could be enabled in the
|
|
// future if needed.
|
|
SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete;
|
|
SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete;
|
|
SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete;
|
|
SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete;
|
|
|
|
/**
|
|
* Advance the iterator to the next break position.
|
|
*
|
|
* @return the break position. If there's no further break position, return
|
|
* Nothing().
|
|
*/
|
|
virtual Maybe<uint32_t> Next() = 0;
|
|
|
|
/**
|
|
* Advance the iterator to the first break position following the specified
|
|
* position aPos.
|
|
*
|
|
* Note: if this iterator's current position is already >= aPos, this method
|
|
* behaves the same as Next().
|
|
*/
|
|
virtual Maybe<uint32_t> Seek(uint32_t aPos);
|
|
|
|
protected:
|
|
explicit SegmentIteratorUtf16(Span<const char16_t> aText);
|
|
|
|
// The text to iterate over.
|
|
Span<const char16_t> mText;
|
|
|
|
// The current break position within mText.
|
|
uint32_t mPos = 0;
|
|
};
|
|
|
|
// Each enum value has the same meaning with respect to the `word-break`
// property values in the CSS Text spec. See the details in
// https://drafts.csswg.org/css-text-3/#word-break-property
//
// The underlying type is uint8_t; values start at 0 and increase by one in
// declaration order.
enum class WordBreakRule : uint8_t {
  Normal = 0,
  BreakAll,
  KeepAll,
};
|
|
|
|
// Each enum value has the same meaning with respect to the `line-break`
// property values in the CSS Text spec. See the details in
// https://drafts.csswg.org/css-text-3/#line-break-property.
//
// The underlying type is uint8_t; values start at 0 and increase by one in
// declaration order.
enum class LineBreakRule : uint8_t {
  Auto = 0,
  Loose,
  Normal,
  Strict,
  Anywhere,
};
|
|
|
|
// Extra options for line break iterator.
|
|
struct LineBreakOptions final {
|
|
WordBreakRule mWordBreakRule = WordBreakRule::Normal;
|
|
LineBreakRule mLineBreakRule = LineBreakRule::Auto;
|
|
bool mScriptIsChineseOrJapanese = false;
|
|
};
|
|
|
|
/**
|
|
* Line break iterator for UTF-16 text.
|
|
*/
|
|
class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
|
public:
|
|
explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
|
|
const LineBreakOptions& aOptions = {});
|
|
|
|
Maybe<uint32_t> Next() override;
|
|
|
|
private:
|
|
LineBreakOptions mOptions;
|
|
};
|
|
|
|
/**
|
|
* Word break iterator for UTF-16 text.
|
|
*/
|
|
class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
|
public:
|
|
explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
|
|
|
|
Maybe<uint32_t> Next() override;
|
|
};
|
|
|
|
/**
|
|
* Grapheme cluster break iterator for UTF-16 text.
|
|
*/
|
|
class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
|
public:
|
|
explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
|
|
|
|
Maybe<uint32_t> Next() override;
|
|
};
|
|
|
|
/**
|
|
* Grapheme cluster break reverse iterator for UTF-16 text.
|
|
*
|
|
* Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it
|
|
* at your own risk.
|
|
*/
|
|
class GraphemeClusterBreakReverseIteratorUtf16 final
|
|
: public SegmentIteratorUtf16 {
|
|
public:
|
|
explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText);
|
|
|
|
Maybe<uint32_t> Next() override;
|
|
Maybe<uint32_t> Seek(uint32_t aPos) override;
|
|
};
|
|
|
|
/**
|
|
* This component is a Mozilla-focused API for working with segmenters in
|
|
* internationalization code.
|
|
*
|
|
* This is a factor class. Calling Segment() to create an iterator over a text
|
|
* of given granularity.
|
|
*/
|
|
class Segmenter final {
|
|
public:
|
|
// NOTE: aLocale is a no-op currently.
|
|
static Result<UniquePtr<Segmenter>, ICUError> TryCreate(
|
|
Span<const char> aLocale, const SegmenterOptions& aOptions);
|
|
|
|
explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions)
|
|
: mOptions(aOptions) {}
|
|
|
|
// Creates an iterator over aText of a given granularity in mOptions.
|
|
UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const;
|
|
|
|
// TODO: Implement an iterator for Latin1 text.
|
|
// UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
|
|
|
|
private:
|
|
SegmenterOptions mOptions;
|
|
};
|
|
|
|
} // namespace mozilla::intl
|
|
|
|
#endif
|