зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1722484 Part 1 - Introduce mozilla::intl::Segmenter and break iterators. r=m_kato,dminor
intl/component is already linked with JavaScript. However, the segmenter is still backed by lwbrk, which is tightly coupled with xpcom APIs, so we cannot add it under intl/component until we integrate the ICU4X segmenter. I added it under intl/lwbrk for now. The enums `WordBreakRule` and `LineBreakRule` are named after their counterparts in ICU4X. https://unicode-org.github.io/icu4x-docs/doc/icu_segmenter/index.html#enums Differential Revision: https://phabricator.services.mozilla.com/D129193
This commit is contained in:
Родитель
dfa4091f3b
Коммит
4a00f61720
|
@ -7,6 +7,8 @@
|
|||
|
||||
#include "nscore.h"
|
||||
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
|
||||
#define NS_LINEBREAKER_NEED_MORE_TEXT -1
|
||||
|
||||
namespace mozilla {
|
||||
|
@ -14,19 +16,8 @@ namespace intl {
|
|||
|
||||
class LineBreaker final {
|
||||
public:
|
||||
enum class WordBreak : uint8_t {
|
||||
Normal = 0, // default
|
||||
BreakAll = 1, // break all
|
||||
KeepAll = 2 // always keep
|
||||
};
|
||||
|
||||
enum class Strictness : uint8_t {
|
||||
Auto = 0,
|
||||
Loose = 1,
|
||||
Normal = 2,
|
||||
Strict = 3,
|
||||
Anywhere = 4
|
||||
};
|
||||
using WordBreak = WordBreakRule;
|
||||
using Strictness = LineBreakRule;
|
||||
|
||||
// LineBreaker is a utility class with only static methods. No need to
|
||||
// instantiate it.
|
||||
|
@ -38,6 +29,8 @@ class LineBreaker final {
|
|||
//
|
||||
// If aPos is already at the end of aText or beyond, i.e. aPos >= aLen, return
|
||||
// NS_LINEBREAKER_NEED_MORE_TEXT.
|
||||
//
|
||||
// DEPRECATED: Use LineBreakIteratorUtf16 instead.
|
||||
static int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos);
|
||||
|
||||
// Call this on a word with whitespace at either end. We will apply JISx4051
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
/* Classes to iterate over grapheme, word, sentence, or line. */
|
||||
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
|
||||
#include "mozilla/intl/LineBreaker.h"
|
||||
#include "mozilla/intl/WordBreaker.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
|
||||
: mText(aText) {}
|
||||
|
||||
Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
|
||||
if (mPos < aPos) {
|
||||
mPos = aPos;
|
||||
}
|
||||
return Next();
|
||||
}
|
||||
|
||||
LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
|
||||
const LineBreakOptions& aOptions)
|
||||
: SegmentIteratorUtf16(aText), mOptions(aOptions) {}
|
||||
|
||||
Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
|
||||
const int32_t nextPos =
|
||||
LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
||||
if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
|
||||
return Nothing();
|
||||
}
|
||||
mPos = nextPos;
|
||||
return Some(mPos);
|
||||
}
|
||||
|
||||
WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
|
||||
: SegmentIteratorUtf16(aText) {}
|
||||
|
||||
Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
|
||||
const int32_t nextPos =
|
||||
WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
|
||||
if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
|
||||
return Nothing();
|
||||
}
|
||||
mPos = nextPos;
|
||||
return Some(mPos);
|
||||
}
|
||||
|
||||
// Creates a Segmenter for the requested granularity. Grapheme and Sentence
// granularities are rejected with ICUError::InternalError because no iterator
// exists for them yet; every other granularity yields a new Segmenter.
Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
    Span<const char> aLocale, const SegmenterOptions& aOptions) {
  const SegmenterGranularity granularity = aOptions.mGranularity;
  const bool hasIterator = granularity != SegmenterGranularity::Grapheme &&
                           granularity != SegmenterGranularity::Sentence;
  if (!hasIterator) {
    // Grapheme and Sentence iterator are not yet implemented.
    return Err(ICUError::InternalError);
  }
  return MakeUnique<Segmenter>(aLocale, aOptions);
}
|
||||
|
||||
// Builds the iterator matching mOptions.mGranularity for aText: a word-break
// iterator for Word and a line-break iterator (with default options) for
// Line. Grapheme and Sentence have no iterator; reaching them trips a debug
// assertion and yields nullptr.
UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
    Span<const char16_t> aText) const {
  switch (mOptions.mGranularity) {
    case SegmenterGranularity::Word:
      return MakeUnique<WordBreakIteratorUtf16>(aText);
    case SegmenterGranularity::Line:
      return MakeUnique<LineBreakIteratorUtf16>(aText);
    case SegmenterGranularity::Grapheme:
    case SegmenterGranularity::Sentence:
      MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
      return nullptr;
  }

  // Only reachable when mGranularity holds a value outside the enum.
  MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
  return nullptr;
}
|
||||
|
||||
} // namespace mozilla::intl
|
|
@ -0,0 +1,152 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
/* Classes to iterate over grapheme, word, sentence, or line. */
|
||||
|
||||
#ifndef intl_components_Segmenter_h_
|
||||
#define intl_components_Segmenter_h_
|
||||
|
||||
#include "mozilla/intl/ICUError.h"
|
||||
#include "mozilla/Maybe.h"
|
||||
#include "mozilla/Result.h"
|
||||
#include "mozilla/Span.h"
|
||||
#include "mozilla/UniquePtr.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
// The unit of text a Segmenter iterates over.
enum class SegmenterGranularity : uint8_t {
  Grapheme = 0,
  Word = 1,
  Sentence = 2,
  Line = 3,
};
|
||||
|
||||
struct SegmenterOptions final {
|
||||
SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme;
|
||||
};
|
||||
|
||||
/**
|
||||
* Interface of segment iterators. Subclass this class to implement iterator for
|
||||
* UTF-16 text.
|
||||
*/
|
||||
class SegmentIteratorUtf16 {
|
||||
public:
|
||||
virtual ~SegmentIteratorUtf16() = default;
|
||||
|
||||
// Disable copy or move semantics. Move semantic could be enabled in the
|
||||
// future if needed.
|
||||
SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete;
|
||||
SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete;
|
||||
SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete;
|
||||
SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete;
|
||||
|
||||
/**
|
||||
* Advance the iterator to the next break position.
|
||||
*
|
||||
* @return the break position. If there's no further break position, return
|
||||
* Nothing().
|
||||
*/
|
||||
virtual Maybe<uint32_t> Next() = 0;
|
||||
|
||||
/**
|
||||
* Advance the iterator to the first break position following the specified
|
||||
* position aPos.
|
||||
*
|
||||
* Note: if this iterator's current position is already >= aPos, this method
|
||||
* behaves the same as Next().
|
||||
*/
|
||||
virtual Maybe<uint32_t> Seek(uint32_t aPos);
|
||||
|
||||
protected:
|
||||
explicit SegmentIteratorUtf16(Span<const char16_t> aText);
|
||||
|
||||
// The text to iterate over.
|
||||
Span<const char16_t> mText;
|
||||
|
||||
// The current break position within mText.
|
||||
uint32_t mPos = 0;
|
||||
};
|
||||
|
||||
// The enumerators mirror the values of the CSS `word-break` property and carry
// the same meaning. The property is specified in
// https://drafts.csswg.org/css-text-3/#word-break-property
enum class WordBreakRule : uint8_t {
  Normal = 0,
  BreakAll = 1,
  KeepAll = 2,
};
|
||||
|
||||
// The enumerators mirror the values of the CSS `line-break` property and carry
// the same meaning. The property is specified in
// https://drafts.csswg.org/css-text-3/#line-break-property.
enum class LineBreakRule : uint8_t {
  Auto = 0,
  Loose = 1,
  Normal = 2,
  Strict = 3,
  Anywhere = 4,
};
|
||||
|
||||
// Extra options for line break iterator.
|
||||
struct LineBreakOptions final {
|
||||
WordBreakRule mWordBreakRule = WordBreakRule::Normal;
|
||||
LineBreakRule mLineBreakRule = LineBreakRule::Auto;
|
||||
bool mScriptIsChineseOrJapanese = false;
|
||||
};
|
||||
|
||||
/**
|
||||
* Line break iterator for UTF-16 text.
|
||||
*/
|
||||
class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
public:
|
||||
explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
|
||||
const LineBreakOptions& aOptions = {});
|
||||
|
||||
Maybe<uint32_t> Next() override;
|
||||
|
||||
private:
|
||||
LineBreakOptions mOptions;
|
||||
};
|
||||
|
||||
/**
|
||||
* Word break iterator for UTF-16 text.
|
||||
*/
|
||||
class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
|
||||
public:
|
||||
explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
|
||||
|
||||
Maybe<uint32_t> Next() override;
|
||||
};
|
||||
|
||||
/**
|
||||
* This component is a Mozilla-focused API for working with segmenters in
|
||||
* internationalization code.
|
||||
*
|
||||
* This is a factor class. Calling Segment() to create an iterator over a text
|
||||
* of given granularity.
|
||||
*/
|
||||
class Segmenter final {
|
||||
public:
|
||||
// NOTE: aLocale is a no-op currently.
|
||||
static Result<UniquePtr<Segmenter>, ICUError> TryCreate(
|
||||
Span<const char> aLocale, const SegmenterOptions& aOptions);
|
||||
|
||||
explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions)
|
||||
: mOptions(aOptions) {}
|
||||
|
||||
// Creates an iterator over aText of a given granularity in mOptions.
|
||||
UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const;
|
||||
|
||||
// TODO: Implement an iterator for Latin1 text.
|
||||
// UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
|
||||
|
||||
private:
|
||||
SegmenterOptions mOptions;
|
||||
};
|
||||
|
||||
} // namespace mozilla::intl
|
||||
|
||||
#endif
|
|
@ -40,6 +40,8 @@ class WordBreaker final {
|
|||
//
|
||||
// If aPos is already at the end of aText or beyond, i.e. aPos >= aLen, return
|
||||
// NS_WORDBREAKER_NEED_MORE_TEXT.
|
||||
//
|
||||
// DEPRECATED: Use WordBreakIteratorUtf16 instead.
|
||||
static int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos);
|
||||
|
||||
private:
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "mozilla/intl/Segmenter.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
// Creates a Line-granularity segmenter and checks Seek()/Next() on
// "hello world".
TEST(IntlSegmenter, TestLineBreakIteratorUtf16)
{
  const SegmenterOptions lineOptions{SegmenterGranularity::Line};
  auto created = Segmenter::TryCreate("en", lineOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t sample[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(sample));

  // Seek to the space between "hello" and "world".
  ASSERT_EQ(iter->Seek(5u), Some(11u));

  // No break position remains after the previous one.
  ASSERT_EQ(iter->Next(), Nothing());

  // Seeking to a position before the current one is the same as calling
  // Next().
  ASSERT_EQ(iter->Seek(0u), Nothing());
}
|
||||
|
||||
// Creates a Word-granularity segmenter and checks Seek()/Next() on
// "hello world".
TEST(IntlSegmenter, TestWordBreakIteratorUtf16)
{
  const SegmenterOptions wordOptions{SegmenterGranularity::Word};
  auto created = Segmenter::TryCreate("en", wordOptions);
  ASSERT_TRUE(created.isOk());
  auto segmenter = created.unwrap();

  const char16_t sample[] = u"hello world";
  UniquePtr<SegmentIteratorUtf16> iter =
      segmenter->Segment(MakeStringSpan(sample));

  // Seek to the space between "hello" and "world".
  ASSERT_EQ(iter->Seek(5u), Some(6u));

  // One more break position follows, then the iterator is exhausted.
  ASSERT_EQ(iter->Next(), Some(11u));
  ASSERT_EQ(iter->Next(), Nothing());

  // Seeking to a position before the current one is the same as calling
  // Next().
  ASSERT_EQ(iter->Seek(0u), Nothing());
}
|
||||
|
||||
// Creating a segmenter with Grapheme granularity is expected to fail.
TEST(IntlSegmenter, TestGraphemeBreakIteratorUtf16)
{
  SegmenterOptions graphemeOptions{SegmenterGranularity::Grapheme};
  auto created = Segmenter::TryCreate("en", graphemeOptions);
  ASSERT_TRUE(created.isErr());
}
|
||||
|
||||
// Creating a segmenter with Sentence granularity is expected to fail.
TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16)
{
  SegmenterOptions sentenceOptions{SegmenterGranularity::Sentence};
  auto created = Segmenter::TryCreate("en", sentenceOptions);
  ASSERT_TRUE(created.isErr());
}
|
||||
|
||||
} // namespace mozilla::intl
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
UNIFIED_SOURCES += [
|
||||
"TestBreak.cpp",
|
||||
"TestSegmenter.cpp",
|
||||
]
|
||||
|
||||
FINAL_LIBRARY = "xul-gtest"
|
||||
|
|
|
@ -9,11 +9,13 @@ TEST_DIRS += ["gtest"]
|
|||
EXPORTS.mozilla.intl += [
|
||||
"LineBreaker.h",
|
||||
"nsComplexBreaker.h",
|
||||
"Segmenter.h",
|
||||
"WordBreaker.h",
|
||||
]
|
||||
|
||||
UNIFIED_SOURCES += [
|
||||
"LineBreaker.cpp",
|
||||
"Segmenter.cpp",
|
||||
"WordBreaker.cpp",
|
||||
]
|
||||
|
||||
|
@ -37,8 +39,6 @@ elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "cocoa":
|
|||
else:
|
||||
SOURCES += [
|
||||
"nsRuleBreaker.cpp",
|
||||
]
|
||||
SOURCES += [
|
||||
"rulebrk.c",
|
||||
]
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче