зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1719550 - Build an initial unified mozilla::intl::Collator; r=platform-i18n-reviewers,nordzilla
This collator attempts to match the options bag from the Intl.Collator API from ECMA-402. It is built to be compatible and consistent across both Gecko code and SpiderMonkey code. Its behavior is designed to match ECMA-402. Differential Revision: https://phabricator.services.mozilla.com/D120494
This commit is contained in:
Родитель
f41c9baf2b
Коммит
c2cf97af11
|
@ -0,0 +1,196 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include <string.h>
|
||||
#include "mozilla/intl/Collator.h"
|
||||
#include "mozilla/Span.h"
|
||||
#include "TestBuffer.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
TEST(IntlCollator, SetAttributesInternal)
{
  // Smoke test: feed every enumerator through each private attribute setter
  // to make sure no MOZ_ASSERT fires and no setter reports an error for a
  // misconfigured attribute.
  auto created = Collator::TryCreate("en-US");
  ASSERT_TRUE(created.isOk());
  auto collator = created.unwrap();

  const Collator::Strength strengths[] = {
      Collator::Strength::Primary,    Collator::Strength::Secondary,
      Collator::Strength::Tertiary,   Collator::Strength::Quaternary,
      Collator::Strength::Identical,  Collator::Strength::Default,
  };
  for (const auto strength : strengths) {
    collator->SetStrength(strength);
  }

  const Collator::AlternateHandling handlings[] = {
      Collator::AlternateHandling::NonIgnorable,
      Collator::AlternateHandling::Shifted,
      Collator::AlternateHandling::Default,
  };
  for (const auto handling : handlings) {
    collator->SetAlternateHandling(handling).unwrap();
  }

  const Collator::CaseFirst caseFirsts[] = {
      Collator::CaseFirst::False,
      Collator::CaseFirst::Upper,
      Collator::CaseFirst::Lower,
  };
  for (const auto caseFirst : caseFirsts) {
    collator->SetCaseFirst(caseFirst).unwrap();
  }

  // The three feature-style attributes all accept the same On/Off/Default
  // values; each attribute is cycled through all of them before moving on.
  const Collator::Feature features[] = {
      Collator::Feature::On,
      Collator::Feature::Off,
      Collator::Feature::Default,
  };
  for (const auto feature : features) {
    collator->SetCaseLevel(feature).unwrap();
  }
  for (const auto feature : features) {
    collator->SetNumericCollation(feature).unwrap();
  }
  for (const auto feature : features) {
    collator->SetNormalizationMode(feature).unwrap();
  }
}
|
||||
|
||||
TEST(IntlCollator, GetSortKey)
{
  // A light wiring check of sort key generation; this does not attempt
  // extensive correctness testing of the collation itself.
  auto created = Collator::TryCreate("en-US");
  ASSERT_TRUE(created.isOk());
  auto collator = created.unwrap();
  TestBuffer<uint8_t> keyA;
  TestBuffer<uint8_t> keyB;

  // Generates a sort key for each input (reusing the two buffers above) and
  // returns the strcmp() ordering of the resulting key bytes.
  auto orderBySortKey = [&](const char16_t* lhs, const char16_t* rhs) {
    collator->GetSortKey(MakeStringSpan(lhs), keyA).unwrap();
    collator->GetSortKey(MakeStringSpan(rhs), keyB).unwrap();
    const char* bytesA = reinterpret_cast<const char*>(keyA.data());
    const char* bytesB = reinterpret_cast<const char*>(keyB.data());
    return strcmp(bytesA, bytesB);
  };

  ASSERT_TRUE(orderBySortKey(u"aaa", u"bbb") < 0);
  ASSERT_TRUE(orderBySortKey(u"bbb", u"aaa") > 0);
  ASSERT_TRUE(orderBySortKey(u"aaa", u"aaa") == 0);
  ASSERT_TRUE(orderBySortKey(u"👍", u"👎") < 0);
}
|
||||
|
||||
TEST(IntlCollator, CompareStrings)
{
  // Do some light string comparisons to ensure everything is wired up
  // correctly. This is not doing extensive correctness testing.
  auto result = Collator::TryCreate("en-US");
  ASSERT_TRUE(result.isOk());
  auto collator = result.unwrap();

  // CompareStrings reports the ordering as -1, 0 or 1.
  ASSERT_EQ(collator->CompareStrings(u"aaa", u"bbb"), -1);
  ASSERT_EQ(collator->CompareStrings(u"bbb", u"aaa"), 1);
  ASSERT_EQ(collator->CompareStrings(u"aaa", u"aaa"), 0);
  ASSERT_EQ(collator->CompareStrings(u"👍", u"👎"), -1);
}
|
||||
|
||||
TEST(IntlCollator, SetOptionsSensitivity)
{
  // Test the ECMA 402 sensitivity behavior per:
  // https://tc39.es/ecma402/#sec-collator-comparestrings
  auto result = Collator::TryCreate("en-US");
  ASSERT_TRUE(result.isOk());
  auto collator = result.unwrap();

  ICUResult optResult = Ok();
  Collator::Options options{};

  // Base: only base letters differ. a ≠ b, a = á, a = A.
  options.sensitivity = Collator::Sensitivity::Base;
  optResult = collator->SetOptions(options);
  ASSERT_TRUE(optResult.isOk());
  ASSERT_EQ(collator->CompareStrings(u"a", u"b"), -1);
  ASSERT_EQ(collator->CompareStrings(u"a", u"á"), 0);
  ASSERT_EQ(collator->CompareStrings(u"a", u"A"), 0);

  // Accent: base letters and accents differ. a ≠ b, a ≠ á, a = A.
  options.sensitivity = Collator::Sensitivity::Accent;
  optResult = collator->SetOptions(options);
  ASSERT_TRUE(optResult.isOk());
  ASSERT_EQ(collator->CompareStrings(u"a", u"b"), -1);
  ASSERT_EQ(collator->CompareStrings(u"a", u"á"), -1);
  ASSERT_EQ(collator->CompareStrings(u"a", u"A"), 0);

  // Case: base letters and case differ. a ≠ b, a = á, a ≠ A.
  options.sensitivity = Collator::Sensitivity::Case;
  optResult = collator->SetOptions(options);
  ASSERT_TRUE(optResult.isOk());
  ASSERT_EQ(collator->CompareStrings(u"a", u"b"), -1);
  ASSERT_EQ(collator->CompareStrings(u"a", u"á"), 0);
  ASSERT_EQ(collator->CompareStrings(u"a", u"A"), -1);

  // Variant: base letters, accents and case all differ. a ≠ b, a ≠ á, a ≠ A.
  options.sensitivity = Collator::Sensitivity::Variant;
  optResult = collator->SetOptions(options);
  ASSERT_TRUE(optResult.isOk());
  ASSERT_EQ(collator->CompareStrings(u"a", u"b"), -1);
  ASSERT_EQ(collator->CompareStrings(u"a", u"á"), -1);
  ASSERT_EQ(collator->CompareStrings(u"a", u"A"), -1);
}
|
||||
|
||||
TEST(IntlCollator, LocaleSensitiveCollations)
{
  UniquePtr<Collator> collator = nullptr;

  // Replaces `collator` with a fresh base-sensitivity collator for `locale`.
  //
  // The ASSERT_* macros only return from the enclosing function, which here is
  // this lambda rather than the test body. Every call site is therefore
  // wrapped in ASSERT_NO_FATAL_FAILURE so that a failed creation aborts the
  // test instead of letting it continue with a null or stale collator.
  auto changeLocale = [&](const char* locale) {
    auto result = Collator::TryCreate(locale);
    ASSERT_TRUE(result.isOk());
    collator = result.unwrap();

    Collator::Options options{};
    options.sensitivity = Collator::Sensitivity::Base;
    auto optResult = collator->SetOptions(options);
    ASSERT_TRUE(optResult.isOk());
  };

  // Swedish treats "Ö" as a separate character, which sorts after "Z".
  ASSERT_NO_FATAL_FAILURE(changeLocale("en-US"));
  ASSERT_EQ(collator->CompareStrings(u"Österreich", u"Västervik"), -1);
  ASSERT_NO_FATAL_FAILURE(changeLocale("sv-SE"));
  ASSERT_EQ(collator->CompareStrings(u"Österreich", u"Västervik"), 1);

  // Country names in their respective scripts.
  auto china = MakeStringSpan(u"中国");
  auto japan = MakeStringSpan(u"日本");
  auto korea = MakeStringSpan(u"한국");

  ASSERT_NO_FATAL_FAILURE(changeLocale("en-US"));
  ASSERT_EQ(collator->CompareStrings(china, japan), -1);
  ASSERT_EQ(collator->CompareStrings(china, korea), 1);
  ASSERT_NO_FATAL_FAILURE(changeLocale("zh"));
  ASSERT_EQ(collator->CompareStrings(china, japan), 1);
  ASSERT_EQ(collator->CompareStrings(china, korea), -1);
  ASSERT_NO_FATAL_FAILURE(changeLocale("ja"));
  ASSERT_EQ(collator->CompareStrings(china, japan), -1);
  ASSERT_EQ(collator->CompareStrings(china, korea), -1);
  ASSERT_NO_FATAL_FAILURE(changeLocale("ko"));
  ASSERT_EQ(collator->CompareStrings(china, japan), 1);
  ASSERT_EQ(collator->CompareStrings(china, korea), -1);
}
|
||||
|
||||
TEST(IntlCollator, IgnorePunctuation)
{
  // Checks that toggling Options::ignorePunctuation changes how a string with
  // a leading "." compares against a plain one.
  auto result = Collator::TryCreate("en-US");
  ASSERT_TRUE(result.isOk());
  auto collator = result.unwrap();
  Collator::Options options{};
  options.ignorePunctuation = true;

  auto optResult = collator->SetOptions(options);
  ASSERT_TRUE(optResult.isOk());

  // With punctuation ignored the comparison is effectively "aa" vs "bb".
  ASSERT_EQ(collator->CompareStrings(u"aa", u".bb"), -1);

  options.ignorePunctuation = false;
  optResult = collator->SetOptions(options);
  ASSERT_TRUE(optResult.isOk());

  // With punctuation significant, ".bb" is expected to sort before "aa".
  ASSERT_EQ(collator->CompareStrings(u"aa", u".bb"), 1);
}
|
||||
|
||||
} // namespace mozilla::intl
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
UNIFIED_SOURCES += [
|
||||
"TestCalendar.cpp",
|
||||
"TestCollator.cpp",
|
||||
"TestDateTimeFormat.cpp",
|
||||
"TestNumberFormat.cpp",
|
||||
"TestPluralRules.cpp",
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
EXPORTS.mozilla.intl = [
|
||||
"src/Calendar.h",
|
||||
"src/Collator.h",
|
||||
"src/DateTimeFormat.h",
|
||||
"src/DateTimePatternGenerator.h",
|
||||
"src/ICU4CGlue.h",
|
||||
|
@ -14,8 +15,10 @@ EXPORTS.mozilla.intl = [
|
|||
|
||||
UNIFIED_SOURCES += [
|
||||
"src/Calendar.cpp",
|
||||
"src/Collator.cpp",
|
||||
"src/DateTimeFormat.cpp",
|
||||
"src/DateTimePatternGenerator.cpp",
|
||||
"src/ICU4CGlue.cpp",
|
||||
"src/NumberFormat.cpp",
|
||||
"src/NumberFormatFields.cpp",
|
||||
"src/NumberFormatterSkeleton.cpp",
|
||||
|
|
|
@ -0,0 +1,253 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include "mozilla/intl/Collator.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
// Wraps an already-opened UCollator. The destructor below closes it, so the
// new object becomes responsible for the handle. A null handle is a caller
// bug and is asserted against.
Collator::Collator(UCollator* aCollator) : mCollator(aCollator) {
  MOZ_ASSERT(aCollator);
}

// Closes the wrapped UCollator, if there is one.
Collator::~Collator() {
  if (UCollator* raw = mCollator.GetMut()) {
    ucol_close(raw);
  }
}
|
||||
|
||||
// Opens an ICU collator for aLocale and wraps it in a Collator.
//
// Returns ICUError::OutOfMemory when ICU reports U_MEMORY_ALLOCATION_ERROR and
// ICUError::InternalError for every other ICU failure status.
Result<UniquePtr<Collator>, ICUError> Collator::TryCreate(const char* aLocale) {
  UErrorCode status = U_ZERO_ERROR;
  UCollator* collator = ucol_open(aLocale, &status);
  if (U_SUCCESS(status)) {
    return MakeUnique<Collator>(collator);
  }
  if (status == U_MEMORY_ALLOCATION_ERROR) {
    return Err(ICUError::OutOfMemory);
  }
  return Err(ICUError::InternalError);
}
|
||||
|
||||
// Compares two UTF-16 strings with the configured collator and reports the
// ordering as -1 (aSource sorts first), 0 (equal) or 1 (aSource sorts last).
int32_t Collator::CompareStrings(Span<const char16_t> aSource,
                                 Span<const char16_t> aTarget) const {
  const auto sourceLength = static_cast<int32_t>(aSource.size());
  const auto targetLength = static_cast<int32_t>(aTarget.size());

  const auto order = ucol_strcoll(mCollator.GetConst(), aSource.data(),
                                  sourceLength, aTarget.data(), targetLength);
  if (order == UCOL_LESS) {
    return -1;
  }
  if (order == UCOL_EQUAL) {
    return 0;
  }
  if (order == UCOL_GREATER) {
    return 1;
  }

  MOZ_ASSERT_UNREACHABLE("ucol_strcoll returned bad UCollationResult");
  return 0;
}
|
||||
|
||||
// Orders two previously generated sort keys, returning -1, 0 or 1.
//
// The shared prefix is compared with strncmp. When that prefix is identical,
// the shorter key sorts first.
int32_t Collator::CompareSortKeys(Span<const uint8_t> aKey1,
                                  Span<const uint8_t> aKey2) const {
  const size_t length1 = aKey1.Length();
  const size_t length2 = aKey2.Length();
  const size_t sharedLength = std::min(length1, length2);

  const int32_t prefixOrder =
      strncmp(reinterpret_cast<const char*>(aKey1.Elements()),
              reinterpret_cast<const char*>(aKey2.Elements()), sharedLength);
  if (prefixOrder != 0) {
    return prefixOrder < 0 ? -1 : 1;
  }

  if (length1 == length2) {
    return 0;
  }
  // One key is a prefix of the other: the longer key comes later.
  return length1 > length2 ? 1 : -1;
}
|
||||
|
||||
// Translates Collator::CaseFirst into the matching ICU attribute value.
// CaseFirst::False maps to UCOL_OFF.
static UColAttributeValue CaseFirstToICU(Collator::CaseFirst caseFirst) {
  if (caseFirst == Collator::CaseFirst::False) {
    return UCOL_OFF;
  }
  if (caseFirst == Collator::CaseFirst::Upper) {
    return UCOL_UPPER_FIRST;
  }
  if (caseFirst == Collator::CaseFirst::Lower) {
    return UCOL_LOWER_FIRST;
  }

  // Only reachable with a value outside the enumeration.
  MOZ_ASSERT_UNREACHABLE();
  return UCOL_DEFAULT;
}
|
||||
|
||||
// Define this as a macro to work around exposing the UColAttributeValue type to
|
||||
// the header file. Collator::Feature is private to the class.
|
||||
#define FEATURE_TO_ICU(featureICU, feature) \
|
||||
switch (feature) { \
|
||||
case Collator::Feature::On: \
|
||||
(featureICU) = UCOL_ON; \
|
||||
break; \
|
||||
case Collator::Feature::Off: \
|
||||
(featureICU) = UCOL_OFF; \
|
||||
break; \
|
||||
case Collator::Feature::Default: \
|
||||
(featureICU) = UCOL_DEFAULT; \
|
||||
break; \
|
||||
}
|
||||
|
||||
// Applies the requested comparison strength to the underlying ICU collator.
// Strength::Default maps to UCOL_DEFAULT_STRENGTH.
void Collator::SetStrength(Collator::Strength aStrength) {
  // Start from the default strength so the value handed to ICU is never
  // indeterminate, even if aStrength holds a value outside the enumeration.
  UColAttributeValue strength = UCOL_DEFAULT_STRENGTH;
  switch (aStrength) {
    case Collator::Strength::Default:
      strength = UCOL_DEFAULT_STRENGTH;
      break;
    case Collator::Strength::Primary:
      strength = UCOL_PRIMARY;
      break;
    case Collator::Strength::Secondary:
      strength = UCOL_SECONDARY;
      break;
    case Collator::Strength::Tertiary:
      strength = UCOL_TERTIARY;
      break;
    case Collator::Strength::Quaternary:
      strength = UCOL_QUATERNARY;
      break;
    case Collator::Strength::Identical:
      strength = UCOL_IDENTICAL;
      break;
  }

  ucol_setStrength(mCollator.GetMut(), strength);
}
|
||||
|
||||
// Sets ICU's UCOL_CASE_LEVEL attribute to on, off or its default, and reports
// the ICU status as an ICUResult.
ICUResult Collator::SetCaseLevel(Collator::Feature aFeature) {
  UErrorCode status = U_ZERO_ERROR;
  // Initialized so ICU never receives an indeterminate value if aFeature
  // matches none of the cases in FEATURE_TO_ICU.
  UColAttributeValue featureICU = UCOL_DEFAULT;
  FEATURE_TO_ICU(featureICU, aFeature);
  ucol_setAttribute(mCollator.GetMut(), UCOL_CASE_LEVEL, featureICU, &status);
  return ToICUResult(status);
}
|
||||
|
||||
// Sets ICU's UCOL_ALTERNATE_HANDLING attribute and reports the ICU status as
// an ICUResult.
ICUResult Collator::SetAlternateHandling(
    Collator::AlternateHandling aAlternateHandling) {
  UErrorCode status = U_ZERO_ERROR;
  // Initialized so ICU never receives an indeterminate value if
  // aAlternateHandling holds a value outside the enumeration.
  UColAttributeValue handling = UCOL_DEFAULT;
  switch (aAlternateHandling) {
    case Collator::AlternateHandling::NonIgnorable:
      handling = UCOL_NON_IGNORABLE;
      break;
    case Collator::AlternateHandling::Shifted:
      handling = UCOL_SHIFTED;
      break;
    case Collator::AlternateHandling::Default:
      handling = UCOL_DEFAULT;
      break;
  }

  ucol_setAttribute(mCollator.GetMut(), UCOL_ALTERNATE_HANDLING, handling,
                    &status);
  return ToICUResult(status);
}
|
||||
|
||||
// Sets ICU's UCOL_NUMERIC_COLLATION attribute to on, off or its default, and
// reports the ICU status as an ICUResult.
ICUResult Collator::SetNumericCollation(Collator::Feature aFeature) {
  UErrorCode status = U_ZERO_ERROR;
  // Initialized so ICU never receives an indeterminate value if aFeature
  // matches none of the cases in FEATURE_TO_ICU.
  UColAttributeValue featureICU = UCOL_DEFAULT;
  FEATURE_TO_ICU(featureICU, aFeature);

  ucol_setAttribute(mCollator.GetMut(), UCOL_NUMERIC_COLLATION, featureICU,
                    &status);
  return ToICUResult(status);
}
|
||||
|
||||
// Sets ICU's UCOL_NORMALIZATION_MODE attribute to on, off or its default, and
// reports the ICU status as an ICUResult.
ICUResult Collator::SetNormalizationMode(Collator::Feature aFeature) {
  UErrorCode status = U_ZERO_ERROR;
  // Initialized so ICU never receives an indeterminate value if aFeature
  // matches none of the cases in FEATURE_TO_ICU.
  UColAttributeValue featureICU = UCOL_DEFAULT;
  FEATURE_TO_ICU(featureICU, aFeature);
  ucol_setAttribute(mCollator.GetMut(), UCOL_NORMALIZATION_MODE, featureICU,
                    &status);
  return ToICUResult(status);
}
|
||||
|
||||
// Sets ICU's UCOL_CASE_FIRST attribute from the given CaseFirst option and
// reports the ICU status as an ICUResult.
ICUResult Collator::SetCaseFirst(Collator::CaseFirst aCaseFirst) {
  const UColAttributeValue caseFirstICU = CaseFirstToICU(aCaseFirst);

  UErrorCode status = U_ZERO_ERROR;
  ucol_setAttribute(mCollator.GetMut(), UCOL_CASE_FIRST, caseFirstICU,
                    &status);
  return ToICUResult(status);
}
|
||||
|
||||
// Applies an ECMA-402 style options bag to the underlying ICU collator.
//
// When aPrevOptions is supplied and matches aOptions field by field, nothing
// is touched and Ok() is returned. Otherwise the attributes are applied in
// order and the first error is returned; attributes set before the error stay
// applied.
ICUResult Collator::SetOptions(const Options& aOptions,
                               const Maybe<Options&> aPrevOptions) {
  if (aPrevOptions) {
    const bool unchanged =
        aPrevOptions->sensitivity == aOptions.sensitivity &&
        aPrevOptions->caseFirst == aOptions.caseFirst &&
        aPrevOptions->ignorePunctuation == aOptions.ignorePunctuation &&
        aPrevOptions->numeric == aOptions.numeric;
    if (unchanged) {
      return Ok();
    }
  }

  // Sensitivity is expressed as a strength plus an optional case level.
  Collator::Strength strength = Collator::Strength::Default;
  Collator::Feature caseLevel = Collator::Feature::Off;
  switch (aOptions.sensitivity) {
    case Collator::Sensitivity::Base:
      strength = Collator::Strength::Primary;
      break;
    case Collator::Sensitivity::Accent:
      strength = Collator::Strength::Secondary;
      break;
    case Collator::Sensitivity::Case:
      // Primary strength with the case level on: case matters, accents don't.
      strength = Collator::Strength::Primary;
      caseLevel = Collator::Feature::On;
      break;
    case Collator::Sensitivity::Variant:
      strength = Collator::Strength::Tertiary;
      break;
  }

  SetStrength(strength);

  // According to the ICU team, UCOL_SHIFTED causes punctuation to be
  // ignored. Looking at Unicode Technical Report 35, Unicode Locale Data
  // Markup Language, "shifted" causes whitespace and punctuation to be
  // ignored - that's a bit more than asked for, but there's no way to get
  // less.
  const Collator::AlternateHandling alternate =
      aOptions.ignorePunctuation ? Collator::AlternateHandling::Shifted
                                 : Collator::AlternateHandling::Default;
  if (ICUResult rv = this->SetAlternateHandling(alternate); rv.isErr()) {
    return rv;
  }

  if (ICUResult rv = SetCaseLevel(caseLevel); rv.isErr()) {
    return rv;
  }

  const Collator::Feature numeric =
      aOptions.numeric ? Collator::Feature::On : Collator::Feature::Off;
  if (ICUResult rv = SetNumericCollation(numeric); rv.isErr()) {
    return rv;
  }

  // Normalization is always on to meet the canonical equivalence requirement.
  if (ICUResult rv = SetNormalizationMode(Collator::Feature::On);
      rv.isErr()) {
    return rv;
  }

  // Last step: its result, success or error, is the overall result.
  return SetCaseFirst(aOptions.caseFirst);
}
|
||||
|
||||
#undef FEATURE_TO_ICU
|
||||
|
||||
} // namespace mozilla::intl
|
|
@ -0,0 +1,255 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
#ifndef intl_components_Collator_h_
|
||||
#define intl_components_Collator_h_
|
||||
|
||||
#ifndef JS_STANDALONE
|
||||
# include "gtest/MozGtestFriend.h"
|
||||
#endif
|
||||
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
#include "mozilla/intl/ICU4CGlue.h"
|
||||
#include "mozilla/Result.h"
|
||||
#include "mozilla/Span.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
class Collator final {
|
||||
public:
|
||||
/**
|
||||
* Construct from a raw UCollator. This is public so that the UniquePtr can
|
||||
* access it.
|
||||
*/
|
||||
explicit Collator(UCollator* aCollator);
|
||||
|
||||
// Do not allow copy as this class owns the ICU resource. Move is not
|
||||
// currently implemented, but a custom move operator could be created if
|
||||
// needed.
|
||||
Collator(const Collator&) = delete;
|
||||
Collator& operator=(const Collator&) = delete;
|
||||
|
||||
/**
|
||||
* Attempt to initialize a new collator.
|
||||
*/
|
||||
static Result<UniquePtr<Collator>, ICUError> TryCreate(const char* aLocale);
|
||||
|
||||
~Collator();
|
||||
|
||||
template <typename B>
|
||||
ICUResult GetSortKey(Span<const char16_t> aString, B& aBuffer) const {
|
||||
static_assert(std::is_same_v<typename B::CharType, uint8_t>,
|
||||
"Expected a uint8_t* buffer.");
|
||||
// Do not use FillBufferWithICUCall, as this API does not report the
|
||||
// U_BUFFER_OVERFLOW_ERROR. The return value is always the number of bytes
|
||||
// needed, regardless of whether the result buffer was big enough.
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t length =
|
||||
ucol_getSortKey(mCollator.GetConst(), aString.data(),
|
||||
static_cast<int32_t>(aString.size()), nullptr, 0);
|
||||
if (U_FAILURE(status) || length == 0) {
|
||||
// If the length is 0, and internal error occurred according to the docs.
|
||||
return Err(ICUError::InternalError);
|
||||
}
|
||||
|
||||
if (!aBuffer.reserve(length)) {
|
||||
return Err(ICUError::OutOfMemory);
|
||||
}
|
||||
|
||||
length = ucol_getSortKey(mCollator.GetConst(), aString.data(),
|
||||
aString.size(), aBuffer.data(), length);
|
||||
|
||||
if (U_FAILURE(status) || length == 0) {
|
||||
return Err(ICUError::InternalError);
|
||||
}
|
||||
|
||||
aBuffer.written(length);
|
||||
return Ok();
|
||||
}
|
||||
|
||||
int32_t CompareStrings(Span<const char16_t> aSource,
|
||||
Span<const char16_t> aTarget) const;
|
||||
|
||||
int32_t CompareSortKeys(Span<const uint8_t> aKey1,
|
||||
Span<const uint8_t> aKey2) const;
|
||||
|
||||
/**
|
||||
* Determine how casing affects sorting. These options map to ECMA 402
|
||||
* collator options.
|
||||
*
|
||||
* https://tc39.es/ecma402/#sec-initializecollator
|
||||
*/
|
||||
enum class CaseFirst {
|
||||
// Sort upper case first.
|
||||
Upper,
|
||||
// Sort lower case first.
|
||||
Lower,
|
||||
// Orders upper and lower case letters in accordance to their tertiary
|
||||
// weights.
|
||||
False,
|
||||
};
|
||||
|
||||
/**
|
||||
* Which differences in the strings should lead to differences in collation
|
||||
* comparisons.
|
||||
*
|
||||
* This setting needs to be ECMA 402 compliant.
|
||||
* https://tc39.es/ecma402/#sec-collator-comparestrings
|
||||
*/
|
||||
enum class Sensitivity {
|
||||
// Only strings that differ in base letters compare as unequal.
|
||||
// Examples: a ≠ b, a = á, a = A.
|
||||
Base,
|
||||
// Only strings that differ in base letters or accents and other diacritic
|
||||
// marks compare as unequal.
|
||||
// Examples: a ≠ b, a ≠ á, a = A.
|
||||
Accent,
|
||||
// Only strings that differ in base letters or case compare as unequal.
|
||||
// Examples: a ≠ b, a = á, a ≠ A.
|
||||
Case,
|
||||
// Strings that differ in base letters, accents and other diacritic marks,
|
||||
// or case compare as unequal. Other differences may also be taken into
|
||||
// consideration.
|
||||
// Examples: a ≠ b, a ≠ á, a ≠ A.
|
||||
Variant,
|
||||
};
|
||||
|
||||
/**
|
||||
* These options map to ECMA 402 collator options. Make sure the defaults map
|
||||
* to the default initialized values of ECMA 402.
|
||||
*
|
||||
* https://tc39.es/ecma402/#sec-initializecollator
|
||||
*/
|
||||
struct Options {
|
||||
Sensitivity sensitivity = Sensitivity::Variant;
|
||||
CaseFirst caseFirst = CaseFirst::False;
|
||||
bool ignorePunctuation = false;
|
||||
bool numeric = false;
|
||||
};
|
||||
|
||||
/**
|
||||
* Change the configuration of the options.
|
||||
*/
|
||||
ICUResult SetOptions(const Options& aOptions,
|
||||
const Maybe<Options&> aPrevOptions = Nothing());
|
||||
|
||||
private:
|
||||
/**
|
||||
* Toggle features, or use the default setting.
|
||||
*/
|
||||
enum class Feature {
|
||||
// Turn the feature on.
|
||||
On,
|
||||
// Turn the feature off.
|
||||
Off,
|
||||
// Use the default setting for the feature.
|
||||
Default,
|
||||
};
|
||||
|
||||
/**
|
||||
* Attribute for handling variable elements.
|
||||
*/
|
||||
enum class AlternateHandling {
|
||||
// Treats all the codepoints with non-ignorable primary weights in the
|
||||
// same way (default)
|
||||
NonIgnorable,
|
||||
// Causes codepoints with primary weights that are equal or below the
|
||||
// variable top value to be ignored on primary level and moved to the
|
||||
// quaternary level.
|
||||
Shifted,
|
||||
Default,
|
||||
};
|
||||
|
||||
/**
|
||||
* The strength attribute.
|
||||
*
|
||||
* The usual strength for most locales (except Japanese) is tertiary.
|
||||
*
|
||||
* Quaternary strength is useful when combined with shifted setting for
|
||||
* alternate handling attribute and for JIS X 4061 collation, when it is used
|
||||
* to distinguish between Katakana and Hiragana. Otherwise, quaternary level
|
||||
* is affected only by the number of non-ignorable code points in the string.
|
||||
*
|
||||
* Identical strength is rarely useful, as it amounts to codepoints of the NFD
|
||||
* form of the string.
|
||||
*/
|
||||
enum class Strength {
|
||||
// Primary collation strength.
|
||||
Primary,
|
||||
// Secondary collation strength.
|
||||
Secondary,
|
||||
// Tertiary collation strength.
|
||||
Tertiary,
|
||||
// Quaternary collation strength.
|
||||
Quaternary,
|
||||
// Identical collation strength.
|
||||
Identical,
|
||||
Default,
|
||||
};
|
||||
|
||||
/**
|
||||
* Configure the Collator::Strength.
|
||||
*/
|
||||
void SetStrength(Strength strength);
|
||||
|
||||
/**
|
||||
* Configure Collator::AlternateHandling.
|
||||
*/
|
||||
ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling);
|
||||
|
||||
/**
|
||||
* Controls whether an extra case level (positioned before the third level) is
|
||||
* generated or not.
|
||||
*
|
||||
* Contents of the case level are affected by the value of CaseFirst
|
||||
* attribute. A simple way to ignore accent differences in a string is to set
|
||||
* the strength to Primary and enable case level.
|
||||
*/
|
||||
ICUResult SetCaseLevel(Feature aFeature);
|
||||
|
||||
/**
|
||||
* When turned on, this attribute makes substrings of digits sort according to
|
||||
* their numeric values.
|
||||
*
|
||||
* This is a way to get '100' to sort AFTER '2'. Note that the longest digit
|
||||
* substring that can be treated as a single unit is 254 digits (not counting
|
||||
* leading zeros). If a digit substring is longer than that, the digits beyond
|
||||
* the limit will be treated as a separate digit substring.
|
||||
*
|
||||
* A "digit" in this sense is a code point with General_Category=Nd, which
|
||||
* does not include circled numbers, roman numerals, etc. Only a contiguous
|
||||
* digit substring is considered, that is, non-negative integers without
|
||||
* separators. There is no support for plus/minus signs, decimals, exponents,
|
||||
* etc.
|
||||
*/
|
||||
ICUResult SetNumericCollation(Feature aFeature);
|
||||
|
||||
/**
|
||||
* Controls whether the normalization check and necessary normalizations are
|
||||
* performed.
|
||||
*
|
||||
* When off (default), no normalization check is performed. The correctness of
|
||||
* the result is guaranteed only if the input data is in so-called FCD form.
|
||||
* When set to on, an incremental check is performed to see whether the input
|
||||
* data is in the FCD form. If the data is not in the FCD form, incremental
|
||||
* NFD normalization is performed.
|
||||
*/
|
||||
ICUResult SetNormalizationMode(Feature aFeature);
|
||||
|
||||
/**
|
||||
* Configure Collator::CaseFirst.
|
||||
*/
|
||||
ICUResult SetCaseFirst(CaseFirst aCaseFirst);
|
||||
|
||||
#ifndef JS_STANDALONE
|
||||
FRIEND_TEST(IntlCollator, SetAttributesInternal);
|
||||
#endif
|
||||
|
||||
ICUPointer<UCollator> mCollator = ICUPointer<UCollator>(nullptr);
|
||||
Maybe<Sensitivity> mLastStrategy = Nothing();
|
||||
};
|
||||
|
||||
} // namespace mozilla::intl
|
||||
|
||||
#endif
|
|
@ -0,0 +1,19 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "mozilla/intl/ICU4CGlue.h"
|
||||
|
||||
namespace mozilla::intl {
|
||||
|
||||
// Maps an ICU status code onto ICUResult: any success status becomes Ok(),
// U_MEMORY_ALLOCATION_ERROR becomes ICUError::OutOfMemory, and every other
// failure becomes ICUError::InternalError.
ICUResult ToICUResult(UErrorCode status) {
  if (U_FAILURE(status)) {
    if (status == U_MEMORY_ALLOCATION_ERROR) {
      return Err(ICUError::OutOfMemory);
    }
    return Err(ICUError::InternalError);
  }
  return Ok();
}
|
||||
|
||||
} // namespace mozilla::intl
|
|
@ -27,6 +27,11 @@ struct InternalError {};
|
|||
|
||||
using ICUResult = Result<Ok, ICUError>;
|
||||
|
||||
/**
|
||||
* Convert a UErrorCode to ICUResult.
|
||||
*/
|
||||
ICUResult ToICUResult(UErrorCode status);
|
||||
|
||||
/**
|
||||
* The ICU status can complain about a string not being terminated, but this
|
||||
* is fine for this API, as it deals with the mozilla::Span that has a pointer
|
||||
|
@ -36,6 +41,28 @@ static inline bool ICUSuccessForStringSpan(UErrorCode status) {
|
|||
return U_SUCCESS(status) || status == U_STRING_NOT_TERMINATED_WARNING;
|
||||
}
|
||||
|
||||
/**
 * Holds a raw ICU pointer and hands it out either as a const pointer (from a
 * const holder) or as a mutable pointer (from a non-const holder). Callers
 * pick the accessor that matches the const-ness of the ICU API being called,
 * which in turn propagates const-ness into the mozilla::intl APIs.
 *
 * The class never frees the pointer. Moving copies the raw pointer value and
 * leaves the moved-from holder unchanged.
 */
template <typename T>
class ICUPointer {
 public:
  explicit ICUPointer(T* aPointer) : mPointer(aPointer) {}

  // Movable, but not copyable.
  ICUPointer(const ICUPointer&) = delete;
  ICUPointer& operator=(const ICUPointer&) = delete;
  ICUPointer(ICUPointer&& other) noexcept = default;
  ICUPointer& operator=(ICUPointer&& other) noexcept = default;

  // Read-only view of the pointee; usable on const holders.
  const T* GetConst() const { return mPointer; }
  // Mutable access to the pointee; requires a non-const holder.
  T* GetMut() { return mPointer; }

 private:
  T* mPointer;
};
|
||||
|
||||
/**
|
||||
* Calling into ICU with the C-API can be a bit tricky. This function wraps up
|
||||
* the relatively risky operations involving pointers, lengths, and buffers into
|
||||
|
|
Загрузка…
Ссылка в новой задаче