Bug 1719540 - Implement a unified Locale canonicalizer; r=platform-i18n-reviewers,dminor

I used the C API here, but verified that the behavior was the same
as using the C++ getBaseName API.

927fee5656/src/features/icu.cpp (L249-L268)

I tried to keep the implementation naming at least similar to the
work in: https://phabricator.services.mozilla.com/D118070

However, the implementations do diverge. I figure in the future, the
APIs can be made similar, but at least the gtests will ensure the
behavior remains the same.

Differential Revision: https://phabricator.services.mozilla.com/D120248
This commit is contained in:
Greg Tatum 2021-09-01 00:02:30 +00:00
Родитель 97b566d849
Коммит 443160a426
6 изменённых файлов: 144 добавлений и 2 удалений

Просмотреть файл

@ -0,0 +1,60 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "gtest/gtest.h"
#include "mozilla/intl/LocaleCanonicalizer.h"
#include "mozilla/Span.h"
namespace mozilla::intl {
/**
 * Canonicalizes `before` into the caller-provided `ascii` buffer, then asserts
 * that the call succeeded and that the buffer contents equal `after`.
 *
 * The gtest ASSERT_* macros used here return from this helper on failure; they
 * do not abort the calling test body.
 */
static void CheckLocaleResult(LocaleCanonicalizer::Vector& ascii,
                              const char* before, const char* after) {
  auto result = LocaleCanonicalizer::CanonicalizeICULevel1(before, ascii);
  ASSERT_TRUE(result.isOk());

  // Compare the output buffer with the expectation as spans, i.e. by length
  // and contents rather than by pointer value.
  const Span<const char> actual(ascii.begin(), ascii.length());
  const Span<const char> expected = MakeStringSpan(after);
  ASSERT_EQ(actual, expected);
}
/**
 * Exercises the level 1 canonicalization behavior described in:
 * http://userguide.icu-project.org/locale#TOC-Canonicalization
 */
TEST(IntlLocaleCanonicalizer, CanonicalizeICULevel1)
{
  // One output buffer is shared by every call in this test.
  LocaleCanonicalizer::Vector ascii{};

  // An input locale string paired with its expected canonical form.
  struct Expectation {
    const char* before;
    const char* after;
  };

  const Expectation kAccepted[] = {
      // Canonicalizes en-US
      {"en-US", "en_US"},
      // Canonicalizes POSIX
      {"en-US-posix", "en_US_POSIX"},
      // und gets changed to an empty string
      {"und", ""},
      // retains incorrect locales
      {"asdf", "asdf"},
      // makes text uppercase
      {"es-es", "es_ES"},
      // Converts 3 letter country codes to 2 letter.
      {"en-USA", "en_US"},
      // Does not perform level 2 canonicalization where the result would be
      // fr_FR@currency=EUR
      {"fr-fr@EURO", "fr_FR_EURO"},
      // Removes the .utf8 ends
      {"ar-MA.utf8", "ar_MA"},
      // Allows valid ascii inputs
      {"abcdefghijlkmnopqrstuvwxyzABCDEFGHIJLKMNOPQRSTUVWXYZ-_.0123456789",
       "abcdefghijlkmnopqrstuvwxyzabcdefghijlkmnopqrstuvwxyz__"},
      {"exotic ascii:", "exotic ascii:"},
  };

  for (const Expectation& expectation : kAccepted) {
    CheckLocaleResult(ascii, expectation.before, expectation.after);
  }

  // Does not accept non-ascii inputs.
  const char* const kRejected[] = {"👍", "ᏣᎳᎩ"};
  for (const char* input : kRejected) {
    ASSERT_EQ(
        LocaleCanonicalizer::CanonicalizeICULevel1(input, ascii).unwrapErr(),
        ICUError::InternalError);
  }
}
} // namespace mozilla::intl

Просмотреть файл

@ -8,6 +8,7 @@ UNIFIED_SOURCES += [
"TestCalendar.cpp",
"TestCollator.cpp",
"TestDateTimeFormat.cpp",
"TestLocaleCanonicalizer.cpp",
"TestNumberFormat.cpp",
"TestPluralRules.cpp",
"TestRelativeTimeFormat.cpp",

Просмотреть файл

@ -10,6 +10,7 @@ EXPORTS.mozilla.intl = [
"src/DateTimePatternGenerator.h",
"src/ICU4CGlue.h",
"src/ICUError.h",
"src/LocaleCanonicalizer.h",
"src/NumberFormat.h",
"src/NumberFormatFields.h",
"src/NumberRangeFormat.h",
@ -23,6 +24,7 @@ UNIFIED_SOURCES += [
"src/DateTimeFormat.cpp",
"src/DateTimePatternGenerator.cpp",
"src/ICU4CGlue.cpp",
"src/LocaleCanonicalizer.cpp",
"src/NumberFormat.cpp",
"src/NumberFormatFields.cpp",
"src/NumberFormatFieldsUtil.cpp",

Просмотреть файл

@ -108,8 +108,8 @@ static ICUResult FillBufferWithICUCall(Buffer& buffer,
* A variant of FillBufferWithICUCall that accepts a mozilla::Vector rather than
* a Buffer.
*/
template <typename ICUStringFunction, size_t InlineSize>
static ICUResult FillVectorWithICUCall(Vector<char16_t, InlineSize>& vector,
template <typename ICUStringFunction, size_t InlineSize, typename CharType>
static ICUResult FillVectorWithICUCall(Vector<CharType, InlineSize>& vector,
const ICUStringFunction& strFn) {
UErrorCode status = U_ZERO_ERROR;
int32_t length = strFn(vector.begin(), vector.capacity(), &status);

Просмотреть файл

@ -0,0 +1,36 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozilla/intl/LocaleCanonicalizer.h"
#include <cstdio>
#include "unicode/uloc.h"
namespace mozilla::intl {
/**
 * Canonicalizes aLocaleIn with ICU's uloc_canonicalize, writing the result
 * into aLocaleOut, and then rejects any output that contains a byte above 127.
 *
 * Returns Ok() on success, the error produced by FillVectorWithICUCall when
 * the ICU call fails, or ICUError::InternalError when the output is not ASCII.
 */
/* static */
ICUResult LocaleCanonicalizer::CanonicalizeICULevel1(
    const char* aLocaleIn, LocaleCanonicalizer::Vector& aLocaleOut) {
  // Adapter with the (buffer, capacity, status) shape that
  // FillVectorWithICUCall invokes.
  const auto canonicalize = [aLocaleIn](char* aTarget, int32_t aCapacity,
                                        UErrorCode* aStatus) {
    return uloc_canonicalize(aLocaleIn, aTarget, aCapacity, aStatus);
  };

  auto fillResult = FillVectorWithICUCall(aLocaleOut, canonicalize);
  if (fillResult.isErr()) {
    return Err(fillResult.unwrapErr());
  }

  // The ICU4C canonicalization step does not perform this check itself, but
  // consumers expect the result to be ASCII, so any byte with its high bit set
  // (value > 127) is treated as an error.
  for (size_t i = 0; i < aLocaleOut.length(); ++i) {
    if ((static_cast<unsigned char>(aLocaleOut[i]) & 0x80) != 0) {
      return Err(ICUError::InternalError);
    }
  }

  return Ok();
}
} // namespace mozilla::intl

Просмотреть файл

@ -0,0 +1,43 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// The guard is named after this header (LocaleCanonicalizer.h) so that it
// cannot collide with the guard of a sibling Locale.h.
#ifndef intl_components_LocaleCanonicalizer_h_
#define intl_components_LocaleCanonicalizer_h_

#include "mozilla/intl/ICU4CGlue.h"
#include "mozilla/Span.h"
#include "mozilla/Vector.h"

namespace mozilla::intl {

/**
 * 32 is somewhat an arbitrary size, but it should fit most locales on the
 * stack to avoid heap allocations.
 */
constexpr size_t INITIAL_LOCALE_CANONICALIZER_BUFFER_SIZE = 32;

/**
 * Eventually this class will unify the behaviors of Locale Canonicalization.
 * See Bug 1723586.
 */
class LocaleCanonicalizer {
 public:
  /**
   * Output buffer type for canonicalized locales: a char vector whose inline
   * capacity is INITIAL_LOCALE_CANONICALIZER_BUFFER_SIZE.
   */
  using Vector =
      mozilla::Vector<char, INITIAL_LOCALE_CANONICALIZER_BUFFER_SIZE>;

  /**
   * This static method will canonicalize a locale string, per the Level 1
   * canonicalization steps outlined in:
   * http://userguide.icu-project.org/locale#TOC-Canonicalization
   *
   * For instance it will turn the string "en-US" to "en_US". It guarantees that
   * the string span targeted will be in the ASCII range. The canonicalization
   * process on ICU is somewhat permissive in what it accepts as input, but only
   * ASCII locales are technically correct.
   *
   * @param aLocale    The locale string to canonicalize.
   * @param aLocaleOut The vector that receives the canonicalized locale.
   * @return Ok() on success, or an ICUError if canonicalization fails or the
   *         result is not ASCII.
   */
  static ICUResult CanonicalizeICULevel1(
      const char* aLocale, LocaleCanonicalizer::Vector& aLocaleOut);
};

}  // namespace mozilla::intl

#endif  // intl_components_LocaleCanonicalizer_h_