зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1570370 - Part 1: Port Unicode BCP 47 locale identifier parser to C++. r=jwalden
Differential Revision: https://phabricator.services.mozilla.com/D40067 --HG-- extra : moz-landing-system : lando
This commit is contained in:
Родитель
8b869f9cf6
Коммит
4a8f76c4de
|
@ -4,7 +4,9 @@ build/clang-plugin/.*
|
|||
config/gcc-stl-wrapper.template.h
|
||||
config/msvc-stl-wrapper.template.h
|
||||
# Generated code
|
||||
js/src/builtin/intl/LanguageTagGenerated.cpp
|
||||
js/src/builtin/intl/TimeZoneDataGenerated.h
|
||||
js/src/builtin/intl/UnicodeExtensionsGenerated.cpp
|
||||
|
||||
# Don't want to reformat irregexp. bug 1510128
|
||||
js/src/irregexp/.*
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,689 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
* vim: set ts=8 sts=2 et sw=2 tw=80:
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
/* Structured representation of Unicode locale IDs used with Intl functions. */
|
||||
|
||||
#ifndef builtin_intl_LanguageTag_h
|
||||
#define builtin_intl_LanguageTag_h
|
||||
|
||||
#include "mozilla/Assertions.h"
|
||||
#include "mozilla/Range.h"
|
||||
#include "mozilla/TextUtils.h"
|
||||
#include "mozilla/TypedEnumBits.h"
|
||||
#include "mozilla/Variant.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <utility>
|
||||
|
||||
#include "js/AllocPolicy.h"
|
||||
#include "js/GCAPI.h"
|
||||
#include "js/Result.h"
|
||||
#include "js/Utility.h"
|
||||
#include "js/Vector.h"
|
||||
|
||||
struct JSContext;
|
||||
class JSLinearString;
|
||||
class JSString;
|
||||
|
||||
namespace js {
|
||||
|
||||
class StringBuffer;
|
||||
|
||||
namespace intl {
|
||||
|
||||
#ifdef DEBUG
|
||||
|
||||
/**
|
||||
* Return true if |language| is a valid, case-normalized language subtag.
|
||||
*/
|
||||
template <typename CharT>
|
||||
bool IsStructurallyValidLanguageTag(
|
||||
const mozilla::Range<const CharT>& language);
|
||||
|
||||
/**
|
||||
* Return true if |script| is a valid, case-normalized script subtag.
|
||||
*/
|
||||
template <typename CharT>
|
||||
bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script);
|
||||
|
||||
/**
|
||||
* Return true if |region| is a valid, case-normalized region subtag.
|
||||
*/
|
||||
template <typename CharT>
|
||||
bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region);
|
||||
|
||||
/**
|
||||
* Return true if |variant| is a valid, case-normalized variant subtag.
|
||||
*/
|
||||
bool IsStructurallyValidVariantTag(const mozilla::Range<const char>& variant);
|
||||
|
||||
/**
|
||||
* Return true if |extension| is a valid, case-normalized Unicode extension
|
||||
* subtag.
|
||||
*/
|
||||
bool IsStructurallyValidUnicodeExtensionTag(
|
||||
const mozilla::Range<const char>& extension);
|
||||
|
||||
/**
|
||||
* Return true if |privateUse| is a valid, case-normalized private-use subtag.
|
||||
*/
|
||||
bool IsStructurallyValidPrivateUseTag(
|
||||
const mozilla::Range<const char>& privateUse);
|
||||
|
||||
#endif
|
||||
|
||||
template <typename CharT>
|
||||
char AsciiToLowerCase(CharT c) {
|
||||
MOZ_ASSERT(mozilla::IsAscii(c));
|
||||
return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
char AsciiToUpperCase(CharT c) {
|
||||
MOZ_ASSERT(mozilla::IsAscii(c));
|
||||
return mozilla::IsAsciiLowercaseAlpha(c) ? (c & ~0x20) : c;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
void AsciiToLowerCase(CharT* chars, size_t length, char* dest) {
|
||||
// Tell the analysis the |std::transform| function can't GC.
|
||||
JS::AutoSuppressGCAnalysis nogc;
|
||||
|
||||
char (&fn)(CharT) = AsciiToLowerCase;
|
||||
std::transform(chars, chars + length, dest, fn);
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
void AsciiToUpperCase(CharT* chars, size_t length, char* dest) {
|
||||
// Tell the analysis the |std::transform| function can't GC.
|
||||
JS::AutoSuppressGCAnalysis nogc;
|
||||
|
||||
char (&fn)(CharT) = AsciiToUpperCase;
|
||||
std::transform(chars, chars + length, dest, fn);
|
||||
}
|
||||
|
||||
/**
 * Title-case |length| ASCII code units starting at |chars| into |dest|: the
 * first code unit is upper-cased, all remaining ones are lower-cased. Does
 * nothing when |length| is zero.
 */
template <typename CharT>
void AsciiToTitleCase(CharT* chars, size_t length, char* dest) {
  if (length == 0) {
    return;
  }

  AsciiToUpperCase(chars, 1, dest);
  AsciiToLowerCase(chars + 1, length - 1, dest + 1);
}
|
||||
|
||||
// Constants for language subtag lengths.
namespace LanguageTagLimits {

// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
static constexpr size_t LanguageLength = 8;

// unicode_script_subtag = alpha{4} ;
static constexpr size_t ScriptLength = 4;

// unicode_region_subtag = (alpha{2} | digit{3}) ;
static constexpr size_t RegionLength = 3;
static constexpr size_t AlphaRegionLength = 2;
static constexpr size_t DigitRegionLength = 3;

// |RegionLength| is the storage size for a region subtag, so it has to be
// exactly as large as the longer of the two region forms.
static_assert(RegionLength == (AlphaRegionLength > DigitRegionLength
                                   ? AlphaRegionLength
                                   : DigitRegionLength),
              "RegionLength must match the longest region subtag form");

// key = alphanum alpha ;
static constexpr size_t UnicodeKeyLength = 2;

// tkey = alpha digit ;
static constexpr size_t TransformKeyLength = 2;

}  // namespace LanguageTagLimits
|
||||
|
||||
// Fixed size language subtag which is stored inline in LanguageTag.
|
||||
template <size_t Length>
|
||||
class LanguageTagSubtag final {
|
||||
uint8_t length_ = 0;
|
||||
char chars_[Length];
|
||||
|
||||
public:
|
||||
LanguageTagSubtag() = default;
|
||||
|
||||
LanguageTagSubtag(const LanguageTagSubtag&) = delete;
|
||||
LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete;
|
||||
|
||||
size_t length() const { return length_; }
|
||||
|
||||
mozilla::Range<const char> range() const { return {chars_, length_}; }
|
||||
|
||||
template <typename CharT>
|
||||
void set(const mozilla::Range<const CharT>& str) {
|
||||
MOZ_ASSERT(str.length() <= Length);
|
||||
std::copy_n(str.begin().get(), str.length(), chars_);
|
||||
length_ = str.length();
|
||||
}
|
||||
|
||||
void toLowerCase() { AsciiToLowerCase(chars_, length(), chars_); }
|
||||
|
||||
void toUpperCase() { AsciiToUpperCase(chars_, length(), chars_); }
|
||||
|
||||
void toTitleCase() { AsciiToTitleCase(chars_, length(), chars_); }
|
||||
|
||||
template <size_t N>
|
||||
bool equalTo(const char (&str)[N]) const {
|
||||
static_assert(N - 1 <= Length,
|
||||
"subtag literals must not exceed the maximum subtag length");
|
||||
|
||||
return length_ == N - 1 && memcmp(chars_, str, N - 1) == 0;
|
||||
}
|
||||
};
|
||||
|
||||
using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
|
||||
using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
|
||||
using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
|
||||
|
||||
/**
|
||||
* Object representing a language tag.
|
||||
*
|
||||
* All subtags are already in canonicalized case.
|
||||
*/
|
||||
class MOZ_STACK_CLASS LanguageTag final {
|
||||
LanguageSubtag language_ = {};
|
||||
ScriptSubtag script_ = {};
|
||||
RegionSubtag region_ = {};
|
||||
|
||||
using VariantsVector = Vector<JS::UniqueChars, 2>;
|
||||
using ExtensionsVector = Vector<JS::UniqueChars, 2>;
|
||||
|
||||
VariantsVector variants_;
|
||||
ExtensionsVector extensions_;
|
||||
JS::UniqueChars privateuse_ = nullptr;
|
||||
|
||||
friend class LanguageTagParser;
|
||||
|
||||
public:
|
||||
// Flag to request canonicalized Unicode extensions.
|
||||
enum class UnicodeExtensionCanonicalForm : bool { No, Yes };
|
||||
|
||||
private:
|
||||
bool canonicalizeUnicodeExtension(
|
||||
JSContext* cx, JS::UniqueChars& unicodeExtension,
|
||||
UnicodeExtensionCanonicalForm canonicalForm);
|
||||
|
||||
bool canonicalizeTransformExtension(JSContext* cx,
|
||||
JS::UniqueChars& transformExtension);
|
||||
|
||||
public:
|
||||
static bool languageMapping(LanguageSubtag& language);
|
||||
static bool complexLanguageMapping(const LanguageSubtag& language);
|
||||
|
||||
private:
|
||||
static bool regionMapping(RegionSubtag& region);
|
||||
static bool complexRegionMapping(const RegionSubtag& region);
|
||||
|
||||
void performComplexLanguageMappings();
|
||||
void performComplexRegionMappings();
|
||||
|
||||
MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx);
|
||||
|
||||
static const char* replaceUnicodeExtensionType(
|
||||
const mozilla::Range<const char>& key,
|
||||
const mozilla::Range<const char>& type);
|
||||
|
||||
public:
|
||||
explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {}
|
||||
|
||||
LanguageTag(const LanguageTag&) = delete;
|
||||
LanguageTag& operator=(const LanguageTag&) = delete;
|
||||
|
||||
const LanguageSubtag& language() const { return language_; }
|
||||
const ScriptSubtag& script() const { return script_; }
|
||||
const RegionSubtag& region() const { return region_; }
|
||||
const auto& variants() const { return variants_; }
|
||||
const auto& extensions() const { return extensions_; }
|
||||
const char* privateuse() const { return privateuse_.get(); }
|
||||
|
||||
/**
|
||||
* Set the language subtag. The input must be a valid, case-normalized
|
||||
* language subtag.
|
||||
*/
|
||||
template <size_t N>
|
||||
void setLanguage(const char (&language)[N]) {
|
||||
mozilla::Range<const char> range(language, N - 1);
|
||||
MOZ_ASSERT(IsStructurallyValidLanguageTag(range));
|
||||
language_.set(range);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the language subtag. The input must be a valid, case-normalized
|
||||
* language subtag.
|
||||
*/
|
||||
void setLanguage(const LanguageSubtag& language) {
|
||||
MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));
|
||||
language_.set(language.range());
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the script subtag. The input must be a valid, case-normalized
|
||||
* script subtag or the empty string.
|
||||
*/
|
||||
template <size_t N>
|
||||
void setScript(const char (&script)[N]) {
|
||||
mozilla::Range<const char> range(script, N - 1);
|
||||
MOZ_ASSERT(IsStructurallyValidScriptTag(range));
|
||||
script_.set(range);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the script subtag. The input must be a valid, case-normalized
|
||||
* script subtag or the empty string.
|
||||
*/
|
||||
void setScript(const ScriptSubtag& script) {
|
||||
MOZ_ASSERT(script.length() == 0 ||
|
||||
IsStructurallyValidScriptTag(script.range()));
|
||||
script_.set(script.range());
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the region subtag. The input must be a valid, case-normalized
|
||||
* region subtag or the empty string.
|
||||
*/
|
||||
template <size_t N>
|
||||
void setRegion(const char (®ion)[N]) {
|
||||
mozilla::Range<const char> range(region, N - 1);
|
||||
MOZ_ASSERT(IsStructurallyValidRegionTag(range));
|
||||
region_.set(range);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the region subtag. The input must be a valid, case-normalized
|
||||
* region subtag or the empty string.
|
||||
*/
|
||||
void setRegion(const RegionSubtag& region) {
|
||||
MOZ_ASSERT(region.length() == 0 ||
|
||||
IsStructurallyValidRegionTag(region.range()));
|
||||
region_.set(region.range());
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all variant subtags.
|
||||
*/
|
||||
void clearVariants() { variants_.clearAndFree(); }
|
||||
|
||||
/**
|
||||
* Set the Unicode extension subtag. The input must be a valid,
|
||||
* case-normalized Unicode extension subtag.
|
||||
*/
|
||||
bool setUnicodeExtension(JS::UniqueChars extension);
|
||||
|
||||
/**
|
||||
* Set the private-use subtag. The input must be a valid, case-normalized
|
||||
* private-use subtag or the empty string.
|
||||
*/
|
||||
void setPrivateuse(JS::UniqueChars privateuse) {
|
||||
MOZ_ASSERT(!privateuse ||
|
||||
IsStructurallyValidPrivateUseTag(
|
||||
{privateuse.get(), strlen(privateuse.get())}));
|
||||
privateuse_ = std::move(privateuse);
|
||||
}
|
||||
|
||||
/**
|
||||
* Canonicalize the base-name subtags, that means the language, script,
|
||||
* region, and variant subtags.
|
||||
*/
|
||||
bool canonicalizeBaseName(JSContext* cx);
|
||||
|
||||
/**
|
||||
* Canonicalize all extension subtags.
|
||||
*/
|
||||
bool canonicalizeExtensions(JSContext* cx,
|
||||
UnicodeExtensionCanonicalForm canonicalForm);
|
||||
|
||||
/**
|
||||
* Canonicalizes the given structurally valid Unicode BCP 47 locale
|
||||
* identifier, including regularized case of subtags. For example, the
|
||||
* language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
|
||||
* where
|
||||
*
|
||||
* Zh ; 2*3ALPHA
|
||||
* -haNS ; ["-" script]
|
||||
* -bu ; ["-" region]
|
||||
* -variant2 ; *("-" variant)
|
||||
* -Variant1
|
||||
* -u-ca-chinese ; *("-" extension)
|
||||
* -t-Zh-laTN
|
||||
* -x-PRIVATE ; ["-" privateuse]
|
||||
*
|
||||
* becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
|
||||
*
|
||||
* UTS 35 specifies two different canonicalization algorithms. There's one to
|
||||
* canonicalize BCP 47 language tags and other one to canonicalize Unicode
|
||||
* locale identifiers. The latter one wasn't present when ECMA-402 was changed
|
||||
* to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags,
|
||||
* so ECMA-402 currently only uses the former to canonicalize Unicode BCP 47
|
||||
* locale identifiers.
|
||||
*
|
||||
* Spec: ECMAScript Internationalization API Specification, 6.2.3.
|
||||
* Spec:
|
||||
* https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
|
||||
* Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion
|
||||
*/
|
||||
bool canonicalize(JSContext* cx,
|
||||
UnicodeExtensionCanonicalForm canonicalForm) {
|
||||
return canonicalizeBaseName(cx) &&
|
||||
canonicalizeExtensions(cx, canonicalForm);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the string representation of this language tag to the given
|
||||
* string buffer.
|
||||
*/
|
||||
bool appendTo(JSContext* cx, StringBuffer& sb) const;
|
||||
|
||||
/**
|
||||
* Add likely-subtags to the language tag.
|
||||
*
|
||||
* Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
|
||||
*/
|
||||
bool addLikelySubtags(JSContext* cx);
|
||||
|
||||
/**
|
||||
* Remove likely-subtags from the language tag.
|
||||
*
|
||||
* Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
|
||||
*/
|
||||
bool removeLikelySubtags(JSContext* cx);
|
||||
};
|
||||
|
||||
/**
|
||||
* Parser for Unicode BCP 47 locale identifiers.
|
||||
*
|
||||
* <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
|
||||
*/
|
||||
class MOZ_STACK_CLASS LanguageTagParser final {
|
||||
public:
|
||||
// Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
|
||||
enum class TokenKind : uint8_t {
|
||||
None = 0b000,
|
||||
Alpha = 0b001,
|
||||
Digit = 0b010,
|
||||
AlphaDigit = 0b011,
|
||||
Error = 0b100
|
||||
};
|
||||
|
||||
private:
|
||||
class Token final {
|
||||
size_t index_;
|
||||
size_t length_;
|
||||
TokenKind kind_;
|
||||
|
||||
public:
|
||||
Token(TokenKind kind, size_t index, size_t length)
|
||||
: index_(index), length_(length), kind_(kind) {}
|
||||
|
||||
TokenKind kind() const { return kind_; }
|
||||
size_t index() const { return index_; }
|
||||
size_t length() const { return length_; }
|
||||
|
||||
bool isError() const { return kind_ == TokenKind::Error; }
|
||||
bool isNone() const { return kind_ == TokenKind::None; }
|
||||
bool isAlpha() const { return kind_ == TokenKind::Alpha; }
|
||||
bool isDigit() const { return kind_ == TokenKind::Digit; }
|
||||
bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; }
|
||||
};
|
||||
|
||||
using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;
|
||||
|
||||
const LocaleChars& locale_;
|
||||
size_t length_;
|
||||
size_t index_ = 0;
|
||||
|
||||
LanguageTagParser(const LocaleChars& locale, size_t length)
|
||||
: locale_(locale), length_(length) {}
|
||||
|
||||
char16_t charAtUnchecked(size_t index) const {
|
||||
if (locale_.is<const JS::Latin1Char*>()) {
|
||||
return locale_.as<const JS::Latin1Char*>()[index];
|
||||
}
|
||||
return locale_.as<const char16_t*>()[index];
|
||||
}
|
||||
|
||||
char charAt(size_t index) const {
|
||||
char16_t c = charAtUnchecked(index);
|
||||
MOZ_ASSERT(mozilla::IsAscii(c));
|
||||
return c;
|
||||
}
|
||||
|
||||
// Copy the token characters into |subtag|.
|
||||
template <size_t N>
|
||||
void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
|
||||
size_t index = tok.index();
|
||||
size_t length = tok.length();
|
||||
if (locale_.is<const JS::Latin1Char*>()) {
|
||||
using T = const JS::Latin1Char;
|
||||
subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length));
|
||||
} else {
|
||||
using T = const char16_t;
|
||||
subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length));
|
||||
}
|
||||
}
|
||||
|
||||
// Create a string copy of |length| characters starting at |index|.
|
||||
JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;
|
||||
|
||||
// Create a string copy of the token characters.
|
||||
JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
|
||||
return chars(cx, tok.index(), tok.length());
|
||||
}
|
||||
|
||||
Token nextToken();
|
||||
|
||||
JS::UniqueChars extension(JSContext* cx, const Token& start,
|
||||
const Token& end) const;
|
||||
|
||||
// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
|
||||
//
|
||||
// Four character language subtags are not allowed in Unicode BCP 47 locale
|
||||
// identifiers. Also see the comparison to Unicode CLDR locale identifiers in
|
||||
// <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
|
||||
bool isLanguage(const Token& tok) const {
|
||||
return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) ||
|
||||
(5 <= tok.length() && tok.length() <= 8));
|
||||
}
|
||||
|
||||
// unicode_script_subtag = alpha{4} ;
|
||||
bool isScript(const Token& tok) const {
|
||||
return tok.isAlpha() && tok.length() == 4;
|
||||
}
|
||||
|
||||
// unicode_region_subtag = (alpha{2} | digit{3}) ;
|
||||
bool isRegion(const Token& tok) const {
|
||||
return (tok.isAlpha() && tok.length() == 2) ||
|
||||
(tok.isDigit() && tok.length() == 3);
|
||||
}
|
||||
|
||||
// unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
|
||||
bool isVariant(const Token& tok) const {
|
||||
return (5 <= tok.length() && tok.length() <= 8) ||
|
||||
(tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index())));
|
||||
}
|
||||
|
||||
// Returns the code unit of the first character at the given singleton token.
|
||||
// Always returns the lower case form of an alphabetical character.
|
||||
char singletonKey(const Token& tok) const {
|
||||
MOZ_ASSERT(tok.length() == 1);
|
||||
char c = charAt(tok.index());
|
||||
return mozilla::IsAsciiUppercaseAlpha(c) ? (c | 0x20) : c;
|
||||
}
|
||||
|
||||
// extensions = unicode_locale_extensions |
|
||||
// transformed_extensions |
|
||||
// other_extensions ;
|
||||
//
|
||||
// unicode_locale_extensions = sep [uU] ((sep keyword)+ |
|
||||
// (sep attribute)+ (sep keyword)*) ;
|
||||
//
|
||||
// transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
|
||||
// (sep tfield)+) ;
|
||||
//
|
||||
// other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
|
||||
bool isExtensionStart(const Token& tok) const {
|
||||
return tok.length() == 1 && singletonKey(tok) != 'x';
|
||||
}
|
||||
|
||||
// other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
|
||||
bool isOtherExtensionPart(const Token& tok) const {
|
||||
return 2 <= tok.length() && tok.length() <= 8;
|
||||
}
|
||||
|
||||
// unicode_locale_extensions = sep [uU] ((sep keyword)+ |
|
||||
// (sep attribute)+ (sep keyword)*) ;
|
||||
// keyword = key (sep type)? ;
|
||||
bool isUnicodeExtensionPart(const Token& tok) const {
|
||||
return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) ||
|
||||
isUnicodeExtensionAttribute(tok);
|
||||
}
|
||||
|
||||
// attribute = alphanum{3,8} ;
|
||||
bool isUnicodeExtensionAttribute(const Token& tok) const {
|
||||
return 3 <= tok.length() && tok.length() <= 8;
|
||||
}
|
||||
|
||||
// key = alphanum alpha ;
|
||||
bool isUnicodeExtensionKey(const Token& tok) const {
|
||||
return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
|
||||
}
|
||||
|
||||
// type = alphanum{3,8} (sep alphanum{3,8})* ;
|
||||
bool isUnicodeExtensionType(const Token& tok) const {
|
||||
return 3 <= tok.length() && tok.length() <= 8;
|
||||
}
|
||||
|
||||
// tkey = alpha digit ;
|
||||
bool isTransformExtensionKey(const Token& tok) const {
|
||||
return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) &&
|
||||
mozilla::IsAsciiDigit(charAt(tok.index() + 1));
|
||||
}
|
||||
|
||||
// tvalue = (sep alphanum{3,8})+ ;
|
||||
bool isTransformExtensionPart(const Token& tok) const {
|
||||
return 3 <= tok.length() && tok.length() <= 8;
|
||||
}
|
||||
|
||||
// pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
|
||||
bool isPrivateUseStart(const Token& tok) const {
|
||||
return tok.length() == 1 && singletonKey(tok) == 'x';
|
||||
}
|
||||
|
||||
// pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
|
||||
bool isPrivateUsePart(const Token& tok) const {
|
||||
return 1 <= tok.length() && tok.length() <= 8;
|
||||
}
|
||||
|
||||
enum class BaseNameParsing : bool { Normal, WithinTransformExtension };
|
||||
|
||||
// Helper function for use in |parseBaseName| and
|
||||
// |parseTlangInTransformExtension|. Do not use this directly!
|
||||
static JS::Result<bool> internalParseBaseName(JSContext* cx,
|
||||
LanguageTagParser& ts,
|
||||
LanguageTag& tag, Token& tok,
|
||||
BaseNameParsing parseType);
|
||||
|
||||
// Parse the `unicode_language_id` production, i.e. the
|
||||
// language/script/region/variants portion of a language tag, into |tag|,
|
||||
// which will be filled with canonical-cased components (lowercase language,
|
||||
// titlecase script, uppercase region, lowercased and alphabetized and
|
||||
// deduplicated variants). |tok| must be the current token.
|
||||
static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
|
||||
LanguageTag& tag, Token& tok) {
|
||||
return internalParseBaseName(cx, ts, tag, tok, BaseNameParsing::Normal);
|
||||
}
|
||||
|
||||
// Parse the `tlang` production within a parsed 't' transform extension.
|
||||
// The precise requirements for "previously parsed" are:
|
||||
//
|
||||
// * the input begins from current token |tok| with a valid `tlang`
|
||||
// * the `tlang` is wholly lowercase (*not* canonical case)
|
||||
// * variant subtags in the `tlang` may contain duplicates and be
|
||||
// unordered
|
||||
//
|
||||
// Return an error on internal failure. Otherwise, return a success value. If
|
||||
// there was no `tlang`, then |tag.language().missing()|. But if there was a
|
||||
// `tlang`, then |tag| is filled with subtags exactly as they appeared in the
|
||||
// parse input: fully lowercase, variants in alphabetical order without
|
||||
// duplicates.
|
||||
static JS::Result<JS::Ok> parseTlangInTransformExtension(
|
||||
JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
|
||||
MOZ_ASSERT(ts.isLanguage(tok));
|
||||
return internalParseBaseName(cx, ts, tag, tok,
|
||||
BaseNameParsing::WithinTransformExtension)
|
||||
.map([](bool parsed) {
|
||||
MOZ_ASSERT(parsed);
|
||||
return JS::Ok();
|
||||
});
|
||||
}
|
||||
|
||||
friend class LanguageTag;
|
||||
|
||||
class Range final {
|
||||
size_t begin_;
|
||||
size_t length_;
|
||||
|
||||
public:
|
||||
Range(size_t begin, size_t length) : begin_(begin), length_(length) {}
|
||||
|
||||
template <typename T>
|
||||
T* begin(T* ptr) const {
|
||||
return ptr + begin_;
|
||||
}
|
||||
|
||||
size_t length() const { return length_; }
|
||||
};
|
||||
|
||||
using TFieldVector = js::Vector<Range, 8>;
|
||||
using AttributesVector = js::Vector<Range, 8>;
|
||||
using KeywordsVector = js::Vector<Range, 8>;
|
||||
|
||||
// Parse |extension|, which must be a validated, fully lowercase
|
||||
// `transformed_extensions` subtag, and fill |tag| and |fields| from the
|
||||
// `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
|
||||
// with |extension|.
|
||||
static JS::Result<bool> parseTransformExtension(
|
||||
JSContext* cx, mozilla::Range<const char> extension, LanguageTag& tag,
|
||||
TFieldVector& fields);
|
||||
|
||||
// Parse |extension|, which must be a validated, fully lowercase
|
||||
// `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
|
||||
// from the `attribute` and `keyword` components.
|
||||
static JS::Result<bool> parseUnicodeExtension(
|
||||
JSContext* cx, mozilla::Range<const char> extension,
|
||||
AttributesVector& attributes, KeywordsVector& keywords);
|
||||
|
||||
public:
|
||||
// Parse the input string as a language tag. Reports an error to the context
|
||||
// if the input can't be parsed completely.
|
||||
static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);
|
||||
|
||||
// Parse the input string as a language tag. Returns Ok(true) if the input
|
||||
// could be completely parsed, Ok(false) if the input couldn't be parsed,
|
||||
// or Err() in case of internal error.
|
||||
static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
|
||||
LanguageTag& tag);
|
||||
|
||||
// Parse the input string as the base-name parts (language, script, region,
|
||||
// variants) of a language tag. Ignores any trailing characters.
|
||||
static bool parseBaseName(JSContext* cx, mozilla::Range<const char> locale,
|
||||
LanguageTag& tag);
|
||||
|
||||
// Return true iff |extension| can be parsed as a Unicode extension subtag.
|
||||
static bool canParseUnicodeExtension(mozilla::Range<const char> extension);
|
||||
|
||||
// Return true iff |unicodeType| can be parsed as a Unicode extension type.
|
||||
static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
|
||||
};
|
||||
|
||||
MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)
|
||||
|
||||
} // namespace intl
|
||||
|
||||
} // namespace js
|
||||
|
||||
#endif /* builtin_intl_LanguageTag_h */
|
|
@ -0,0 +1,615 @@
|
|||
// Generated by make_intl_data.py. DO NOT EDIT.
|
||||
|
||||
#include "mozilla/Assertions.h"
|
||||
#include "mozilla/Range.h"
|
||||
#include "mozilla/TextUtils.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <type_traits>
|
||||
|
||||
#include "builtin/intl/LanguageTag.h"
|
||||
#include "util/Text.h"
|
||||
#include "vm/JSContext.h"
|
||||
|
||||
using ConstCharRange = mozilla::Range<const char>;
|
||||
|
||||
template <size_t Length, size_t TagLength, size_t SubtagLength>
|
||||
static inline bool HasReplacement(
|
||||
const char (&subtags)[Length][TagLength],
|
||||
const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
|
||||
MOZ_ASSERT(subtag.length() == TagLength - 1,
|
||||
"subtag must have the same length as the list of subtags");
|
||||
|
||||
const char* ptr = subtag.range().begin().get();
|
||||
return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
|
||||
[](const char* a, const char* b) {
|
||||
return memcmp(a, b, TagLength - 1) < 0;
|
||||
});
|
||||
}
|
||||
|
||||
template <size_t Length, size_t TagLength, size_t SubtagLength>
|
||||
static inline const char* SearchReplacement(
|
||||
const char (&subtags)[Length][TagLength],
|
||||
const char* (&aliases)[Length],
|
||||
const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
|
||||
MOZ_ASSERT(subtag.length() == TagLength - 1,
|
||||
"subtag must have the same length as the list of subtags");
|
||||
|
||||
const char* ptr = subtag.range().begin().get();
|
||||
auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
|
||||
[](const char* a, const char* b) {
|
||||
return memcmp(a, b, TagLength - 1) < 0;
|
||||
});
|
||||
if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
|
||||
return aliases[std::distance(std::begin(subtags), p)];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Mappings from language subtags to preferred values.
|
||||
// Derived from CLDR Supplemental Data, version 35.1.
|
||||
// https://github.com/unicode-org/cldr.git
|
||||
bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) {
|
||||
MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));
|
||||
|
||||
if (language.length() == 2) {
|
||||
static const char languages[9][3] = {
|
||||
"bh", "in", "iw", "ji", "jw", "mo", "no", "tl", "tw",
|
||||
};
|
||||
static const char* aliases[9] = {
|
||||
"bho", "id", "he", "yi", "jv", "ro", "nb", "fil", "ak",
|
||||
};
|
||||
|
||||
if (const char* replacement = SearchReplacement(languages, aliases, language)) {
|
||||
language.set(ConstCharRange(replacement, strlen(replacement)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (language.length() == 3) {
|
||||
static const char languages[340][4] = {
|
||||
"aam", "aar", "abk", "adp", "afr", "aju", "aka", "alb", "als", "amh",
|
||||
"ara", "arb", "arg", "arm", "asm", "aue", "ava", "ave", "aym", "ayr",
|
||||
"ayx", "aze", "azj", "bak", "bam", "baq", "bcc", "bcl", "bel", "ben",
|
||||
"bgm", "bih", "bis", "bjd", "bod", "bos", "bre", "bul", "bur", "bxk",
|
||||
"bxr", "cat", "ccq", "ces", "cha", "che", "chi", "chu", "chv", "cjr",
|
||||
"cka", "cld", "cmk", "cmn", "cor", "cos", "coy", "cqu", "cre", "cwd",
|
||||
"cym", "cze", "dan", "deu", "dgo", "dhd", "dik", "diq", "div", "drh",
|
||||
"dut", "dzo", "ekk", "ell", "emk", "eng", "epo", "esk", "est", "eus",
|
||||
"ewe", "fao", "fas", "fat", "fij", "fin", "fra", "fre", "fry", "fuc",
|
||||
"ful", "gav", "gaz", "gbo", "geo", "ger", "gfx", "ggn", "gla", "gle",
|
||||
"glg", "glv", "gno", "gre", "grn", "gti", "gug", "guj", "guv", "gya",
|
||||
"hat", "hau", "hdn", "hea", "heb", "her", "him", "hin", "hmo", "hrr",
|
||||
"hrv", "hun", "hye", "ibi", "ibo", "ice", "ido", "iii", "ike", "iku",
|
||||
"ile", "ilw", "ina", "ind", "ipk", "isl", "ita", "jav", "jeg", "jpn",
|
||||
"kal", "kan", "kas", "kat", "kau", "kaz", "kgc", "kgh", "khk", "khm",
|
||||
"kik", "kin", "kir", "kmr", "knc", "kng", "knn", "koj", "kom", "kon",
|
||||
"kor", "kpv", "krm", "ktr", "kua", "kur", "kvs", "kwq", "kxe", "kzj",
|
||||
"kzt", "lao", "lat", "lav", "lbk", "lii", "lim", "lin", "lit", "lmm",
|
||||
"ltz", "lub", "lug", "lvs", "mac", "mah", "mal", "mao", "mar", "may",
|
||||
"meg", "mhr", "mkd", "mlg", "mlt", "mnk", "mol", "mon", "mri", "msa",
|
||||
"mst", "mup", "mwj", "mya", "myt", "nad", "nau", "nav", "nbl", "ncp",
|
||||
"nde", "ndo", "nep", "nld", "nno", "nnx", "nob", "nor", "npi", "nts",
|
||||
"nya", "oci", "ojg", "oji", "ori", "orm", "ory", "oss", "oun", "pan",
|
||||
"pbu", "pcr", "per", "pes", "pli", "plt", "pmc", "pmu", "pnb", "pol",
|
||||
"por", "ppa", "ppr", "pry", "pus", "puz", "que", "quz", "rmy", "roh",
|
||||
"ron", "rum", "run", "rus", "sag", "san", "sca", "scc", "scr", "sin",
|
||||
"skk", "slk", "slo", "slv", "sme", "smo", "sna", "snd", "som", "sot",
|
||||
"spa", "spy", "sqi", "src", "srd", "srp", "ssw", "sun", "swa", "swe",
|
||||
"swh", "tah", "tam", "tat", "tdu", "tel", "tgk", "tgl", "tha", "thc",
|
||||
"thx", "tib", "tie", "tir", "tkk", "tlw", "tmp", "tne", "ton", "tsf",
|
||||
"tsn", "tso", "ttq", "tuk", "tur", "twi", "uig", "ukr", "umu", "uok",
|
||||
"urd", "uzb", "uzn", "ven", "vie", "vol", "wel", "wln", "wol", "xba",
|
||||
"xho", "xia", "xkh", "xpe", "xsj", "xsl", "ybd", "ydd", "yid", "yma",
|
||||
"ymt", "yor", "yos", "yuu", "zai", "zha", "zho", "zsm", "zul", "zyb",
|
||||
};
|
||||
static const char* aliases[340] = {
|
||||
"aas", "aa", "ab", "dz", "af", "jrb", "ak", "sq", "sq", "am",
|
||||
"ar", "ar", "an", "hy", "as", "ktz", "av", "ae", "ay", "ay",
|
||||
"nun", "az", "az", "ba", "bm", "eu", "bal", "bik", "be", "bn",
|
||||
"bcg", "bho", "bi", "drl", "bo", "bs", "br", "bg", "my", "luy",
|
||||
"bua", "ca", "rki", "cs", "ch", "ce", "zh", "cu", "cv", "mom",
|
||||
"cmr", "syr", "xch", "zh", "kw", "co", "pij", "quh", "cr", "cr",
|
||||
"cy", "cs", "da", "de", "doi", "mwr", "din", "zza", "dv", "mn",
|
||||
"nl", "dz", "et", "el", "man", "en", "eo", "ik", "et", "eu",
|
||||
"ee", "fo", "fa", "ak", "fj", "fi", "fr", "fr", "fy", "ff",
|
||||
"ff", "dev", "om", "grb", "ka", "de", "vaj", "gvr", "gd", "ga",
|
||||
"gl", "gv", "gon", "el", "gn", "nyc", "gn", "gu", "duz", "gba",
|
||||
"ht", "ha", "hai", "hmn", "he", "hz", "srx", "hi", "ho", "jal",
|
||||
"hr", "hu", "hy", "opa", "ig", "is", "io", "ii", "iu", "iu",
|
||||
"ie", "gal", "ia", "id", "ik", "is", "it", "jv", "oyb", "ja",
|
||||
"kl", "kn", "ks", "ka", "kr", "kk", "tdf", "kml", "mn", "km",
|
||||
"ki", "rw", "ky", "ku", "kr", "kg", "kok", "kwv", "kv", "kg",
|
||||
"ko", "kv", "bmf", "dtp", "kj", "ku", "gdj", "yam", "tvd", "dtp",
|
||||
"dtp", "lo", "la", "lv", "bnc", "raq", "li", "ln", "lt", "rmx",
|
||||
"lb", "lu", "lg", "lv", "mk", "mh", "ml", "mi", "mr", "ms",
|
||||
"cir", "chm", "mk", "mg", "mt", "man", "ro", "mn", "mi", "ms",
|
||||
"mry", "raj", "vaj", "my", "mry", "xny", "na", "nv", "nr", "kdz",
|
||||
"nd", "ng", "ne", "nl", "nn", "ngv", "nb", "nb", "ne", "pij",
|
||||
"ny", "oc", "oj", "oj", "or", "om", "or", "os", "vaj", "pa",
|
||||
"ps", "adx", "fa", "fa", "pi", "mg", "huw", "phr", "lah", "pl",
|
||||
"pt", "bfy", "lcq", "prt", "ps", "pub", "qu", "qu", "rom", "rm",
|
||||
"ro", "ro", "rn", "ru", "sg", "sa", "hle", "sr", "hr", "si",
|
||||
"oyb", "sk", "sk", "sl", "se", "sm", "sn", "sd", "so", "st",
|
||||
"es", "kln", "sq", "sc", "sc", "sr", "ss", "su", "sw", "sv",
|
||||
"sw", "ty", "ta", "tt", "dtp", "te", "tg", "fil", "th", "tpo",
|
||||
"oyb", "bo", "ras", "ti", "twm", "weo", "tyj", "kak", "to", "taj",
|
||||
"tn", "ts", "tmh", "tk", "tr", "ak", "ug", "uk", "del", "ema",
|
||||
"ur", "uz", "uz", "ve", "vi", "vo", "cy", "wa", "wo", "cax",
|
||||
"xh", "acn", "waw", "kpe", "suj", "den", "rki", "yi", "yi", "lrr",
|
||||
"mtm", "yo", "zom", "yug", "zap", "za", "zh", "ms", "zu", "za",
|
||||
};
|
||||
|
||||
if (const char* replacement = SearchReplacement(languages, aliases, language)) {
|
||||
language.set(ConstCharRange(replacement, strlen(replacement)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Language subtags with complex mappings.
|
||||
// Derived from CLDR Supplemental Data, version 35.1.
|
||||
// https://github.com/unicode-org/cldr.git
|
||||
bool js::intl::LanguageTag::complexLanguageMapping(const LanguageSubtag& language) {
|
||||
MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));
|
||||
|
||||
if (language.length() == 2) {
|
||||
return language.equalTo("sh");
|
||||
}
|
||||
|
||||
if (language.length() == 3) {
|
||||
static const char languages[6][4] = {
|
||||
"cnr", "drw", "hbs", "prs", "swc", "tnf",
|
||||
};
|
||||
|
||||
return HasReplacement(languages, language);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Mappings from region subtags to preferred values.
|
||||
// Derived from CLDR Supplemental Data, version 35.1.
|
||||
// https://github.com/unicode-org/cldr.git
|
||||
bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) {
|
||||
MOZ_ASSERT(IsStructurallyValidRegionTag(region.range()));
|
||||
|
||||
if (region.length() == 2) {
|
||||
static const char regions[23][3] = {
|
||||
"BU", "CS", "CT", "DD", "DY", "FQ", "FX", "HV", "JT", "MI",
|
||||
"NH", "NQ", "PU", "PZ", "QU", "RH", "TP", "UK", "VD", "WK",
|
||||
"YD", "YU", "ZR",
|
||||
};
|
||||
static const char* aliases[23] = {
|
||||
"MM", "RS", "KI", "DE", "BJ", "AQ", "FR", "BF", "UM", "UM",
|
||||
"VU", "AQ", "UM", "PA", "EU", "ZW", "TL", "GB", "VN", "UM",
|
||||
"YE", "RS", "CD",
|
||||
};
|
||||
|
||||
if (const char* replacement = SearchReplacement(regions, aliases, region)) {
|
||||
region.set(ConstCharRange(replacement, strlen(replacement)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
static const char regions[300][4] = {
|
||||
"004", "008", "010", "012", "016", "020", "024", "028", "031", "032",
|
||||
"036", "040", "044", "048", "050", "051", "052", "056", "060", "062",
|
||||
"064", "068", "070", "072", "074", "076", "084", "086", "090", "092",
|
||||
"096", "100", "104", "108", "112", "116", "120", "124", "132", "136",
|
||||
"140", "144", "148", "152", "156", "158", "162", "166", "170", "174",
|
||||
"175", "178", "180", "184", "188", "191", "192", "196", "203", "204",
|
||||
"208", "212", "214", "218", "222", "226", "230", "231", "232", "233",
|
||||
"234", "238", "239", "242", "246", "248", "249", "250", "254", "258",
|
||||
"260", "262", "266", "268", "270", "275", "276", "278", "280", "288",
|
||||
"292", "296", "300", "304", "308", "312", "316", "320", "324", "328",
|
||||
"332", "334", "336", "340", "344", "348", "352", "356", "360", "364",
|
||||
"368", "372", "376", "380", "384", "388", "392", "398", "400", "404",
|
||||
"408", "410", "414", "417", "418", "422", "426", "428", "430", "434",
|
||||
"438", "440", "442", "446", "450", "454", "458", "462", "466", "470",
|
||||
"474", "478", "480", "484", "492", "496", "498", "499", "500", "504",
|
||||
"508", "512", "516", "520", "524", "528", "531", "533", "534", "535",
|
||||
"540", "548", "554", "558", "562", "566", "570", "574", "578", "580",
|
||||
"581", "583", "584", "585", "586", "591", "598", "600", "604", "608",
|
||||
"612", "616", "620", "624", "626", "630", "634", "638", "642", "643",
|
||||
"646", "652", "654", "659", "660", "662", "663", "666", "670", "674",
|
||||
"678", "682", "686", "688", "690", "694", "702", "703", "704", "705",
|
||||
"706", "710", "716", "720", "724", "728", "729", "732", "736", "740",
|
||||
"744", "748", "752", "756", "760", "762", "764", "768", "772", "776",
|
||||
"780", "784", "788", "792", "795", "796", "798", "800", "804", "807",
|
||||
"818", "826", "830", "831", "832", "833", "834", "840", "850", "854",
|
||||
"858", "860", "862", "876", "882", "886", "887", "891", "894", "958",
|
||||
"959", "960", "962", "963", "964", "965", "966", "967", "968", "969",
|
||||
"970", "971", "972", "973", "974", "975", "976", "977", "978", "979",
|
||||
"980", "981", "982", "983", "984", "985", "986", "987", "988", "989",
|
||||
"990", "991", "992", "993", "994", "995", "996", "997", "998", "999",
|
||||
};
|
||||
static const char* aliases[300] = {
|
||||
"AF", "AL", "AQ", "DZ", "AS", "AD", "AO", "AG", "AZ", "AR",
|
||||
"AU", "AT", "BS", "BH", "BD", "AM", "BB", "BE", "BM", "034",
|
||||
"BT", "BO", "BA", "BW", "BV", "BR", "BZ", "IO", "SB", "VG",
|
||||
"BN", "BG", "MM", "BI", "BY", "KH", "CM", "CA", "CV", "KY",
|
||||
"CF", "LK", "TD", "CL", "CN", "TW", "CX", "CC", "CO", "KM",
|
||||
"YT", "CG", "CD", "CK", "CR", "HR", "CU", "CY", "CZ", "BJ",
|
||||
"DK", "DM", "DO", "EC", "SV", "GQ", "ET", "ET", "ER", "EE",
|
||||
"FO", "FK", "GS", "FJ", "FI", "AX", "FR", "FR", "GF", "PF",
|
||||
"TF", "DJ", "GA", "GE", "GM", "PS", "DE", "DE", "DE", "GH",
|
||||
"GI", "KI", "GR", "GL", "GD", "GP", "GU", "GT", "GN", "GY",
|
||||
"HT", "HM", "VA", "HN", "HK", "HU", "IS", "IN", "ID", "IR",
|
||||
"IQ", "IE", "IL", "IT", "CI", "JM", "JP", "KZ", "JO", "KE",
|
||||
"KP", "KR", "KW", "KG", "LA", "LB", "LS", "LV", "LR", "LY",
|
||||
"LI", "LT", "LU", "MO", "MG", "MW", "MY", "MV", "ML", "MT",
|
||||
"MQ", "MR", "MU", "MX", "MC", "MN", "MD", "ME", "MS", "MA",
|
||||
"MZ", "OM", "NA", "NR", "NP", "NL", "CW", "AW", "SX", "BQ",
|
||||
"NC", "VU", "NZ", "NI", "NE", "NG", "NU", "NF", "NO", "MP",
|
||||
"UM", "FM", "MH", "PW", "PK", "PA", "PG", "PY", "PE", "PH",
|
||||
"PN", "PL", "PT", "GW", "TL", "PR", "QA", "RE", "RO", "RU",
|
||||
"RW", "BL", "SH", "KN", "AI", "LC", "MF", "PM", "VC", "SM",
|
||||
"ST", "SA", "SN", "RS", "SC", "SL", "SG", "SK", "VN", "SI",
|
||||
"SO", "ZA", "ZW", "YE", "ES", "SS", "SD", "EH", "SD", "SR",
|
||||
"SJ", "SZ", "SE", "CH", "SY", "TJ", "TH", "TG", "TK", "TO",
|
||||
"TT", "AE", "TN", "TR", "TM", "TC", "TV", "UG", "UA", "MK",
|
||||
"EG", "GB", "JE", "GG", "JE", "IM", "TZ", "US", "VI", "BF",
|
||||
"UY", "UZ", "VE", "WF", "WS", "YE", "YE", "RS", "ZM", "AA",
|
||||
"QM", "QN", "QP", "QQ", "QR", "QS", "QT", "EU", "QV", "QW",
|
||||
"QX", "QY", "QZ", "XA", "XB", "XC", "XD", "XE", "XF", "XG",
|
||||
"XH", "XI", "XJ", "XK", "XL", "XM", "XN", "XO", "XP", "XQ",
|
||||
"XR", "XS", "XT", "XU", "XV", "XW", "XX", "XY", "XZ", "ZZ",
|
||||
};
|
||||
|
||||
if (const char* replacement = SearchReplacement(regions, aliases, region)) {
|
||||
region.set(ConstCharRange(replacement, strlen(replacement)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Region subtags with complex mappings.
|
||||
// Derived from CLDR Supplemental Data, version 35.1.
|
||||
// https://github.com/unicode-org/cldr.git
|
||||
bool js::intl::LanguageTag::complexRegionMapping(const RegionSubtag& region) {
|
||||
MOZ_ASSERT(IsStructurallyValidRegionTag(region.range()));
|
||||
|
||||
if (region.length() == 2) {
|
||||
return region.equalTo("AN") ||
|
||||
region.equalTo("NT") ||
|
||||
region.equalTo("PC") ||
|
||||
region.equalTo("SU");
|
||||
}
|
||||
|
||||
{
|
||||
static const char regions[8][4] = {
|
||||
"172", "200", "530", "532", "536", "582", "810", "890",
|
||||
};
|
||||
|
||||
return HasReplacement(regions, region);
|
||||
}
|
||||
}
|
||||
|
||||
// Language subtags with complex mappings.
|
||||
// Derived from CLDR Supplemental Data, version 35.1.
|
||||
// https://github.com/unicode-org/cldr.git
|
||||
void js::intl::LanguageTag::performComplexLanguageMappings() {
|
||||
MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
|
||||
|
||||
if (language().equalTo("cnr")) {
|
||||
setLanguage("sr");
|
||||
if (region().length() == 0) {
|
||||
setRegion("ME");
|
||||
}
|
||||
}
|
||||
else if (language().equalTo("drw") ||
|
||||
language().equalTo("prs") ||
|
||||
language().equalTo("tnf")) {
|
||||
setLanguage("fa");
|
||||
if (region().length() == 0) {
|
||||
setRegion("AF");
|
||||
}
|
||||
}
|
||||
else if (language().equalTo("hbs") ||
|
||||
language().equalTo("sh")) {
|
||||
setLanguage("sr");
|
||||
if (script().length() == 0) {
|
||||
setScript("Latn");
|
||||
}
|
||||
}
|
||||
else if (language().equalTo("swc")) {
|
||||
setLanguage("sw");
|
||||
if (region().length() == 0) {
|
||||
setRegion("CD");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Region subtags with complex mappings.
|
||||
// Derived from CLDR Supplemental Data, version 35.1.
|
||||
// https://github.com/unicode-org/cldr.git
|
||||
void js::intl::LanguageTag::performComplexRegionMappings() {
|
||||
MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
|
||||
MOZ_ASSERT(IsStructurallyValidRegionTag(region().range()));
|
||||
|
||||
if (region().equalTo("172")) {
|
||||
if (language().equalTo("hy") ||
|
||||
(language().equalTo("und") && script().equalTo("Armn"))) {
|
||||
setRegion("AM");
|
||||
}
|
||||
else if (language().equalTo("az") ||
|
||||
language().equalTo("tkr") ||
|
||||
language().equalTo("tly") ||
|
||||
language().equalTo("ttt")) {
|
||||
setRegion("AZ");
|
||||
}
|
||||
else if (language().equalTo("be")) {
|
||||
setRegion("BY");
|
||||
}
|
||||
else if (language().equalTo("ab") ||
|
||||
language().equalTo("ka") ||
|
||||
language().equalTo("os") ||
|
||||
(language().equalTo("und") && script().equalTo("Geor")) ||
|
||||
language().equalTo("xmf")) {
|
||||
setRegion("GE");
|
||||
}
|
||||
else if (language().equalTo("ky")) {
|
||||
setRegion("KG");
|
||||
}
|
||||
else if (language().equalTo("kk") ||
|
||||
(language().equalTo("ug") && script().equalTo("Cyrl"))) {
|
||||
setRegion("KZ");
|
||||
}
|
||||
else if (language().equalTo("gag")) {
|
||||
setRegion("MD");
|
||||
}
|
||||
else if (language().equalTo("tg")) {
|
||||
setRegion("TJ");
|
||||
}
|
||||
else if (language().equalTo("tk")) {
|
||||
setRegion("TM");
|
||||
}
|
||||
else if (language().equalTo("crh") ||
|
||||
language().equalTo("got") ||
|
||||
language().equalTo("ji") ||
|
||||
language().equalTo("rue") ||
|
||||
language().equalTo("uk") ||
|
||||
(language().equalTo("und") && script().equalTo("Goth"))) {
|
||||
setRegion("UA");
|
||||
}
|
||||
else if (language().equalTo("kaa") ||
|
||||
language().equalTo("sog") ||
|
||||
(language().equalTo("und") && script().equalTo("Sogd")) ||
|
||||
(language().equalTo("und") && script().equalTo("Sogo")) ||
|
||||
language().equalTo("uz")) {
|
||||
setRegion("UZ");
|
||||
}
|
||||
else {
|
||||
setRegion("RU");
|
||||
}
|
||||
}
|
||||
else if (region().equalTo("200")) {
|
||||
if (language().equalTo("sk")) {
|
||||
setRegion("SK");
|
||||
}
|
||||
else {
|
||||
setRegion("CZ");
|
||||
}
|
||||
}
|
||||
else if (region().equalTo("530") ||
|
||||
region().equalTo("532") ||
|
||||
region().equalTo("AN")) {
|
||||
if (language().equalTo("vic")) {
|
||||
setRegion("SX");
|
||||
}
|
||||
else {
|
||||
setRegion("CW");
|
||||
}
|
||||
}
|
||||
else if (region().equalTo("536") ||
|
||||
region().equalTo("NT")) {
|
||||
if (language().equalTo("akk") ||
|
||||
language().equalTo("ckb") ||
|
||||
(language().equalTo("ku") && script().equalTo("Arab")) ||
|
||||
language().equalTo("mis") ||
|
||||
language().equalTo("syr") ||
|
||||
(language().equalTo("und") && script().equalTo("Xsux")) ||
|
||||
(language().equalTo("und") && script().equalTo("Hatr")) ||
|
||||
(language().equalTo("und") && script().equalTo("Syrc"))) {
|
||||
setRegion("IQ");
|
||||
}
|
||||
else {
|
||||
setRegion("SA");
|
||||
}
|
||||
}
|
||||
else if (region().equalTo("582") ||
|
||||
region().equalTo("PC")) {
|
||||
if (language().equalTo("mh")) {
|
||||
setRegion("MH");
|
||||
}
|
||||
else if (language().equalTo("pau")) {
|
||||
setRegion("PW");
|
||||
}
|
||||
else {
|
||||
setRegion("FM");
|
||||
}
|
||||
}
|
||||
else if (region().equalTo("810") ||
|
||||
region().equalTo("SU")) {
|
||||
if (language().equalTo("hy") ||
|
||||
(language().equalTo("und") && script().equalTo("Armn"))) {
|
||||
setRegion("AM");
|
||||
}
|
||||
else if (language().equalTo("az") ||
|
||||
language().equalTo("tkr") ||
|
||||
language().equalTo("tly") ||
|
||||
language().equalTo("ttt")) {
|
||||
setRegion("AZ");
|
||||
}
|
||||
else if (language().equalTo("be")) {
|
||||
setRegion("BY");
|
||||
}
|
||||
else if (language().equalTo("et") ||
|
||||
language().equalTo("vro")) {
|
||||
setRegion("EE");
|
||||
}
|
||||
else if (language().equalTo("ab") ||
|
||||
language().equalTo("ka") ||
|
||||
language().equalTo("os") ||
|
||||
(language().equalTo("und") && script().equalTo("Geor")) ||
|
||||
language().equalTo("xmf")) {
|
||||
setRegion("GE");
|
||||
}
|
||||
else if (language().equalTo("ky")) {
|
||||
setRegion("KG");
|
||||
}
|
||||
else if (language().equalTo("kk") ||
|
||||
(language().equalTo("ug") && script().equalTo("Cyrl"))) {
|
||||
setRegion("KZ");
|
||||
}
|
||||
else if (language().equalTo("lt") ||
|
||||
language().equalTo("sgs")) {
|
||||
setRegion("LT");
|
||||
}
|
||||
else if (language().equalTo("ltg") ||
|
||||
language().equalTo("lv")) {
|
||||
setRegion("LV");
|
||||
}
|
||||
else if (language().equalTo("gag")) {
|
||||
setRegion("MD");
|
||||
}
|
||||
else if (language().equalTo("tg")) {
|
||||
setRegion("TJ");
|
||||
}
|
||||
else if (language().equalTo("tk")) {
|
||||
setRegion("TM");
|
||||
}
|
||||
else if (language().equalTo("crh") ||
|
||||
language().equalTo("got") ||
|
||||
language().equalTo("ji") ||
|
||||
language().equalTo("rue") ||
|
||||
language().equalTo("uk") ||
|
||||
(language().equalTo("und") && script().equalTo("Goth"))) {
|
||||
setRegion("UA");
|
||||
}
|
||||
else if (language().equalTo("kaa") ||
|
||||
language().equalTo("sog") ||
|
||||
(language().equalTo("und") && script().equalTo("Sogd")) ||
|
||||
(language().equalTo("und") && script().equalTo("Sogo")) ||
|
||||
language().equalTo("uz")) {
|
||||
setRegion("UZ");
|
||||
}
|
||||
else {
|
||||
setRegion("RU");
|
||||
}
|
||||
}
|
||||
else if (region().equalTo("890")) {
|
||||
if (language().equalTo("bs")) {
|
||||
setRegion("BA");
|
||||
}
|
||||
else if (language().equalTo("hr")) {
|
||||
setRegion("HR");
|
||||
}
|
||||
else if (language().equalTo("mk")) {
|
||||
setRegion("MK");
|
||||
}
|
||||
else if (language().equalTo("sl")) {
|
||||
setRegion("SI");
|
||||
}
|
||||
else {
|
||||
setRegion("RS");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Canonicalize grandfathered locale identifiers.
|
||||
// Derived from CLDR Supplemental Data, version 35.1.
|
||||
// https://github.com/unicode-org/cldr.git
|
||||
bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
|
||||
// We're mapping regular grandfathered tags to non-grandfathered form here.
|
||||
// Other tags remain unchanged.
|
||||
//
|
||||
// regular = "art-lojban"
|
||||
// / "cel-gaulish"
|
||||
// / "no-bok"
|
||||
// / "no-nyn"
|
||||
// / "zh-guoyu"
|
||||
// / "zh-hakka"
|
||||
// / "zh-min"
|
||||
// / "zh-min-nan"
|
||||
// / "zh-xiang"
|
||||
//
|
||||
// Therefore we can quickly exclude most tags by checking every
|
||||
// |unicode_locale_id| subcomponent for characteristics not shared by any of
|
||||
// the regular grandfathered (RG) tags:
|
||||
//
|
||||
// * Real-world |unicode_language_subtag|s are all two or three letters,
|
||||
// so don't waste time running a useless |language.length > 3| fast-path.
|
||||
// * No RG tag has a "script"-looking component.
|
||||
// * No RG tag has a "region"-looking component.
|
||||
// * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
|
||||
// zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
|
||||
// no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
|
||||
// that |unicode_locale_id| doesn't support.)
|
||||
// * No RG tag contains |extensions| or |pu_extensions|.
|
||||
if (script().length() != 0 ||
|
||||
region().length() != 0 ||
|
||||
variants().length() != 1 ||
|
||||
extensions().length() != 0 ||
|
||||
privateuse()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto variantEqualTo = [this](const char* variant) {
|
||||
return strcmp(variants()[0].get(), variant) == 0;
|
||||
};
|
||||
|
||||
// art-lojban -> jbo
|
||||
if (language().equalTo("art") && variantEqualTo("lojban")) {
|
||||
setLanguage("jbo");
|
||||
clearVariants();
|
||||
return true;
|
||||
}
|
||||
|
||||
// cel-gaulish -> xtg-x-cel-gaulish
|
||||
else if (language().equalTo("cel") && variantEqualTo("gaulish")) {
|
||||
setLanguage("xtg");
|
||||
clearVariants();
|
||||
|
||||
auto privateuse = DuplicateString(cx, "x-cel-gaulish");
|
||||
if (!privateuse) {
|
||||
return false;
|
||||
}
|
||||
setPrivateuse(std::move(privateuse));
|
||||
return true;
|
||||
}
|
||||
|
||||
// zh-guoyu -> zh
|
||||
else if (language().equalTo("zh") && variantEqualTo("guoyu")) {
|
||||
setLanguage("zh");
|
||||
clearVariants();
|
||||
return true;
|
||||
}
|
||||
|
||||
// zh-hakka -> hak
|
||||
else if (language().equalTo("zh") && variantEqualTo("hakka")) {
|
||||
setLanguage("hak");
|
||||
clearVariants();
|
||||
return true;
|
||||
}
|
||||
|
||||
// zh-xiang -> hsn
|
||||
else if (language().equalTo("zh") && variantEqualTo("xiang")) {
|
||||
setLanguage("hsn");
|
||||
clearVariants();
|
||||
return true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
// Generated by make_intl_data.py. DO NOT EDIT.
|
||||
// Version: CLDR-35.1
|
||||
// URL: https://unicode.org/Public/cldr/35.1/core.zip
|
||||
|
||||
#include "mozilla/Assertions.h"
|
||||
#include "mozilla/Range.h"
|
||||
#include "mozilla/TextUtils.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include "builtin/intl/LanguageTag.h"
|
||||
|
||||
using namespace js::intl::LanguageTagLimits;
|
||||
|
||||
using ConstCharRange = mozilla::Range<const char>;
|
||||
|
||||
template <size_t Length>
|
||||
static inline bool IsUnicodeKey(const ConstCharRange& key,
|
||||
const char (&str)[Length]) {
|
||||
static_assert(Length == UnicodeKeyLength + 1,
|
||||
"Unicode extension key is two characters long");
|
||||
return memcmp(key.begin().get(), str, Length - 1) == 0;
|
||||
}
|
||||
|
||||
template <size_t Length>
|
||||
static inline bool IsUnicodeType(const ConstCharRange& type,
|
||||
const char (&str)[Length]) {
|
||||
static_assert(Length > UnicodeKeyLength + 1,
|
||||
"Unicode extension type contains more than two characters");
|
||||
return type.length() == (Length - 1) &&
|
||||
memcmp(type.begin().get(), str, Length - 1) == 0;
|
||||
}
|
||||
|
||||
static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) {
|
||||
#ifdef DEBUG
|
||||
auto isNull = [](char c) {
|
||||
return c == '\0';
|
||||
};
|
||||
#endif
|
||||
|
||||
MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull),
|
||||
"unexpected null-character in string");
|
||||
|
||||
using UnsignedChar = unsigned char;
|
||||
for (size_t i = 0; i < b.length(); i++) {
|
||||
// |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
|
||||
// we've reached the end of |a|, the below if-statement will always be true.
|
||||
// That ensures we don't read past the end of |a|.
|
||||
if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
// Return zero if both strings are equal or a negative number if |b| is a
|
||||
// prefix of |a|.
|
||||
return -int32_t(UnsignedChar(a[b.length()]));
|
||||
};
|
||||
|
||||
template <size_t Length>
|
||||
static inline const char* SearchReplacement(const char* (&types)[Length],
|
||||
const char* (&aliases)[Length],
|
||||
const ConstCharRange& type) {
|
||||
|
||||
auto p = std::lower_bound(std::begin(types), std::end(types), type,
|
||||
[](const auto& a, const auto& b) {
|
||||
return CompareUnicodeType(a, b) < 0;
|
||||
});
|
||||
if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) {
|
||||
return aliases[std::distance(std::begin(types), p)];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mapping from deprecated BCP 47 Unicode extension types to their preferred
|
||||
* values.
|
||||
*
|
||||
* Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
|
||||
*/
|
||||
const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
|
||||
const ConstCharRange& key, const ConstCharRange& type) {
|
||||
#ifdef DEBUG
|
||||
static auto isAsciiLowercaseAlphanumeric = [](char c) {
|
||||
return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
|
||||
};
|
||||
|
||||
static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
|
||||
return isAsciiLowercaseAlphanumeric(c) || c == '-';
|
||||
};
|
||||
#endif
|
||||
|
||||
MOZ_ASSERT(key.length() == UnicodeKeyLength);
|
||||
MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(),
|
||||
isAsciiLowercaseAlphanumeric));
|
||||
|
||||
MOZ_ASSERT(type.length() > UnicodeKeyLength);
|
||||
MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(),
|
||||
isAsciiLowercaseAlphanumericOrDash));
|
||||
|
||||
if (IsUnicodeKey(key, "ca")) {
|
||||
if (IsUnicodeType(type, "ethiopic-amete-alem")) {
|
||||
return "ethioaa";
|
||||
}
|
||||
if (IsUnicodeType(type, "islamicc")) {
|
||||
return "islamic-civil";
|
||||
}
|
||||
}
|
||||
else if (IsUnicodeKey(key, "kb") ||
|
||||
IsUnicodeKey(key, "kc") ||
|
||||
IsUnicodeKey(key, "kh") ||
|
||||
IsUnicodeKey(key, "kk") ||
|
||||
IsUnicodeKey(key, "kn")) {
|
||||
if (IsUnicodeType(type, "yes")) {
|
||||
return "true";
|
||||
}
|
||||
}
|
||||
else if (IsUnicodeKey(key, "ks")) {
|
||||
if (IsUnicodeType(type, "primary")) {
|
||||
return "level1";
|
||||
}
|
||||
if (IsUnicodeType(type, "tertiary")) {
|
||||
return "level3";
|
||||
}
|
||||
}
|
||||
else if (IsUnicodeKey(key, "ms")) {
|
||||
if (IsUnicodeType(type, "imperial")) {
|
||||
return "uksystem";
|
||||
}
|
||||
}
|
||||
else if (IsUnicodeKey(key, "rg") ||
|
||||
IsUnicodeKey(key, "sd")) {
|
||||
static const char* types[116] = {
|
||||
"cn11", "cn12", "cn13", "cn14", "cn15", "cn21", "cn22", "cn23",
|
||||
"cn31", "cn32", "cn33", "cn34", "cn35", "cn36", "cn37", "cn41",
|
||||
"cn42", "cn43", "cn44", "cn45", "cn46", "cn50", "cn51", "cn52",
|
||||
"cn53", "cn54", "cn61", "cn62", "cn63", "cn64", "cn65", "cz10a",
|
||||
"cz10b", "cz10c", "cz10d", "cz10e", "cz10f", "cz611", "cz612", "cz613",
|
||||
"cz614", "cz615", "cz621", "cz622", "cz623", "cz624", "cz626", "cz627",
|
||||
"czjc", "czjm", "czka", "czkr", "czli", "czmo", "czol", "czpa",
|
||||
"czpl", "czpr", "czst", "czus", "czvy", "czzl", "fra", "frb",
|
||||
"frc", "frd", "fre", "frf", "frg", "frh", "fri", "frj",
|
||||
"frk", "frl", "frm", "frn", "fro", "frp", "frq", "frr",
|
||||
"frs", "frt", "fru", "frv", "laxn", "lud", "lug", "lul",
|
||||
"mrnkc", "nzn", "nzs", "omba", "omsh", "plds", "plkp", "pllb",
|
||||
"plld", "pllu", "plma", "plmz", "plop", "plpd", "plpk", "plpm",
|
||||
"plsk", "plsl", "plwn", "plwp", "plzp", "tteto", "ttrcm", "ttwto",
|
||||
"twkhq", "twtnq", "twtpq", "twtxq",
|
||||
};
|
||||
static const char* aliases[116] = {
|
||||
"cnbj", "cntj", "cnhe", "cnsx", "cnmn", "cnln", "cnjl", "cnhl",
|
||||
"cnsh", "cnjs", "cnzj", "cnah", "cnfj", "cnjx", "cnsd", "cnha",
|
||||
"cnhb", "cnhn", "cngd", "cngx", "cnhi", "cncq", "cnsc", "cngz",
|
||||
"cnyn", "cnxz", "cnsn", "cngs", "cnqh", "cnnx", "cnxj", "cz110",
|
||||
"cz111", "cz112", "cz113", "cz114", "cz115", "cz663", "cz632", "cz633",
|
||||
"cz634", "cz635", "cz641", "cz642", "cz643", "cz644", "cz646", "cz647",
|
||||
"cz31", "cz64", "cz41", "cz52", "cz51", "cz80", "cz71", "cz53",
|
||||
"cz32", "cz10", "cz20", "cz42", "cz63", "cz72", "frges", "frnaq",
|
||||
"frara", "frbfc", "frbre", "frcvl", "frges", "frcor", "frbfc", "fridf",
|
||||
"frocc", "frnaq", "frges", "frocc", "frhdf", "frnor", "frnor", "frpdl",
|
||||
"frhdf", "frnaq", "frpac", "frara", "laxs", "lucl", "luec", "luca",
|
||||
"mr13", "nzauk", "nzcan", "ombj", "omsj", "pl02", "pl04", "pl08",
|
||||
"pl10", "pl06", "pl12", "pl14", "pl16", "pl20", "pl18", "pl22",
|
||||
"pl26", "pl24", "pl28", "pl30", "pl32", "tttob", "ttmrc", "tttob",
|
||||
"twkhh", "twtnn", "twnwt", "twtxg",
|
||||
};
|
||||
return SearchReplacement(types, aliases, type);
|
||||
}
|
||||
else if (IsUnicodeKey(key, "tz")) {
|
||||
static const char* types[28] = {
|
||||
"aqams", "cnckg", "cnhrb", "cnkhg", "cuba", "egypt",
|
||||
"eire", "est", "gmt0", "hongkong", "hst", "iceland",
|
||||
"iran", "israel", "jamaica", "japan", "libya", "mst",
|
||||
"navajo", "poland", "portugal", "prc", "roc", "rok",
|
||||
"turkey", "uct", "usnavajo", "zulu",
|
||||
};
|
||||
static const char* aliases[28] = {
|
||||
"nzakl", "cnsha", "cnsha", "cnurc", "cuhav", "egcai",
|
||||
"iedub", "utcw05", "gmt", "hkhkg", "utcw10", "isrey",
|
||||
"irthr", "jeruslm", "jmkin", "jptyo", "lytip", "utcw07",
|
||||
"usden", "plwaw", "ptlis", "cnsha", "twtpe", "krsel",
|
||||
"trist", "utc", "usden", "utc",
|
||||
};
|
||||
return SearchReplacement(types, aliases, type);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
|
@ -50,15 +50,24 @@ from operator import attrgetter, itemgetter
|
|||
from zipfile import ZipFile
|
||||
|
||||
if sys.version_info.major == 2:
|
||||
from itertools import ifilter as filter, ifilterfalse as filterfalse, imap as map
|
||||
from itertools import ifilter as filter, ifilterfalse as filterfalse, imap as map,\
|
||||
izip_longest as zip_longest
|
||||
from urllib2 import urlopen, Request as UrlRequest
|
||||
from urlparse import urlsplit, urlunsplit
|
||||
else:
|
||||
from itertools import filterfalse
|
||||
from itertools import filterfalse, zip_longest
|
||||
from urllib.request import urlopen, Request as UrlRequest
|
||||
from urllib.parse import urlsplit, urlunsplit
|
||||
|
||||
|
||||
# From https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Returns an iterator of n-tuples taken consecutively from |iterable|; the
    final tuple is padded with |fillvalue| when the input runs out early.

    Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # The same iterator object is repeated n times, so every tuple built by
    # zip_longest consumes n consecutive items from it.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
|
||||
|
||||
|
||||
def writeMappingHeader(println, description, source, url):
|
||||
if type(description) is not list:
|
||||
description = [description]
|
||||
|
@ -383,6 +392,419 @@ function updateGrandfatheredMappings(tag) {
|
|||
}""".lstrip("\n"))
|
||||
|
||||
|
||||
def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, mappings,
                              tag_maxlength, description, source, url):
    """ Emit code to perform a binary search on language tag subtags.

        Uses the contents of |mappings|, which can either be a dictionary or
        set, to emit a mapping function to find subtag replacements.

        A dictionary maps each subtag to its replacement and produces code
        that overwrites the subtag; any other collection produces a pure
        membership test. Subtags are grouped by length: a group whose length
        differs from |tag_maxlength| is wrapped in a length check, a group of
        one to four subtags is emitted as direct comparisons, and larger
        groups are emitted as sorted tables searched at run time.
    """
    # isinstance (rather than an exact type comparison) so that dict
    # subclasses are also treated as replacement mappings.
    has_replacements = isinstance(mappings, dict)

    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"""
bool js::intl::LanguageTag::{0}({1} {2}) {{
MOZ_ASSERT({3}({2}.range()));
""".format(fn_name, type_name, name, validate_fn).strip())

    def write_array(subtags, array_name, length, fixed):
        # |fixed| selects a two-dimensional char array sized for |length|
        # characters plus the terminator; otherwise an array of pointers.
        if fixed:
            println(u" static const char {}[{}][{}] = {{".format(array_name, len(subtags),
                                                               length + 1))
        else:
            println(u" static const char* {}[{}] = {{".format(array_name, len(subtags)))

        # Emit in groups of ten to not exceed the 80 line column limit.
        for entries in grouper(subtags, 10):
            # grouper pads the last group with None; drop the padding.
            entries = (u"\"{}\"".format(tag).rjust(length + 2)
                       for tag in entries if tag is not None)
            println(u" {},".format(u", ".join(entries)))

        println(u" };")

    def equals(subtag):
        return u"""{}.equalTo("{}")""".format(name, subtag)

    # Set to False once a group without a length check has been emitted,
    # because that group already returns on every path.
    trailing_return = True

    # Sort the subtags by length. That enables using an optimized comparator
    # for the binary search, which only performs a single |memcmp| for multiple
    # of two subtag lengths.
    mappings_keys = mappings.keys() if has_replacements else mappings
    for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
        # Omit the length check if the current length is the maximum length.
        if length != tag_maxlength:
            println(u"""
if ({}.length() == {}) {{
""".format(name, length).rstrip("\n"))
        else:
            trailing_return = False
            println(u"""
{
""".rstrip("\n"))

        # The subtags need to be sorted for binary search to work.
        subtags = sorted(subtags)

        # Don't emit a binary search for short lists.
        if len(subtags) == 1:
            if has_replacements:
                println(u"""
if ({}) {{
{}.set("{}");
return true;
}}
return false;
""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n"))
            else:
                println(u"""
return {};
""".format(equals(subtags[0])).strip("\n"))
        elif len(subtags) <= 4:
            if has_replacements:
                for subtag in subtags:
                    println(u"""
if ({}) {{
{}.set("{}");
return true;
}}
""".format(equals(subtag), name, mappings[subtag]).strip("\n"))

                println(u"""
return false;
""".strip("\n"))
            else:
                cond = (equals(subtag) for subtag in subtags)
                cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond)
                println(u"""
return {};
""".format(cond).strip("\n"))
        else:
            write_array(subtags, name + "s", length, True)

            if has_replacements:
                # The alias table is index-aligned with the subtag table.
                write_array([mappings[k] for k in subtags], u"aliases", length, False)

                println(u"""
if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
{0}.set(ConstCharRange(replacement, strlen(replacement)));
return true;
}}
return false;
""".format(name).rstrip())
            else:
                println(u"""
return HasReplacement({0}s, {0});
""".format(name).rstrip())

        println(u"""
}
""".strip("\n"))

    if trailing_return:
        println(u"""
return false;""")

    println(u"""
}""".lstrip("\n"))
|
||||
|
||||
|
||||
def writeComplexLanguageTagMappingsNative(println, complex_language_mappings,
                                          description, source, url):
    """ Writes the C++ definition of LanguageTag::performComplexLanguageMappings().

    |complex_language_mappings| maps a deprecated language subtag to a
    (language, script, region) triple, where script and region may be None.
    The emitted function replaces the language subtag and only fills in the
    script and region subtags when the tag doesn't already have one.

    |description|, |source|, and |url| are forwarded to writeMappingHeader().
    """
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"void js::intl::LanguageTag::performComplexLanguageMappings() {\n"
            u"  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));\n")

    ordered_mappings = sorted(complex_language_mappings.items(), key=itemgetter(0))

    # Group the deprecated subtags by their replacement. The first subtag (in
    # sorted order) of each group is the one which emits the branch, all other
    # group members are folded into that branch's condition.
    grouped = {}
    for (deprecated_language, (language, script, region)) in ordered_mappings:
        grouped.setdefault((language, script, region), []).append(deprecated_language)

    emitted_branch = False
    for (deprecated_language, (language, script, region)) in ordered_mappings:
        group = grouped[(language, script, region)]
        if deprecated_language != group[0]:
            # Already covered by the branch of the group's first member.
            continue

        if_kind = u"else if" if emitted_branch else u"if"
        emitted_branch = True

        # Align continuation lines with the first operand of the condition.
        separator = u" ||\n" + u" " * (2 + len(if_kind) + 2)
        cond = separator.join(u"language().equalTo(\"{}\")".format(lang)
                              for lang in group)

        println(u"  {} ({}) {{".format(if_kind, cond))
        println(u"    setLanguage(\"{}\");".format(language))

        if script is not None:
            println(u"    if (script().length() == 0) {{\n"
                    u"      setScript(\"{}\");\n"
                    u"    }}".format(script))
        if region is not None:
            println(u"    if (region().length() == 0) {{\n"
                    u"      setRegion(\"{}\");\n"
                    u"    }}".format(region))
        println(u"  }")

    println(u"}")
|
||||
|
||||
|
||||
def writeComplexRegionTagMappingsNative(println, complex_region_mappings,
                                        description, source, url):
    """ Writes the C++ definition of LanguageTag::performComplexRegionMappings().

    |complex_region_mappings| maps a deprecated region subtag to a pair
    (default, non_default_replacements). |default| is the region used when no
    other rule applies; |non_default_replacements| is a list of
    (language, script, region) triples (script may be None) selecting a
    different replacement region for specific language/script combinations.

    |description|, |source|, and |url| are forwarded to writeMappingHeader().
    """
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"""
void js::intl::LanguageTag::performComplexRegionMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
  MOZ_ASSERT(IsStructurallyValidRegionTag(region().range()));
""".lstrip())

    # |non_default_replacements| is a list and hence not hashable. Convert it
    # to a string to get a proper hashable value.
    def hash_key(default, non_default_replacements):
        return (default, str(sorted(str(v) for v in non_default_replacements)))

    # Returns the C++ expression testing for |language| and, if present,
    # |script|. Defined once here instead of once per loop iteration.
    def compare_tags(language, script):
        if script is None:
            return u"language().equalTo(\"{}\")".format(language)
        return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format(
            language, script)

    # Both passes below walk the mappings in the same sorted order.
    sorted_mappings = sorted(complex_region_mappings.items(), key=itemgetter(0))

    # Merge duplicate region entries.
    region_aliases = {}
    for (deprecated_region, (default, non_default_replacements)) in sorted_mappings:
        key = hash_key(default, non_default_replacements)
        if key not in region_aliases:
            region_aliases[key] = []
        else:
            region_aliases[key].append(deprecated_region)

    first_region = True
    for (deprecated_region, (default, non_default_replacements)) in sorted_mappings:
        key = hash_key(default, non_default_replacements)
        if deprecated_region in region_aliases[key]:
            # Emitted as part of the condition of an earlier region.
            continue

        if_kind = u"if" if first_region else u"else if"
        first_region = False

        cond = (u"region().equalTo(\"{}\")".format(region)
                for region in [deprecated_region] + region_aliases[key])
        cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)

        println(u"""
  {} ({}) {{""".format(if_kind, cond).strip("\n"))

        replacement_regions = sorted({region for (_, _, region) in non_default_replacements})

        if not replacement_regions:
            # Without any language specific replacement there is no |if| the
            # trailing |else| could attach to, so set the default region
            # unconditionally instead of emitting invalid C++.
            println(u"""
    setRegion("{}");
  }}""".format(default).strip("\n"))
            continue

        first_case = True
        for replacement_region in replacement_regions:
            # Sort by language only: |script| may be None and thus isn't
            # comparable to strings.
            replacement_language_script = sorted(((language, script)
                                                  for (language, script, region) in (
                                                      non_default_replacements
                                                  )
                                                  if region == replacement_region),
                                                 key=itemgetter(0))

            case_kind = u"if" if first_case else u"else if"
            first_case = False

            case_cond = (compare_tags(language, script)
                         for (language, script) in replacement_language_script)
            case_cond = (u" ||\n" + u" " * (4 + len(case_kind) + 2)).join(case_cond)

            println(u"""
    {} ({}) {{
      setRegion("{}");
    }}""".format(case_kind, case_cond, replacement_region).rstrip().strip("\n"))

        println(u"""
    else {{
      setRegion("{}");
    }}
  }}""".format(default).rstrip().strip("\n"))

    println(u"""
}
""".strip("\n"))
|
||||
|
||||
|
||||
def writeGrandfatheredMappingsFunctionNative(println, grandfathered_mappings,
                                             description, source, url):
    """ Writes a function definition that maps grandfathered language tags.

    |grandfathered_mappings| maps a grandfathered tag to its modern form. Each
    grandfathered tag must consist of exactly a language subtag and a single
    variant subtag; each modern form must not contain any variant subtags.
    These requirements are asserted below.

    |description|, |source|, and |url| are forwarded to writeMappingHeader().
    """
    println(u"")
    writeMappingHeader(println, description, source, url)
    println(u"""\
bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
  // We're mapping regular grandfathered tags to non-grandfathered form here.
  // Other tags remain unchanged.
  //
  // regular       = "art-lojban"
  //               / "cel-gaulish"
  //               / "no-bok"
  //               / "no-nyn"
  //               / "zh-guoyu"
  //               / "zh-hakka"
  //               / "zh-min"
  //               / "zh-min-nan"
  //               / "zh-xiang"
  //
  // Therefore we can quickly exclude most tags by checking every
  // |unicode_locale_id| subcomponent for characteristics not shared by any of
  // the regular grandfathered (RG) tags:
  //
  //   * Real-world |unicode_language_subtag|s are all two or three letters,
  //     so don't waste time running a useless |language.length > 3| fast-path.
  //   * No RG tag has a "script"-looking component.
  //   * No RG tag has a "region"-looking component.
  //   * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
  //     zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
  //     no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
  //     that |unicode_locale_id| doesn't support.)
  //   * No RG tag contains |extensions| or |pu_extensions|.
  if (script().length() != 0 ||
      region().length() != 0 ||
      variants().length() != 1 ||
      extensions().length() != 0 ||
      privateuse()) {
    return true;
  }

  auto variantEqualTo = [this](const char* variant) {
    return strcmp(variants()[0].get(), variant) == 0;
  };""")

    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
    #
    # Doesn't allow any 'extensions' subtags.
    re_unicode_locale_id = re.compile(
        r"""
        ^
        # unicode_language_id = unicode_language_subtag
        #     unicode_language_subtag = alpha{2,3} | alpha{5,8}
        (?P<language>[a-z]{2,3}|[a-z]{5,8})

        # (sep unicode_script_subtag)?
        #     unicode_script_subtag = alpha{4}
        (?:-(?P<script>[a-z]{4}))?

        # (sep unicode_region_subtag)?
        #     unicode_region_subtag = (alpha{2} | digit{3})
        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?

        # (sep unicode_variant_subtag)*
        #     unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?

        # pu_extensions?
        #     pu_extensions = sep [xX] (sep alphanum{1,8})+
        (?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))?
        $
        """, re.IGNORECASE | re.VERBOSE)

    is_first = True

    for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)):
        tag_match = re_unicode_locale_id.match(tag)
        # Name the offending tag so a failure is actionable.
        assert tag_match is not None, (
            "{} must be a valid Unicode BCP 47 locale identifier".format(tag))

        tag_language = tag_match.group("language")
        assert tag_match.group("script") is None, (
            "{} does not contain a script subtag".format(tag))
        assert tag_match.group("region") is None, (
            "{} does not contain a region subtag".format(tag))
        tag_variants = tag_match.group("variants")
        assert tag_variants is not None, (
            "{} contains a variant subtag".format(tag))
        assert tag_match.group("privateuse") is None, (
            "{} does not contain a privateuse subtag".format(tag))

        # The |variants| group includes the leading separator, strip it.
        tag_variant = tag_variants[1:]
        assert "-" not in tag_variant, (
            "{} contains only a single variant".format(tag))

        modern_match = re_unicode_locale_id.match(modern)
        # Name the offending replacement so a failure is actionable.
        assert modern_match is not None, (
            "{} (replacement for {}) must be a valid Unicode BCP 47 locale identifier".format(
                modern, tag))

        modern_language = modern_match.group("language")
        modern_script = modern_match.group("script")
        modern_region = modern_match.group("region")
        modern_variants = modern_match.group("variants")
        modern_privateuse = modern_match.group("privateuse")

        # The leading newline is kept on purpose: it separates the branches
        # with an empty line.
        println(u"""
  // {} -> {}
""".format(tag, modern).rstrip())

        println(u"""
  {}if (language().equalTo("{}") && variantEqualTo("{}")) {{
""".format("" if is_first else "else ",
           tag_language,
           tag_variant).rstrip().strip("\n"))

        is_first = False

        println(u"""
    setLanguage("{}");
""".format(modern_language).rstrip().strip("\n"))

        if modern_script is not None:
            println(u"""
    setScript("{}");
""".format(modern_script).rstrip().strip("\n"))

        if modern_region is not None:
            println(u"""
    setRegion("{}");
""".format(modern_region).rstrip().strip("\n"))

        assert modern_variants is None, (
            "all regular grandfathered tags' modern forms do not contain variant subtags")

        println(u"""
    clearVariants();
""".rstrip().strip("\n"))

        if modern_privateuse is not None:
            # Only the right side is stripped, so the emitted block is preceded
            # by an empty line. (|rstrip()| already removes trailing newlines.)
            println(u"""
    auto privateuse = DuplicateString(cx, "{}");
    if (!privateuse) {{
      return false;
    }}
    setPrivateuse(std::move(privateuse));
""".format(modern_privateuse).rstrip())

        println(u"""
    return true;
  }""".rstrip().strip("\n"))

    println(u"""
  return true;
}""")
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def TemporaryDirectory():
|
||||
tmpDir = tempfile.mkdtemp()
|
||||
|
@ -674,6 +1096,106 @@ def writeCLDRLanguageTagData(println, data, url):
|
|||
source, url)
|
||||
|
||||
|
||||
def writeCLDRLanguageTagDataNative(println, data, url):
    """ Writes the language tag data to the Intl data file.

    Emits, in this order: the generated-file warning, the C++ prologue
    (includes and the HasReplacement/SearchReplacement helpers), the four
    binary-search mapping functions, the two complex mapping functions, and
    the grandfathered tag mapping function.

    |data| must provide the keys "version", "grandfatheredMappings",
    "languageMappings", "complexLanguageMappings", "regionMappings", and
    "complexRegionMappings".
    """

    println(generatedFileWarning)

    println(u"""
#include "mozilla/Assertions.h"
#include "mozilla/Range.h"
#include "mozilla/TextUtils.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <type_traits>

#include "builtin/intl/LanguageTag.h"
#include "util/Text.h"
#include "vm/JSContext.h"

using ConstCharRange = mozilla::Range<const char>;

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
    const char (&subtags)[Length][TagLength],
    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.range().begin().get();
  return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
    return memcmp(a, b, TagLength - 1) < 0;
  });
}

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
    const char (&subtags)[Length][TagLength],
    const char* (&aliases)[Length],
    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.range().begin().get();
  auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
    return memcmp(a, b, TagLength - 1) < 0;
  });
  if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
    return aliases[std::distance(std::begin(subtags), p)];
  }
  return nullptr;
}
""".rstrip())

    source = u"CLDR Supplemental Data, version {}".format(data["version"])
    grandfathered_mappings = data["grandfatheredMappings"]
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]

    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
    language_maxlength = 8

    # unicode_region_subtag = (alpha{2} | digit{3}) ;
    region_maxlength = 3

    # One row per generated lookup function:
    #   (function name, parameter type, parameter name, validator function,
    #    mappings, keys only?, maximum subtag length, description)
    # Complex mappings only need a membership test, so just their keys are
    # handed over.
    binary_searches = (
        ("languageMapping", "LanguageSubtag&", "language",
         "IsStructurallyValidLanguageTag",
         language_mappings, False, language_maxlength,
         "Mappings from language subtags to preferred values."),
        ("complexLanguageMapping", "const LanguageSubtag&", "language",
         "IsStructurallyValidLanguageTag",
         complex_language_mappings, True, language_maxlength,
         "Language subtags with complex mappings."),
        ("regionMapping", "RegionSubtag&", "region",
         "IsStructurallyValidRegionTag",
         region_mappings, False, region_maxlength,
         "Mappings from region subtags to preferred values."),
        ("complexRegionMapping", "const RegionSubtag&", "region",
         "IsStructurallyValidRegionTag",
         complex_region_mappings, True, region_maxlength,
         "Region subtags with complex mappings."),
    )
    for (fn_name, param_type, param_name, validator,
         mappings, keys_only, maxlength, fn_description) in binary_searches:
        writeMappingsBinarySearch(println, fn_name,
                                  param_type, param_name,
                                  validator,
                                  mappings.keys() if keys_only else mappings, maxlength,
                                  fn_description, source, url)

    writeComplexLanguageTagMappingsNative(println, complex_language_mappings,
                                          "Language subtags with complex mappings.", source, url)
    writeComplexRegionTagMappingsNative(println, complex_region_mappings,
                                        "Region subtags with complex mappings.", source, url)

    writeGrandfatheredMappingsFunctionNative(println, grandfathered_mappings,
                                             "Canonicalize grandfathered locale identifiers.",
                                             source, url)
|
||||
|
||||
|
||||
def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
|
||||
""" Writes the likely-subtags test file. """
|
||||
|
||||
|
@ -886,6 +1408,13 @@ def updateCLDRLangTags(args):
|
|||
println(u"// Generated by make_intl_data.py. DO NOT EDIT.")
|
||||
writeCLDRLanguageTagData(println, data, url)
|
||||
|
||||
print("Writing Intl data...")
|
||||
native_out = "LanguageTagGenerated.cpp"
|
||||
# native_out = os.path.splitext(out)[0] + ".cpp"
|
||||
with io.open(native_out, mode="w", encoding="utf-8", newline="") as f:
|
||||
println = partial(print, file=f)
|
||||
writeCLDRLanguageTagDataNative(println, data, url)
|
||||
|
||||
print("Writing Intl test data...")
|
||||
test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
|
||||
"../../tests/non262/Intl/Locale/likely-subtags-generated.js")
|
||||
|
@ -894,7 +1423,7 @@ def updateCLDRLangTags(args):
|
|||
|
||||
println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl')||"
|
||||
u"(!this.Intl.Locale&&!this.hasOwnProperty('addIntlExtras')))")
|
||||
println(u"// Generated by make_intl_data.py. DO NOT EDIT.")
|
||||
println(generatedFileWarning)
|
||||
writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
|
||||
|
||||
|
||||
|
@ -1780,6 +2309,179 @@ def writeUnicodeExtensionsFile(version, url, mapping, out):
|
|||
println(u" },")
|
||||
println(u"};")
|
||||
|
||||
with io.open(os.path.splitext(out)[0] + ".cpp", mode="w", encoding="utf-8", newline="") as f:
|
||||
println = partial(print, file=f)
|
||||
|
||||
println(generatedFileWarning)
|
||||
println(u"// Version: CLDR-{}".format(version))
|
||||
println(u"// URL: {}".format(url))
|
||||
|
||||
println(u"""
|
||||
#include "mozilla/Assertions.h"
|
||||
#include "mozilla/Range.h"
|
||||
#include "mozilla/TextUtils.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include "builtin/intl/LanguageTag.h"
|
||||
|
||||
using namespace js::intl::LanguageTagLimits;
|
||||
|
||||
using ConstCharRange = mozilla::Range<const char>;
|
||||
|
||||
template <size_t Length>
|
||||
static inline bool IsUnicodeKey(const ConstCharRange& key,
|
||||
const char (&str)[Length]) {
|
||||
static_assert(Length == UnicodeKeyLength + 1,
|
||||
"Unicode extension key is two characters long");
|
||||
return memcmp(key.begin().get(), str, Length - 1) == 0;
|
||||
}
|
||||
|
||||
template <size_t Length>
|
||||
static inline bool IsUnicodeType(const ConstCharRange& type,
|
||||
const char (&str)[Length]) {
|
||||
static_assert(Length > UnicodeKeyLength + 1,
|
||||
"Unicode extension type contains more than two characters");
|
||||
return type.length() == (Length - 1) &&
|
||||
memcmp(type.begin().get(), str, Length - 1) == 0;
|
||||
}
|
||||
|
||||
static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) {
|
||||
#ifdef DEBUG
|
||||
auto isNull = [](char c) {
|
||||
return c == '\\0';
|
||||
};
|
||||
#endif
|
||||
|
||||
MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull),
|
||||
"unexpected null-character in string");
|
||||
|
||||
using UnsignedChar = unsigned char;
|
||||
for (size_t i = 0; i < b.length(); i++) {
|
||||
// |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
|
||||
// we've reached the end of |a|, the below if-statement will always be true.
|
||||
// That ensures we don't read past the end of |a|.
|
||||
if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
// Return zero if both strings are equal or a negative number if |b| is a
|
||||
// prefix of |a|.
|
||||
return -int32_t(UnsignedChar(a[b.length()]));
|
||||
};
|
||||
|
||||
template <size_t Length>
|
||||
static inline const char* SearchReplacement(const char* (&types)[Length],
|
||||
const char* (&aliases)[Length],
|
||||
const ConstCharRange& type) {
|
||||
|
||||
auto p = std::lower_bound(std::begin(types), std::end(types), type,
|
||||
[](const auto& a, const auto& b) {
|
||||
return CompareUnicodeType(a, b) < 0;
|
||||
});
|
||||
if (p != std::end(types) && CompareUnicodeType(*p, type) == 0) {
|
||||
return aliases[std::distance(std::begin(types), p)];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
""".rstrip("\n"))
|
||||
|
||||
println(u"""
|
||||
/**
|
||||
* Mapping from deprecated BCP 47 Unicode extension types to their preferred
|
||||
* values.
|
||||
*
|
||||
* Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
|
||||
*/
|
||||
const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
|
||||
const ConstCharRange& key, const ConstCharRange& type) {
|
||||
#ifdef DEBUG
|
||||
static auto isAsciiLowercaseAlphanumeric = [](char c) {
|
||||
return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
|
||||
};
|
||||
|
||||
static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
|
||||
return isAsciiLowercaseAlphanumeric(c) || c == '-';
|
||||
};
|
||||
#endif
|
||||
|
||||
MOZ_ASSERT(key.length() == UnicodeKeyLength);
|
||||
MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(),
|
||||
isAsciiLowercaseAlphanumeric));
|
||||
|
||||
MOZ_ASSERT(type.length() > UnicodeKeyLength);
|
||||
MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(),
|
||||
isAsciiLowercaseAlphanumericOrDash));
|
||||
""")
|
||||
|
||||
def to_hash_key(replacements):
|
||||
return str(sorted([str((k, v["preferred"])) for (k, v) in replacements.items()]))
|
||||
|
||||
def write_array(subtags, name, length):
|
||||
max_entries = (80 - len(" ")) // (length + len('"", '))
|
||||
|
||||
println(u" static const char* {}[{}] = {{".format(name, len(subtags)))
|
||||
|
||||
for entries in grouper(subtags, max_entries):
|
||||
entries = (u"\"{}\"".format(tag).rjust(length + 2)
|
||||
for tag in entries if tag is not None)
|
||||
println(u" {},".format(u", ".join(entries)))
|
||||
|
||||
println(u" };")
|
||||
|
||||
# Merge duplicate keys.
|
||||
key_aliases = {}
|
||||
for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
|
||||
hash_key = to_hash_key(replacements)
|
||||
if hash_key not in key_aliases:
|
||||
key_aliases[hash_key] = []
|
||||
else:
|
||||
key_aliases[hash_key].append(key)
|
||||
|
||||
first_key = True
|
||||
for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
|
||||
hash_key = to_hash_key(replacements)
|
||||
if key in key_aliases[hash_key]:
|
||||
continue
|
||||
|
||||
cond = (u"IsUnicodeKey(key, \"{}\")".format(k) for k in [key] + key_aliases[hash_key])
|
||||
|
||||
if_kind = u"if" if first_key else u"else if"
|
||||
cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
|
||||
println(u"""
|
||||
{} ({}) {{""".format(if_kind, cond).strip("\n"))
|
||||
first_key = False
|
||||
|
||||
replacements = sorted(replacements.items(), key=itemgetter(0))
|
||||
|
||||
if len(replacements) > 4:
|
||||
types = [t for (t, _) in replacements]
|
||||
preferred = [r["preferred"] for (_, r) in replacements]
|
||||
max_len = max(len(k) for k in types + preferred)
|
||||
|
||||
write_array(types, "types", max_len)
|
||||
write_array(preferred, "aliases", max_len)
|
||||
println(u"""
|
||||
return SearchReplacement(types, aliases, type);
|
||||
""".strip("\n"))
|
||||
else:
|
||||
for (type, replacement) in replacements:
|
||||
println(u"""
|
||||
if (IsUnicodeType(type, "{}")) {{
|
||||
return "{}";
|
||||
}}""".format(type, replacement["preferred"]).strip("\n"))
|
||||
|
||||
println(u"""
|
||||
}""".lstrip("\n"))
|
||||
|
||||
println(u"""
|
||||
return nullptr;
|
||||
}
|
||||
""".strip("\n"))
|
||||
|
||||
|
||||
def updateUnicodeExtensions(args):
|
||||
""" Update the UnicodeExtensionsGenerated.js file. """
|
||||
|
|
|
@ -380,11 +380,14 @@ if CONFIG['ENABLE_INTL_API']:
|
|||
'builtin/intl/CommonFunctions.cpp',
|
||||
'builtin/intl/DateTimeFormat.cpp',
|
||||
'builtin/intl/IntlObject.cpp',
|
||||
'builtin/intl/LanguageTag.cpp',
|
||||
'builtin/intl/LanguageTagGenerated.cpp',
|
||||
'builtin/intl/Locale.cpp',
|
||||
'builtin/intl/NumberFormat.cpp',
|
||||
'builtin/intl/PluralRules.cpp',
|
||||
'builtin/intl/RelativeTimeFormat.cpp',
|
||||
'builtin/intl/SharedIntlData.cpp',
|
||||
'builtin/intl/UnicodeExtensionsGenerated.cpp',
|
||||
]
|
||||
|
||||
if CONFIG['MOZ_INSTRUMENTS']:
|
||||
|
|
Загрузка…
Ссылка в новой задаче