Bug 1570370 - Part 1: Port Unicode BCP 47 locale identifier parser to C++. r=jwalden

Differential Revision: https://phabricator.services.mozilla.com/D40067

--HG--
extra : moz-landing-system : lando
This commit is contained in:
André Bargull 2019-10-11 20:05:43 +00:00
Родитель 8b869f9cf6
Коммит 4a8f76c4de
7 изменённых файлов: 3702 добавлений и 3 удалений

Просмотреть файл

@ -4,7 +4,9 @@ build/clang-plugin/.*
config/gcc-stl-wrapper.template.h
config/msvc-stl-wrapper.template.h
# Generated code
js/src/builtin/intl/LanguageTagGenerated.cpp
js/src/builtin/intl/TimeZoneDataGenerated.h
js/src/builtin/intl/UnicodeExtensionsGenerated.cpp
# Don't want to reformat irregexp. bug 1510128
js/src/irregexp/.*

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,689 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Structured representation of Unicode locale IDs used with Intl functions. */
#ifndef builtin_intl_LanguageTag_h
#define builtin_intl_LanguageTag_h
#include "mozilla/Assertions.h"
#include "mozilla/Range.h"
#include "mozilla/TextUtils.h"
#include "mozilla/TypedEnumBits.h"
#include "mozilla/Variant.h"
#include <algorithm>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <utility>
#include "js/AllocPolicy.h"
#include "js/GCAPI.h"
#include "js/Result.h"
#include "js/Utility.h"
#include "js/Vector.h"
struct JSContext;
class JSLinearString;
class JSString;
namespace js {
class StringBuffer;
namespace intl {
#ifdef DEBUG
// Structural validators for individual subtags. They are only declared when
// DEBUG is defined; within this header they are referenced exclusively from
// MOZ_ASSERT conditions.
/**
* Return true if |language| is a valid, case-normalized language subtag.
*/
template <typename CharT>
bool IsStructurallyValidLanguageTag(
const mozilla::Range<const CharT>& language);
/**
* Return true if |script| is a valid, case-normalized script subtag.
*/
template <typename CharT>
bool IsStructurallyValidScriptTag(const mozilla::Range<const CharT>& script);
/**
* Return true if |region| is a valid, case-normalized region subtag.
*/
template <typename CharT>
bool IsStructurallyValidRegionTag(const mozilla::Range<const CharT>& region);
/**
* Return true if |variant| is a valid, case-normalized variant subtag.
*/
bool IsStructurallyValidVariantTag(const mozilla::Range<const char>& variant);
/**
* Return true if |extension| is a valid, case-normalized Unicode extension
* subtag.
*/
bool IsStructurallyValidUnicodeExtensionTag(
const mozilla::Range<const char>& extension);
/**
* Return true if |privateUse| is a valid, case-normalized private-use subtag.
*/
bool IsStructurallyValidPrivateUseTag(
const mozilla::Range<const char>& privateUse);
#endif
/**
 * Map the ASCII character |c| to its lower case form. Only the upper case
 * letters are changed; every other character is returned as-is.
 *
 * |c| must be an ASCII character (asserted).
 */
template <typename CharT>
char AsciiToLowerCase(CharT c) {
  MOZ_ASSERT(mozilla::IsAscii(c));
  if (!mozilla::IsAsciiUppercaseAlpha(c)) {
    return c;
  }
  // Upper and lower case ASCII letters differ only in bit 0x20.
  return c | 0x20;
}
/**
 * Map the ASCII character |c| to its upper case form. Only the lower case
 * letters are changed; every other character is returned as-is.
 *
 * |c| must be an ASCII character (asserted).
 */
template <typename CharT>
char AsciiToUpperCase(CharT c) {
  MOZ_ASSERT(mozilla::IsAscii(c));
  if (!mozilla::IsAsciiLowercaseAlpha(c)) {
    return c;
  }
  // Upper and lower case ASCII letters differ only in bit 0x20.
  return c & ~0x20;
}
/**
 * Write the lower case form of the |length| ASCII characters starting at
 * |chars| to |dest|. This header also calls it with |dest| equal to |chars|
 * to convert a buffer in place.
 */
template <typename CharT>
void AsciiToLowerCase(CharT* chars, size_t length, char* dest) {
  // Tell the analysis the |std::transform| function can't GC.
  JS::AutoSuppressGCAnalysis nogc;

  CharT* const end = chars + length;
  std::transform(chars, end, dest,
                 [](CharT ch) -> char { return AsciiToLowerCase(ch); });
}
/**
 * Write the upper case form of the |length| ASCII characters starting at
 * |chars| to |dest|. This header also calls it with |dest| equal to |chars|
 * to convert a buffer in place.
 */
template <typename CharT>
void AsciiToUpperCase(CharT* chars, size_t length, char* dest) {
  // Tell the analysis the |std::transform| function can't GC.
  JS::AutoSuppressGCAnalysis nogc;

  CharT* const end = chars + length;
  std::transform(chars, end, dest,
                 [](CharT ch) -> char { return AsciiToUpperCase(ch); });
}
/**
 * Write the title case form of the |length| ASCII characters starting at
 * |chars| to |dest|: the first character is upper-cased and all remaining
 * characters are lower-cased. Does nothing when |length| is zero.
 */
template <typename CharT>
void AsciiToTitleCase(CharT* chars, size_t length, char* dest) {
  if (length == 0) {
    return;
  }

  // First character.
  AsciiToUpperCase(chars, 1, dest);

  // Everything after the first character.
  AsciiToLowerCase(chars + 1, length - 1, dest + 1);
}
// Maximum (or exact) lengths of the individual language tag subtags.
namespace LanguageTagLimits {

// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
static constexpr size_t LanguageLength = 8;

// unicode_script_subtag = alpha{4} ;
static constexpr size_t ScriptLength = 4;

// unicode_region_subtag = (alpha{2} | digit{3}) ;
static constexpr size_t AlphaRegionLength = 2;
static constexpr size_t DigitRegionLength = 3;

// A region subtag must be able to hold either of the two forms above.
static constexpr size_t RegionLength = DigitRegionLength;
static_assert(AlphaRegionLength <= RegionLength &&
                  DigitRegionLength <= RegionLength,
              "RegionLength must fit both alphabetic and numeric regions");

// key = alphanum alpha ;
static constexpr size_t UnicodeKeyLength = 2;

// tkey = alpha digit ;
static constexpr size_t TransformKeyLength = 2;

}  // namespace LanguageTagLimits
// Fixed size language subtag which is stored inline in LanguageTag.
//
// Stores at most |Length| characters together with the number of characters
// currently in use. Instances can be neither copied nor assigned; the contents
// are replaced through |set|.
template <size_t Length>
class LanguageTagSubtag final {
  uint8_t length_ = 0;
  char chars_[Length];

 public:
  LanguageTagSubtag() = default;

  LanguageTagSubtag(const LanguageTagSubtag&) = delete;
  LanguageTagSubtag& operator=(const LanguageTagSubtag&) = delete;

  // Number of characters currently stored.
  size_t length() const { return length_; }

  // View over the characters currently stored.
  mozilla::Range<const char> range() const {
    return mozilla::Range<const char>(chars_, length_);
  }

  // Replace the contents with the characters of |str|, which must not be
  // longer than |Length| (asserted).
  template <typename CharT>
  void set(const mozilla::Range<const CharT>& str) {
    const size_t count = str.length();
    MOZ_ASSERT(count <= Length);

    const CharT* source = str.begin().get();
    std::copy(source, source + count, chars_);
    length_ = count;
  }

  // In-place case conversions of the stored characters.
  void toLowerCase() { AsciiToLowerCase(chars_, length(), chars_); }
  void toUpperCase() { AsciiToUpperCase(chars_, length(), chars_); }
  void toTitleCase() { AsciiToTitleCase(chars_, length(), chars_); }

  // Return true if the stored characters are exactly the string literal |str|
  // (not counting its terminating NUL).
  template <size_t N>
  bool equalTo(const char (&str)[N]) const {
    static_assert(N - 1 <= Length,
                  "subtag literals must not exceed the maximum subtag length");

    constexpr size_t literalLength = N - 1;
    if (length_ != literalLength) {
      return false;
    }
    return memcmp(chars_, str, literalLength) == 0;
  }
};
// Inline subtag storage types, each sized by the matching constant in
// LanguageTagLimits (language: LanguageLength, script: ScriptLength,
// region: RegionLength).
using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
/**
 * Object representing a language tag.
 *
 * All subtags are already in canonicalized case.
 */
class MOZ_STACK_CLASS LanguageTag final {
  LanguageSubtag language_ = {};
  ScriptSubtag script_ = {};
  RegionSubtag region_ = {};

  using VariantsVector = Vector<JS::UniqueChars, 2>;
  using ExtensionsVector = Vector<JS::UniqueChars, 2>;

  VariantsVector variants_;
  ExtensionsVector extensions_;
  JS::UniqueChars privateuse_ = nullptr;

  friend class LanguageTagParser;

 public:
  // Flag to request canonicalized Unicode extensions.
  enum class UnicodeExtensionCanonicalForm : bool { No, Yes };

 private:
  bool canonicalizeUnicodeExtension(
      JSContext* cx, JS::UniqueChars& unicodeExtension,
      UnicodeExtensionCanonicalForm canonicalForm);

  bool canonicalizeTransformExtension(JSContext* cx,
                                      JS::UniqueChars& transformExtension);

 public:
  // Replace |language| with its preferred value, if it has one. Returns true
  // iff |language| was replaced.
  static bool languageMapping(LanguageSubtag& language);

  // Returns true iff |language| is one of the language subtags with a complex
  // mapping.
  static bool complexLanguageMapping(const LanguageSubtag& language);

 private:
  // Replace |region| with its preferred value, if it has one. Returns true
  // iff |region| was replaced.
  static bool regionMapping(RegionSubtag& region);

  static bool complexRegionMapping(const RegionSubtag& region);

  void performComplexLanguageMappings();
  void performComplexRegionMappings();

  MOZ_MUST_USE bool updateGrandfatheredMappings(JSContext* cx);

  static const char* replaceUnicodeExtensionType(
      const mozilla::Range<const char>& key,
      const mozilla::Range<const char>& type);

 public:
  explicit LanguageTag(JSContext* cx) : variants_(cx), extensions_(cx) {}

  LanguageTag(const LanguageTag&) = delete;
  LanguageTag& operator=(const LanguageTag&) = delete;

  const LanguageSubtag& language() const { return language_; }
  const ScriptSubtag& script() const { return script_; }
  const RegionSubtag& region() const { return region_; }
  const auto& variants() const { return variants_; }
  const auto& extensions() const { return extensions_; }
  const char* privateuse() const { return privateuse_.get(); }

  /**
   * Set the language subtag. The input must be a valid, case-normalized
   * language subtag.
   */
  template <size_t N>
  void setLanguage(const char (&language)[N]) {
    mozilla::Range<const char> range(language, N - 1);
    MOZ_ASSERT(IsStructurallyValidLanguageTag(range));
    language_.set(range);
  }

  /**
   * Set the language subtag. The input must be a valid, case-normalized
   * language subtag.
   */
  void setLanguage(const LanguageSubtag& language) {
    MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));
    language_.set(language.range());
  }

  /**
   * Set the script subtag. The input must be a valid, case-normalized
   * script subtag or the empty string.
   */
  template <size_t N>
  void setScript(const char (&script)[N]) {
    mozilla::Range<const char> range(script, N - 1);
    // |N == 1| is the empty string literal, which is explicitly allowed (it
    // clears the script), matching the |ScriptSubtag| overload below.
    MOZ_ASSERT(N == 1 || IsStructurallyValidScriptTag(range));
    script_.set(range);
  }

  /**
   * Set the script subtag. The input must be a valid, case-normalized
   * script subtag or the empty string.
   */
  void setScript(const ScriptSubtag& script) {
    MOZ_ASSERT(script.length() == 0 ||
               IsStructurallyValidScriptTag(script.range()));
    script_.set(script.range());
  }

  /**
   * Set the region subtag. The input must be a valid, case-normalized
   * region subtag or the empty string.
   */
  template <size_t N>
  void setRegion(const char (&region)[N]) {
    mozilla::Range<const char> range(region, N - 1);
    // |N == 1| is the empty string literal, which is explicitly allowed (it
    // clears the region), matching the |RegionSubtag| overload below.
    MOZ_ASSERT(N == 1 || IsStructurallyValidRegionTag(range));
    region_.set(range);
  }

  /**
   * Set the region subtag. The input must be a valid, case-normalized
   * region subtag or the empty string.
   */
  void setRegion(const RegionSubtag& region) {
    MOZ_ASSERT(region.length() == 0 ||
               IsStructurallyValidRegionTag(region.range()));
    region_.set(region.range());
  }

  /**
   * Removes all variant subtags.
   */
  void clearVariants() { variants_.clearAndFree(); }

  /**
   * Set the Unicode extension subtag. The input must be a valid,
   * case-normalized Unicode extension subtag.
   */
  bool setUnicodeExtension(JS::UniqueChars extension);

  /**
   * Set the private-use subtag. The input must be a valid, case-normalized
   * private-use subtag or the empty string.
   */
  void setPrivateuse(JS::UniqueChars privateuse) {
    MOZ_ASSERT(!privateuse ||
               IsStructurallyValidPrivateUseTag(
                   {privateuse.get(), strlen(privateuse.get())}));
    privateuse_ = std::move(privateuse);
  }

  /**
   * Canonicalize the base-name subtags, that means the language, script,
   * region, and variant subtags.
   */
  bool canonicalizeBaseName(JSContext* cx);

  /**
   * Canonicalize all extension subtags.
   */
  bool canonicalizeExtensions(JSContext* cx,
                              UnicodeExtensionCanonicalForm canonicalForm);

  /**
   * Canonicalizes the given structurally valid Unicode BCP 47 locale
   * identifier, including regularized case of subtags. For example, the
   * language tag Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
   * where
   *
   *     Zh             ; 2*3ALPHA
   *     -haNS          ; ["-" script]
   *     -bu            ; ["-" region]
   *     -variant2      ; *("-" variant)
   *     -Variant1
   *     -u-ca-chinese  ; *("-" extension)
   *     -t-Zh-laTN
   *     -x-PRIVATE     ; ["-" privateuse]
   *
   * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
   *
   * UTS 35 specifies two different canonicalization algorithms. There's one to
   * canonicalize BCP 47 language tags and another one to canonicalize Unicode
   * locale identifiers. The latter one wasn't present when ECMA-402 was changed
   * to use Unicode BCP 47 locale identifiers instead of BCP 47 language tags,
   * so ECMA-402 currently only uses the former to canonicalize Unicode BCP 47
   * locale identifiers.
   *
   * Spec: ECMAScript Internationalization API Specification, 6.2.3.
   * Spec:
   * https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
   * Spec: https://unicode.org/reports/tr35/#BCP_47_Language_Tag_Conversion
   */
  bool canonicalize(JSContext* cx,
                    UnicodeExtensionCanonicalForm canonicalForm) {
    // Extensions are only canonicalized if the base name canonicalization
    // succeeded.
    return canonicalizeBaseName(cx) &&
           canonicalizeExtensions(cx, canonicalForm);
  }

  /**
   * Append the string representation of this language tag to the given
   * string buffer.
   */
  bool appendTo(JSContext* cx, StringBuffer& sb) const;

  /**
   * Add likely-subtags to the language tag.
   *
   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
   */
  bool addLikelySubtags(JSContext* cx);

  /**
   * Remove likely-subtags from the language tag.
   *
   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
   */
  bool removeLikelySubtags(JSContext* cx);
};
/**
 * Parser for Unicode BCP 47 locale identifiers.
 *
 * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
 */
class MOZ_STACK_CLASS LanguageTagParser final {
 public:
  // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
  enum class TokenKind : uint8_t {
    None = 0b000,
    Alpha = 0b001,
    Digit = 0b010,
    AlphaDigit = 0b011,
    Error = 0b100
  };

 private:
  // A token is described by its kind and by the position and length of its
  // characters within the parser input.
  class Token final {
    size_t index_;
    size_t length_;
    TokenKind kind_;

   public:
    Token(TokenKind kind, size_t index, size_t length)
        : index_(index), length_(length), kind_(kind) {}

    TokenKind kind() const { return kind_; }
    size_t index() const { return index_; }
    size_t length() const { return length_; }

    bool isError() const { return kind_ == TokenKind::Error; }
    bool isNone() const { return kind_ == TokenKind::None; }
    bool isAlpha() const { return kind_ == TokenKind::Alpha; }
    bool isDigit() const { return kind_ == TokenKind::Digit; }
    bool isAlphaDigit() const { return kind_ == TokenKind::AlphaDigit; }
  };

  using LocaleChars = mozilla::Variant<const JS::Latin1Char*, const char16_t*>;

  // The parser only holds a reference to the input characters.
  const LocaleChars& locale_;
  size_t length_;
  size_t index_ = 0;

  LanguageTagParser(const LocaleChars& locale, size_t length)
      : locale_(locale), length_(length) {}

  // Return the code unit at |index|, for either character encoding.
  char16_t charAtUnchecked(size_t index) const {
    if (locale_.is<const JS::Latin1Char*>()) {
      return locale_.as<const JS::Latin1Char*>()[index];
    }
    return locale_.as<const char16_t*>()[index];
  }

  // Return the character at |index|, which must be ASCII (asserted).
  char charAt(size_t index) const {
    char16_t c = charAtUnchecked(index);
    MOZ_ASSERT(mozilla::IsAscii(c));
    return c;
  }

  // Copy the token characters into |subtag|.
  template <size_t N>
  void copyChars(const Token& tok, LanguageTagSubtag<N>& subtag) const {
    size_t index = tok.index();
    size_t length = tok.length();
    if (locale_.is<const JS::Latin1Char*>()) {
      using T = const JS::Latin1Char;
      subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length));
    } else {
      using T = const char16_t;
      subtag.set(mozilla::Range<T>(locale_.as<T*>() + index, length));
    }
  }

  // Create a string copy of |length| characters starting at |index|.
  JS::UniqueChars chars(JSContext* cx, size_t index, size_t length) const;

  // Create a string copy of the token characters.
  JS::UniqueChars chars(JSContext* cx, const Token& tok) const {
    return chars(cx, tok.index(), tok.length());
  }

  Token nextToken();

  JS::UniqueChars extension(JSContext* cx, const Token& start,
                            const Token& end) const;

  // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
  //
  // Four character language subtags are not allowed in Unicode BCP 47 locale
  // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
  // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
  bool isLanguage(const Token& tok) const {
    return tok.isAlpha() && ((2 <= tok.length() && tok.length() <= 3) ||
                             (5 <= tok.length() && tok.length() <= 8));
  }

  // unicode_script_subtag = alpha{4} ;
  bool isScript(const Token& tok) const {
    return tok.isAlpha() && tok.length() == 4;
  }

  // unicode_region_subtag = (alpha{2} | digit{3}) ;
  bool isRegion(const Token& tok) const {
    return (tok.isAlpha() && tok.length() == 2) ||
           (tok.isDigit() && tok.length() == 3);
  }

  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
  bool isVariant(const Token& tok) const {
    return (5 <= tok.length() && tok.length() <= 8) ||
           (tok.length() == 4 && mozilla::IsAsciiDigit(charAt(tok.index())));
  }

  // Returns the code unit of the first character at the given singleton token.
  // Always returns the lower case form of an alphabetical character.
  char singletonKey(const Token& tok) const {
    MOZ_ASSERT(tok.length() == 1);
    // Reuse the shared case-folding helper instead of duplicating its logic.
    // |charAt| asserts that the character is ASCII, which is also the
    // precondition of |AsciiToLowerCase|.
    return AsciiToLowerCase(charAt(tok.index()));
  }

  // extensions = unicode_locale_extensions |
  //              transformed_extensions |
  //              other_extensions ;
  //
  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;
  //
  // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
  //                                    (sep tfield)+) ;
  //
  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
  bool isExtensionStart(const Token& tok) const {
    return tok.length() == 1 && singletonKey(tok) != 'x';
  }

  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
  bool isOtherExtensionPart(const Token& tok) const {
    return 2 <= tok.length() && tok.length() <= 8;
  }

  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
  //                                       (sep attribute)+ (sep keyword)*) ;
  // keyword = key (sep type)? ;
  bool isUnicodeExtensionPart(const Token& tok) const {
    return isUnicodeExtensionKey(tok) || isUnicodeExtensionType(tok) ||
           isUnicodeExtensionAttribute(tok);
  }

  // attribute = alphanum{3,8} ;
  bool isUnicodeExtensionAttribute(const Token& tok) const {
    return 3 <= tok.length() && tok.length() <= 8;
  }

  // key = alphanum alpha ;
  bool isUnicodeExtensionKey(const Token& tok) const {
    return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index() + 1));
  }

  // type = alphanum{3,8} (sep alphanum{3,8})* ;
  bool isUnicodeExtensionType(const Token& tok) const {
    return 3 <= tok.length() && tok.length() <= 8;
  }

  // tkey = alpha digit ;
  bool isTransformExtensionKey(const Token& tok) const {
    return tok.length() == 2 && mozilla::IsAsciiAlpha(charAt(tok.index())) &&
           mozilla::IsAsciiDigit(charAt(tok.index() + 1));
  }

  // tvalue = (sep alphanum{3,8})+ ;
  bool isTransformExtensionPart(const Token& tok) const {
    return 3 <= tok.length() && tok.length() <= 8;
  }

  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
  bool isPrivateUseStart(const Token& tok) const {
    return tok.length() == 1 && singletonKey(tok) == 'x';
  }

  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
  bool isPrivateUsePart(const Token& tok) const {
    return 1 <= tok.length() && tok.length() <= 8;
  }

  enum class BaseNameParsing : bool { Normal, WithinTransformExtension };

  // Helper function for use in |parseBaseName| and
  // |parseTlangInTransformExtension|. Do not use this directly!
  static JS::Result<bool> internalParseBaseName(JSContext* cx,
                                                LanguageTagParser& ts,
                                                LanguageTag& tag, Token& tok,
                                                BaseNameParsing parseType);

  // Parse the `unicode_language_id` production, i.e. the
  // language/script/region/variants portion of a language tag, into |tag|,
  // which will be filled with canonical-cased components (lowercase language,
  // titlecase script, uppercase region, lowercased and alphabetized and
  // deduplicated variants). |tok| must be the current token.
  static JS::Result<bool> parseBaseName(JSContext* cx, LanguageTagParser& ts,
                                        LanguageTag& tag, Token& tok) {
    return internalParseBaseName(cx, ts, tag, tok, BaseNameParsing::Normal);
  }

  // Parse the `tlang` production within a parsed 't' transform extension.
  // The precise requirements for "previously parsed" are:
  //
  //   * the input begins from current token |tok| with a valid `tlang`
  //     (asserted below through |isLanguage|)
  //   * the `tlang` is wholly lowercase (*not* canonical case)
  //   * variant subtags in the `tlang` may contain duplicates and be
  //     unordered
  //
  // Return an error on internal failure. Otherwise, return a success value and
  // fill |tag| with the subtags of the `tlang`.
  //
  // NOTE(review): an earlier version of this comment referred to
  // |tag.language().missing()|, which |LanguageSubtag| does not provide, and
  // stated both that variants may be unordered/duplicated and that |tag| holds
  // them "in alphabetical order without duplicates". Confirm the actual
  // variant handling against |internalParseBaseName|.
  static JS::Result<JS::Ok> parseTlangInTransformExtension(
      JSContext* cx, LanguageTagParser& ts, LanguageTag& tag, Token& tok) {
    MOZ_ASSERT(ts.isLanguage(tok));
    return internalParseBaseName(cx, ts, tag, tok,
                                 BaseNameParsing::WithinTransformExtension)
        .map([](bool parsed) {
          MOZ_ASSERT(parsed);
          return JS::Ok();
        });
  }

  friend class LanguageTag;

  // A (begin offset, length) pair which is resolved against a base pointer.
  class Range final {
    size_t begin_;
    size_t length_;

   public:
    Range(size_t begin, size_t length) : begin_(begin), length_(length) {}

    template <typename T>
    T* begin(T* ptr) const {
      return ptr + begin_;
    }

    size_t length() const { return length_; }
  };

  using TFieldVector = js::Vector<Range, 8>;
  using AttributesVector = js::Vector<Range, 8>;
  using KeywordsVector = js::Vector<Range, 8>;

  // Parse |extension|, which must be a validated, fully lowercase
  // `transformed_extensions` subtag, and fill |tag| and |fields| from the
  // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
  // with |extension|.
  static JS::Result<bool> parseTransformExtension(
      JSContext* cx, mozilla::Range<const char> extension, LanguageTag& tag,
      TFieldVector& fields);

  // Parse |extension|, which must be a validated, fully lowercase
  // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
  // from the `attribute` and `keyword` components.
  static JS::Result<bool> parseUnicodeExtension(
      JSContext* cx, mozilla::Range<const char> extension,
      AttributesVector& attributes, KeywordsVector& keywords);

 public:
  // Parse the input string as a language tag. Reports an error to the context
  // if the input can't be parsed completely.
  static bool parse(JSContext* cx, JSLinearString* locale, LanguageTag& tag);

  // Parse the input string as a language tag. Returns Ok(true) if the input
  // could be completely parsed, Ok(false) if the input couldn't be parsed,
  // or Err() in case of internal error.
  static JS::Result<bool> tryParse(JSContext* cx, JSLinearString* locale,
                                   LanguageTag& tag);

  // Parse the input string as the base-name parts (language, script, region,
  // variants) of a language tag. Ignores any trailing characters.
  static bool parseBaseName(JSContext* cx, mozilla::Range<const char> locale,
                            LanguageTag& tag);

  // Return true iff |extension| can be parsed as a Unicode extension subtag.
  static bool canParseUnicodeExtension(mozilla::Range<const char> extension);

  // Return true iff |unicodeType| can be parsed as a Unicode extension type.
  static bool canParseUnicodeExtensionType(JSLinearString* unicodeType);
};
// Generate bitwise operators for the |TokenKind| enum class. This is the
// reason |TokenKind| is declared |public| in |LanguageTagParser|.
MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LanguageTagParser::TokenKind)
} // namespace intl
} // namespace js
#endif /* builtin_intl_LanguageTag_h */

Просмотреть файл

@ -0,0 +1,615 @@
// Generated by make_intl_data.py. DO NOT EDIT.
#include "mozilla/Assertions.h"
#include "mozilla/Range.h"
#include "mozilla/TextUtils.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <type_traits>
#include "builtin/intl/LanguageTag.h"
#include "util/Text.h"
#include "vm/JSContext.h"
using ConstCharRange = mozilla::Range<const char>;
// Return true iff |subtag| is contained in |subtags|.
//
// |subtags| is searched with a binary search, so its entries must be sorted
// by their first |TagLength - 1| bytes. |subtag| must have exactly that
// length (asserted).
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
    const char (&subtags)[Length][TagLength],
    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  // The subtag characters aren't NUL-terminated, so compare raw bytes.
  auto lessThan = [](const char* lhs, const char* rhs) {
    return memcmp(lhs, rhs, TagLength - 1) < 0;
  };

  const char* needle = subtag.range().begin().get();
  const auto first = std::begin(subtags);
  const auto last = std::end(subtags);
  return std::binary_search(first, last, needle, lessThan);
}
// Look up |subtag| in |subtags| and return the entry of |aliases| at the same
// index, or nullptr if |subtag| isn't present.
//
// |subtags| is searched with a binary search, so its entries must be sorted
// by their first |TagLength - 1| bytes. |subtag| must have exactly that
// length (asserted).
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
    const char (&subtags)[Length][TagLength],
    const char* (&aliases)[Length],
    const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  // The subtag characters aren't NUL-terminated, so compare raw bytes.
  auto lessThan = [](const char* lhs, const char* rhs) {
    return memcmp(lhs, rhs, TagLength - 1) < 0;
  };

  const char* needle = subtag.range().begin().get();
  const auto first = std::begin(subtags);
  const auto last = std::end(subtags);

  // |lower_bound| yields the first entry not less than |needle|; it is only a
  // match if it also compares equal.
  const auto match = std::lower_bound(first, last, needle, lessThan);
  if (match == last || memcmp(*match, needle, TagLength - 1) != 0) {
    return nullptr;
  }
  return aliases[std::distance(first, match)];
}
// Mappings from language subtags to preferred values.
// Derived from CLDR Supplemental Data, version 35.1.
// https://github.com/unicode-org/cldr.git
//
// Replaces |language| with its preferred value and returns true if there is a
// mapping for it. Otherwise leaves |language| unchanged and returns false.
// Only two and three letter language subtags have mappings.
bool js::intl::LanguageTag::languageMapping(LanguageSubtag& language) {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));

  const char* replacement = nullptr;
  const size_t length = language.length();

  if (length == 2) {
    static const char languages[9][3] = {
      "bh", "in", "iw", "ji", "jw", "mo", "no", "tl", "tw",
    };
    static const char* aliases[9] = {
      "bho", "id", "he", "yi", "jv", "ro", "nb", "fil", "ak",
    };

    replacement = SearchReplacement(languages, aliases, language);
  } else if (length == 3) {
    static const char languages[340][4] = {
      "aam", "aar", "abk", "adp", "afr", "aju", "aka", "alb", "als", "amh",
      "ara", "arb", "arg", "arm", "asm", "aue", "ava", "ave", "aym", "ayr",
      "ayx", "aze", "azj", "bak", "bam", "baq", "bcc", "bcl", "bel", "ben",
      "bgm", "bih", "bis", "bjd", "bod", "bos", "bre", "bul", "bur", "bxk",
      "bxr", "cat", "ccq", "ces", "cha", "che", "chi", "chu", "chv", "cjr",
      "cka", "cld", "cmk", "cmn", "cor", "cos", "coy", "cqu", "cre", "cwd",
      "cym", "cze", "dan", "deu", "dgo", "dhd", "dik", "diq", "div", "drh",
      "dut", "dzo", "ekk", "ell", "emk", "eng", "epo", "esk", "est", "eus",
      "ewe", "fao", "fas", "fat", "fij", "fin", "fra", "fre", "fry", "fuc",
      "ful", "gav", "gaz", "gbo", "geo", "ger", "gfx", "ggn", "gla", "gle",
      "glg", "glv", "gno", "gre", "grn", "gti", "gug", "guj", "guv", "gya",
      "hat", "hau", "hdn", "hea", "heb", "her", "him", "hin", "hmo", "hrr",
      "hrv", "hun", "hye", "ibi", "ibo", "ice", "ido", "iii", "ike", "iku",
      "ile", "ilw", "ina", "ind", "ipk", "isl", "ita", "jav", "jeg", "jpn",
      "kal", "kan", "kas", "kat", "kau", "kaz", "kgc", "kgh", "khk", "khm",
      "kik", "kin", "kir", "kmr", "knc", "kng", "knn", "koj", "kom", "kon",
      "kor", "kpv", "krm", "ktr", "kua", "kur", "kvs", "kwq", "kxe", "kzj",
      "kzt", "lao", "lat", "lav", "lbk", "lii", "lim", "lin", "lit", "lmm",
      "ltz", "lub", "lug", "lvs", "mac", "mah", "mal", "mao", "mar", "may",
      "meg", "mhr", "mkd", "mlg", "mlt", "mnk", "mol", "mon", "mri", "msa",
      "mst", "mup", "mwj", "mya", "myt", "nad", "nau", "nav", "nbl", "ncp",
      "nde", "ndo", "nep", "nld", "nno", "nnx", "nob", "nor", "npi", "nts",
      "nya", "oci", "ojg", "oji", "ori", "orm", "ory", "oss", "oun", "pan",
      "pbu", "pcr", "per", "pes", "pli", "plt", "pmc", "pmu", "pnb", "pol",
      "por", "ppa", "ppr", "pry", "pus", "puz", "que", "quz", "rmy", "roh",
      "ron", "rum", "run", "rus", "sag", "san", "sca", "scc", "scr", "sin",
      "skk", "slk", "slo", "slv", "sme", "smo", "sna", "snd", "som", "sot",
      "spa", "spy", "sqi", "src", "srd", "srp", "ssw", "sun", "swa", "swe",
      "swh", "tah", "tam", "tat", "tdu", "tel", "tgk", "tgl", "tha", "thc",
      "thx", "tib", "tie", "tir", "tkk", "tlw", "tmp", "tne", "ton", "tsf",
      "tsn", "tso", "ttq", "tuk", "tur", "twi", "uig", "ukr", "umu", "uok",
      "urd", "uzb", "uzn", "ven", "vie", "vol", "wel", "wln", "wol", "xba",
      "xho", "xia", "xkh", "xpe", "xsj", "xsl", "ybd", "ydd", "yid", "yma",
      "ymt", "yor", "yos", "yuu", "zai", "zha", "zho", "zsm", "zul", "zyb",
    };
    static const char* aliases[340] = {
      "aas", "aa", "ab", "dz", "af", "jrb", "ak", "sq", "sq", "am",
      "ar", "ar", "an", "hy", "as", "ktz", "av", "ae", "ay", "ay",
      "nun", "az", "az", "ba", "bm", "eu", "bal", "bik", "be", "bn",
      "bcg", "bho", "bi", "drl", "bo", "bs", "br", "bg", "my", "luy",
      "bua", "ca", "rki", "cs", "ch", "ce", "zh", "cu", "cv", "mom",
      "cmr", "syr", "xch", "zh", "kw", "co", "pij", "quh", "cr", "cr",
      "cy", "cs", "da", "de", "doi", "mwr", "din", "zza", "dv", "mn",
      "nl", "dz", "et", "el", "man", "en", "eo", "ik", "et", "eu",
      "ee", "fo", "fa", "ak", "fj", "fi", "fr", "fr", "fy", "ff",
      "ff", "dev", "om", "grb", "ka", "de", "vaj", "gvr", "gd", "ga",
      "gl", "gv", "gon", "el", "gn", "nyc", "gn", "gu", "duz", "gba",
      "ht", "ha", "hai", "hmn", "he", "hz", "srx", "hi", "ho", "jal",
      "hr", "hu", "hy", "opa", "ig", "is", "io", "ii", "iu", "iu",
      "ie", "gal", "ia", "id", "ik", "is", "it", "jv", "oyb", "ja",
      "kl", "kn", "ks", "ka", "kr", "kk", "tdf", "kml", "mn", "km",
      "ki", "rw", "ky", "ku", "kr", "kg", "kok", "kwv", "kv", "kg",
      "ko", "kv", "bmf", "dtp", "kj", "ku", "gdj", "yam", "tvd", "dtp",
      "dtp", "lo", "la", "lv", "bnc", "raq", "li", "ln", "lt", "rmx",
      "lb", "lu", "lg", "lv", "mk", "mh", "ml", "mi", "mr", "ms",
      "cir", "chm", "mk", "mg", "mt", "man", "ro", "mn", "mi", "ms",
      "mry", "raj", "vaj", "my", "mry", "xny", "na", "nv", "nr", "kdz",
      "nd", "ng", "ne", "nl", "nn", "ngv", "nb", "nb", "ne", "pij",
      "ny", "oc", "oj", "oj", "or", "om", "or", "os", "vaj", "pa",
      "ps", "adx", "fa", "fa", "pi", "mg", "huw", "phr", "lah", "pl",
      "pt", "bfy", "lcq", "prt", "ps", "pub", "qu", "qu", "rom", "rm",
      "ro", "ro", "rn", "ru", "sg", "sa", "hle", "sr", "hr", "si",
      "oyb", "sk", "sk", "sl", "se", "sm", "sn", "sd", "so", "st",
      "es", "kln", "sq", "sc", "sc", "sr", "ss", "su", "sw", "sv",
      "sw", "ty", "ta", "tt", "dtp", "te", "tg", "fil", "th", "tpo",
      "oyb", "bo", "ras", "ti", "twm", "weo", "tyj", "kak", "to", "taj",
      "tn", "ts", "tmh", "tk", "tr", "ak", "ug", "uk", "del", "ema",
      "ur", "uz", "uz", "ve", "vi", "vo", "cy", "wa", "wo", "cax",
      "xh", "acn", "waw", "kpe", "suj", "den", "rki", "yi", "yi", "lrr",
      "mtm", "yo", "zom", "yug", "zap", "za", "zh", "ms", "zu", "za",
    };

    replacement = SearchReplacement(languages, aliases, language);
  }

  // No mapping exists for this subtag (or for subtags of this length).
  if (!replacement) {
    return false;
  }

  language.set(ConstCharRange(replacement, strlen(replacement)));
  return true;
}
// Language subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 35.1.
// https://github.com/unicode-org/cldr.git
//
// Returns true iff |language| is one of the subtags listed below. Only two
// and three letter language subtags are listed.
bool js::intl::LanguageTag::complexLanguageMapping(const LanguageSubtag& language) {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(language.range()));

  switch (language.length()) {
    case 2:
      return language.equalTo("sh");

    case 3: {
      static const char languages[6][4] = {
        "cnr", "drw", "hbs", "prs", "swc", "tnf",
      };
      return HasReplacement(languages, language);
    }

    default:
      return false;
  }
}
// Mappings from region subtags to preferred values.
// Derived from CLDR Supplemental Data, version 35.1.
// https://github.com/unicode-org/cldr.git
bool js::intl::LanguageTag::regionMapping(RegionSubtag& region) {
MOZ_ASSERT(IsStructurallyValidRegionTag(region.range()));
if (region.length() == 2) {
static const char regions[23][3] = {
"BU", "CS", "CT", "DD", "DY", "FQ", "FX", "HV", "JT", "MI",
"NH", "NQ", "PU", "PZ", "QU", "RH", "TP", "UK", "VD", "WK",
"YD", "YU", "ZR",
};
static const char* aliases[23] = {
"MM", "RS", "KI", "DE", "BJ", "AQ", "FR", "BF", "UM", "UM",
"VU", "AQ", "UM", "PA", "EU", "ZW", "TL", "GB", "VN", "UM",
"YE", "RS", "CD",
};
if (const char* replacement = SearchReplacement(regions, aliases, region)) {
region.set(ConstCharRange(replacement, strlen(replacement)));
return true;
}
return false;
}
{
static const char regions[300][4] = {
"004", "008", "010", "012", "016", "020", "024", "028", "031", "032",
"036", "040", "044", "048", "050", "051", "052", "056", "060", "062",
"064", "068", "070", "072", "074", "076", "084", "086", "090", "092",
"096", "100", "104", "108", "112", "116", "120", "124", "132", "136",
"140", "144", "148", "152", "156", "158", "162", "166", "170", "174",
"175", "178", "180", "184", "188", "191", "192", "196", "203", "204",
"208", "212", "214", "218", "222", "226", "230", "231", "232", "233",
"234", "238", "239", "242", "246", "248", "249", "250", "254", "258",
"260", "262", "266", "268", "270", "275", "276", "278", "280", "288",
"292", "296", "300", "304", "308", "312", "316", "320", "324", "328",
"332", "334", "336", "340", "344", "348", "352", "356", "360", "364",
"368", "372", "376", "380", "384", "388", "392", "398", "400", "404",
"408", "410", "414", "417", "418", "422", "426", "428", "430", "434",
"438", "440", "442", "446", "450", "454", "458", "462", "466", "470",
"474", "478", "480", "484", "492", "496", "498", "499", "500", "504",
"508", "512", "516", "520", "524", "528", "531", "533", "534", "535",
"540", "548", "554", "558", "562", "566", "570", "574", "578", "580",
"581", "583", "584", "585", "586", "591", "598", "600", "604", "608",
"612", "616", "620", "624", "626", "630", "634", "638", "642", "643",
"646", "652", "654", "659", "660", "662", "663", "666", "670", "674",
"678", "682", "686", "688", "690", "694", "702", "703", "704", "705",
"706", "710", "716", "720", "724", "728", "729", "732", "736", "740",
"744", "748", "752", "756", "760", "762", "764", "768", "772", "776",
"780", "784", "788", "792", "795", "796", "798", "800", "804", "807",
"818", "826", "830", "831", "832", "833", "834", "840", "850", "854",
"858", "860", "862", "876", "882", "886", "887", "891", "894", "958",
"959", "960", "962", "963", "964", "965", "966", "967", "968", "969",
"970", "971", "972", "973", "974", "975", "976", "977", "978", "979",
"980", "981", "982", "983", "984", "985", "986", "987", "988", "989",
"990", "991", "992", "993", "994", "995", "996", "997", "998", "999",
};
static const char* aliases[300] = {
"AF", "AL", "AQ", "DZ", "AS", "AD", "AO", "AG", "AZ", "AR",
"AU", "AT", "BS", "BH", "BD", "AM", "BB", "BE", "BM", "034",
"BT", "BO", "BA", "BW", "BV", "BR", "BZ", "IO", "SB", "VG",
"BN", "BG", "MM", "BI", "BY", "KH", "CM", "CA", "CV", "KY",
"CF", "LK", "TD", "CL", "CN", "TW", "CX", "CC", "CO", "KM",
"YT", "CG", "CD", "CK", "CR", "HR", "CU", "CY", "CZ", "BJ",
"DK", "DM", "DO", "EC", "SV", "GQ", "ET", "ET", "ER", "EE",
"FO", "FK", "GS", "FJ", "FI", "AX", "FR", "FR", "GF", "PF",
"TF", "DJ", "GA", "GE", "GM", "PS", "DE", "DE", "DE", "GH",
"GI", "KI", "GR", "GL", "GD", "GP", "GU", "GT", "GN", "GY",
"HT", "HM", "VA", "HN", "HK", "HU", "IS", "IN", "ID", "IR",
"IQ", "IE", "IL", "IT", "CI", "JM", "JP", "KZ", "JO", "KE",
"KP", "KR", "KW", "KG", "LA", "LB", "LS", "LV", "LR", "LY",
"LI", "LT", "LU", "MO", "MG", "MW", "MY", "MV", "ML", "MT",
"MQ", "MR", "MU", "MX", "MC", "MN", "MD", "ME", "MS", "MA",
"MZ", "OM", "NA", "NR", "NP", "NL", "CW", "AW", "SX", "BQ",
"NC", "VU", "NZ", "NI", "NE", "NG", "NU", "NF", "NO", "MP",
"UM", "FM", "MH", "PW", "PK", "PA", "PG", "PY", "PE", "PH",
"PN", "PL", "PT", "GW", "TL", "PR", "QA", "RE", "RO", "RU",
"RW", "BL", "SH", "KN", "AI", "LC", "MF", "PM", "VC", "SM",
"ST", "SA", "SN", "RS", "SC", "SL", "SG", "SK", "VN", "SI",
"SO", "ZA", "ZW", "YE", "ES", "SS", "SD", "EH", "SD", "SR",
"SJ", "SZ", "SE", "CH", "SY", "TJ", "TH", "TG", "TK", "TO",
"TT", "AE", "TN", "TR", "TM", "TC", "TV", "UG", "UA", "MK",
"EG", "GB", "JE", "GG", "JE", "IM", "TZ", "US", "VI", "BF",
"UY", "UZ", "VE", "WF", "WS", "YE", "YE", "RS", "ZM", "AA",
"QM", "QN", "QP", "QQ", "QR", "QS", "QT", "EU", "QV", "QW",
"QX", "QY", "QZ", "XA", "XB", "XC", "XD", "XE", "XF", "XG",
"XH", "XI", "XJ", "XK", "XL", "XM", "XN", "XO", "XP", "XQ",
"XR", "XS", "XT", "XU", "XV", "XW", "XX", "XY", "XZ", "ZZ",
};
if (const char* replacement = SearchReplacement(regions, aliases, region)) {
region.set(ConstCharRange(replacement, strlen(replacement)));
return true;
}
return false;
}
}
// Region subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 35.1.
// https://github.com/unicode-org/cldr.git
bool js::intl::LanguageTag::complexRegionMapping(const RegionSubtag& region) {
  MOZ_ASSERT(IsStructurallyValidRegionTag(region.range()));

  // Every code that isn't two characters long is looked up in the table of
  // three-character codes. |HasReplacement| asserts that the subtag length
  // matches the table's entry length.
  if (region.length() != 2) {
    static const char regions[8][4] = {
        "172", "200", "530", "532", "536", "582", "810", "890",
    };
    return HasReplacement(regions, region);
  }

  // Only a handful of two-character codes have complex mappings, so they are
  // compared one by one instead of being searched in a table.
  if (region.equalTo("AN") || region.equalTo("NT")) {
    return true;
  }
  return region.equalTo("PC") || region.equalTo("SU");
}
// Language subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 35.1.
// https://github.com/unicode-org/cldr.git
void js::intl::LanguageTag::performComplexLanguageMappings() {
MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
if (language().equalTo("cnr")) {
setLanguage("sr");
if (region().length() == 0) {
setRegion("ME");
}
}
else if (language().equalTo("drw") ||
language().equalTo("prs") ||
language().equalTo("tnf")) {
setLanguage("fa");
if (region().length() == 0) {
setRegion("AF");
}
}
else if (language().equalTo("hbs") ||
language().equalTo("sh")) {
setLanguage("sr");
if (script().length() == 0) {
setScript("Latn");
}
}
else if (language().equalTo("swc")) {
setLanguage("sw");
if (region().length() == 0) {
setRegion("CD");
}
}
}
// Region subtags with complex mappings.
// Derived from CLDR Supplemental Data, version 35.1.
// https://github.com/unicode-org/cldr.git
void js::intl::LanguageTag::performComplexRegionMappings() {
MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
MOZ_ASSERT(IsStructurallyValidRegionTag(region().range()));
if (region().equalTo("172")) {
if (language().equalTo("hy") ||
(language().equalTo("und") && script().equalTo("Armn"))) {
setRegion("AM");
}
else if (language().equalTo("az") ||
language().equalTo("tkr") ||
language().equalTo("tly") ||
language().equalTo("ttt")) {
setRegion("AZ");
}
else if (language().equalTo("be")) {
setRegion("BY");
}
else if (language().equalTo("ab") ||
language().equalTo("ka") ||
language().equalTo("os") ||
(language().equalTo("und") && script().equalTo("Geor")) ||
language().equalTo("xmf")) {
setRegion("GE");
}
else if (language().equalTo("ky")) {
setRegion("KG");
}
else if (language().equalTo("kk") ||
(language().equalTo("ug") && script().equalTo("Cyrl"))) {
setRegion("KZ");
}
else if (language().equalTo("gag")) {
setRegion("MD");
}
else if (language().equalTo("tg")) {
setRegion("TJ");
}
else if (language().equalTo("tk")) {
setRegion("TM");
}
else if (language().equalTo("crh") ||
language().equalTo("got") ||
language().equalTo("ji") ||
language().equalTo("rue") ||
language().equalTo("uk") ||
(language().equalTo("und") && script().equalTo("Goth"))) {
setRegion("UA");
}
else if (language().equalTo("kaa") ||
language().equalTo("sog") ||
(language().equalTo("und") && script().equalTo("Sogd")) ||
(language().equalTo("und") && script().equalTo("Sogo")) ||
language().equalTo("uz")) {
setRegion("UZ");
}
else {
setRegion("RU");
}
}
else if (region().equalTo("200")) {
if (language().equalTo("sk")) {
setRegion("SK");
}
else {
setRegion("CZ");
}
}
else if (region().equalTo("530") ||
region().equalTo("532") ||
region().equalTo("AN")) {
if (language().equalTo("vic")) {
setRegion("SX");
}
else {
setRegion("CW");
}
}
else if (region().equalTo("536") ||
region().equalTo("NT")) {
if (language().equalTo("akk") ||
language().equalTo("ckb") ||
(language().equalTo("ku") && script().equalTo("Arab")) ||
language().equalTo("mis") ||
language().equalTo("syr") ||
(language().equalTo("und") && script().equalTo("Xsux")) ||
(language().equalTo("und") && script().equalTo("Hatr")) ||
(language().equalTo("und") && script().equalTo("Syrc"))) {
setRegion("IQ");
}
else {
setRegion("SA");
}
}
else if (region().equalTo("582") ||
region().equalTo("PC")) {
if (language().equalTo("mh")) {
setRegion("MH");
}
else if (language().equalTo("pau")) {
setRegion("PW");
}
else {
setRegion("FM");
}
}
else if (region().equalTo("810") ||
region().equalTo("SU")) {
if (language().equalTo("hy") ||
(language().equalTo("und") && script().equalTo("Armn"))) {
setRegion("AM");
}
else if (language().equalTo("az") ||
language().equalTo("tkr") ||
language().equalTo("tly") ||
language().equalTo("ttt")) {
setRegion("AZ");
}
else if (language().equalTo("be")) {
setRegion("BY");
}
else if (language().equalTo("et") ||
language().equalTo("vro")) {
setRegion("EE");
}
else if (language().equalTo("ab") ||
language().equalTo("ka") ||
language().equalTo("os") ||
(language().equalTo("und") && script().equalTo("Geor")) ||
language().equalTo("xmf")) {
setRegion("GE");
}
else if (language().equalTo("ky")) {
setRegion("KG");
}
else if (language().equalTo("kk") ||
(language().equalTo("ug") && script().equalTo("Cyrl"))) {
setRegion("KZ");
}
else if (language().equalTo("lt") ||
language().equalTo("sgs")) {
setRegion("LT");
}
else if (language().equalTo("ltg") ||
language().equalTo("lv")) {
setRegion("LV");
}
else if (language().equalTo("gag")) {
setRegion("MD");
}
else if (language().equalTo("tg")) {
setRegion("TJ");
}
else if (language().equalTo("tk")) {
setRegion("TM");
}
else if (language().equalTo("crh") ||
language().equalTo("got") ||
language().equalTo("ji") ||
language().equalTo("rue") ||
language().equalTo("uk") ||
(language().equalTo("und") && script().equalTo("Goth"))) {
setRegion("UA");
}
else if (language().equalTo("kaa") ||
language().equalTo("sog") ||
(language().equalTo("und") && script().equalTo("Sogd")) ||
(language().equalTo("und") && script().equalTo("Sogo")) ||
language().equalTo("uz")) {
setRegion("UZ");
}
else {
setRegion("RU");
}
}
else if (region().equalTo("890")) {
if (language().equalTo("bs")) {
setRegion("BA");
}
else if (language().equalTo("hr")) {
setRegion("HR");
}
else if (language().equalTo("mk")) {
setRegion("MK");
}
else if (language().equalTo("sl")) {
setRegion("SI");
}
else {
setRegion("RS");
}
}
}
// Canonicalize grandfathered locale identifiers.
// Derived from CLDR Supplemental Data, version 35.1.
// https://github.com/unicode-org/cldr.git
bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
  // Regular grandfathered (RG) tags are rewritten to their modern form here;
  // every other tag is left untouched.
  //
  //   regular = "art-lojban"
  //           / "cel-gaulish"
  //           / "no-bok"
  //           / "no-nyn"
  //           / "zh-guoyu"
  //           / "zh-hakka"
  //           / "zh-min"
  //           / "zh-min-nan"
  //           / "zh-xiang"
  //
  // Most tags can be rejected cheaply, because a parsed |unicode_locale_id|
  // can only be an RG tag when all of the following hold:
  //
  // * It has no script and no region subtag.
  // * It has exactly one variant subtag. Only art-lojban, cel-gaulish,
  //   zh-guoyu, zh-hakka, and zh-xiang match |unicode_locale_id| at all; the
  //   remaining RG tags need BCP47's extlang subtag, which
  //   |unicode_locale_id| doesn't support.
  // * It has neither |extensions| nor |pu_extensions|.
  //
  // The language length isn't tested: real-world language subtags are two or
  // three letters long, so a |length > 3| fast path wouldn't pay off.
  const bool mayBeRegularGrandfathered =
      script().length() == 0 && region().length() == 0 &&
      variants().length() == 1 && extensions().length() == 0 && !privateuse();
  if (!mayBeRegularGrandfathered) {
    return true;
  }

  // The single variant subtag. It is only read before |clearVariants()| runs.
  const char* onlyVariant = variants()[0].get();
  auto variantIs = [onlyVariant](const char* expected) {
    return strcmp(onlyVariant, expected) == 0;
  };

  // art-lojban -> jbo
  if (language().equalTo("art") && variantIs("lojban")) {
    setLanguage("jbo");
    clearVariants();
    return true;
  }

  // cel-gaulish -> xtg-x-cel-gaulish
  if (language().equalTo("cel") && variantIs("gaulish")) {
    setLanguage("xtg");
    clearVariants();

    // Copying the private-use string can fail; report that to the caller.
    auto duplicated = DuplicateString(cx, "x-cel-gaulish");
    if (!duplicated) {
      return false;
    }
    setPrivateuse(std::move(duplicated));
    return true;
  }

  if (language().equalTo("zh")) {
    // zh-guoyu -> zh
    if (variantIs("guoyu")) {
      setLanguage("zh");
      clearVariants();
      return true;
    }

    // zh-hakka -> hak
    if (variantIs("hakka")) {
      setLanguage("hak");
      clearVariants();
      return true;
    }

    // zh-xiang -> hsn
    if (variantIs("xiang")) {
      setLanguage("hsn");
      clearVariants();
      return true;
    }
  }

  // Not a regular grandfathered tag: nothing to do.
  return true;
}

Просмотреть файл

@ -0,0 +1,188 @@
// Generated by make_intl_data.py. DO NOT EDIT.
// Version: CLDR-35.1
// URL: https://unicode.org/Public/cldr/35.1/core.zip
#include "mozilla/Assertions.h"
#include "mozilla/Range.h"
#include "mozilla/TextUtils.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include "builtin/intl/LanguageTag.h"
using namespace js::intl::LanguageTagLimits;
using ConstCharRange = mozilla::Range<const char>;
/**
 * Return true if |key| equals the string literal |str|. Only literals of
 * exactly |UnicodeKeyLength| characters are accepted at compile time.
 */
template <size_t Length>
static inline bool IsUnicodeKey(const ConstCharRange& key,
                                const char (&str)[Length]) {
  static_assert(Length == UnicodeKeyLength + 1,
                "Unicode extension key is two characters long");

  // |Length| includes the literal's null-terminator, which isn't compared.
  constexpr size_t CharsToCompare = Length - 1;
  const char* keyChars = key.begin().get();
  return memcmp(keyChars, str, CharsToCompare) == 0;
}
/**
 * Return true if |type| equals the string literal |str|. The literal must be
 * longer than |UnicodeKeyLength| characters, which is checked at compile time.
 */
template <size_t Length>
static inline bool IsUnicodeType(const ConstCharRange& type,
                                 const char (&str)[Length]) {
  static_assert(Length > UnicodeKeyLength + 1,
                "Unicode extension type contains more than two characters");

  // |Length| includes the literal's null-terminator, which isn't compared.
  if (type.length() != Length - 1) {
    return false;
  }
  return memcmp(type.begin().get(), str, Length - 1) == 0;
}
/**
 * Three-way comparison of the null-terminated string |a| with the character
 * range |b|, comparing characters as unsigned values.
 *
 * Returns a negative number if |a| sorts before |b|, zero if both are equal,
 * and a positive number if |a| sorts after |b|. |b| must not contain any
 * null-characters.
 *
 * NOTE: This file is generated by make_intl_data.py; the template in the
 * generator has to carry the same code, otherwise regenerating the file
 * reverts any change made here.
 */
static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) {
#ifdef DEBUG
  auto isNull = [](char c) {
    return c == '\0';
  };
#endif

  MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull),
             "unexpected null-character in string");

  using UnsignedChar = unsigned char;
  for (size_t i = 0; i < b.length(); i++) {
    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
    // we've reached the end of |a|, the below if-statement will always be true.
    // That ensures we don't read past the end of |a|.
    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
      return r;
    }
  }

  // All of |b| matched the start of |a|. Return zero if both strings are equal
  // (|a| ends here, too) or a positive number if |b| is a proper prefix of
  // |a|: the longer string |a| sorts after |b|. Returning a negative number
  // in that case would tell |std::lower_bound| that |a| sorts before |b| and
  // thereby break the ordering the binary search depends on.
  return int32_t(UnsignedChar(a[b.length()]));
}
/**
 * Binary search for |type| in |types| and return the entry of |aliases| at
 * the same index, or nullptr if |type| isn't contained in |types|.
 */
template <size_t Length>
static inline const char* SearchReplacement(const char* (&types)[Length],
                                            const char* (&aliases)[Length],
                                            const ConstCharRange& type) {
  const auto firstType = std::begin(types);
  const auto lastType = std::end(types);

  const auto sortsBefore = [](const auto& candidate, const auto& wanted) {
    return CompareUnicodeType(candidate, wanted) < 0;
  };

  // |lower_bound| yields the first entry which doesn't sort before |type|;
  // that entry still has to be checked for an exact match.
  const auto found = std::lower_bound(firstType, lastType, type, sortsBefore);
  if (found == lastType || CompareUnicodeType(*found, type) != 0) {
    return nullptr;
  }
  return aliases[std::distance(firstType, found)];
}
/**
* Mapping from deprecated BCP 47 Unicode extension types to their preferred
* values.
*
* Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
*/
const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
    const ConstCharRange& key, const ConstCharRange& type) {
#ifdef DEBUG
  static auto isAsciiLowercaseAlphanumeric = [](char c) {
    return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
  };

  static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
    return isAsciiLowercaseAlphanumeric(c) || c == '-';
  };
#endif

  // The caller has to pass a lower-case key of |UnicodeKeyLength| characters
  // and a lower-case type which is longer than that.
  MOZ_ASSERT(key.length() == UnicodeKeyLength);
  MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(),
                         isAsciiLowercaseAlphanumeric));

  MOZ_ASSERT(type.length() > UnicodeKeyLength);
  MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(),
                         isAsciiLowercaseAlphanumericOrDash));

  // Only one key can match, so every key group below returns its own result:
  // the preferred type, or nullptr when |type| has no replacement.

  if (IsUnicodeKey(key, "ca")) {
    if (IsUnicodeType(type, "ethiopic-amete-alem")) {
      return "ethioaa";
    }
    if (IsUnicodeType(type, "islamicc")) {
      return "islamic-civil";
    }
    return nullptr;
  }

  if (IsUnicodeKey(key, "kb") || IsUnicodeKey(key, "kc") ||
      IsUnicodeKey(key, "kh") || IsUnicodeKey(key, "kk") ||
      IsUnicodeKey(key, "kn")) {
    if (IsUnicodeType(type, "yes")) {
      return "true";
    }
    return nullptr;
  }

  if (IsUnicodeKey(key, "ks")) {
    if (IsUnicodeType(type, "primary")) {
      return "level1";
    }
    if (IsUnicodeType(type, "tertiary")) {
      return "level3";
    }
    return nullptr;
  }

  if (IsUnicodeKey(key, "ms")) {
    if (IsUnicodeType(type, "imperial")) {
      return "uksystem";
    }
    return nullptr;
  }

  if (IsUnicodeKey(key, "rg") || IsUnicodeKey(key, "sd")) {
    // |types| is searched with a binary search and therefore must stay
    // sorted; |aliases| holds the replacement for the entry at the same index.
    static const char* types[116] = {
        "cn11", "cn12", "cn13", "cn14", "cn15", "cn21", "cn22", "cn23",
        "cn31", "cn32", "cn33", "cn34", "cn35", "cn36", "cn37", "cn41",
        "cn42", "cn43", "cn44", "cn45", "cn46", "cn50", "cn51", "cn52",
        "cn53", "cn54", "cn61", "cn62", "cn63", "cn64", "cn65", "cz10a",
        "cz10b", "cz10c", "cz10d", "cz10e", "cz10f", "cz611", "cz612", "cz613",
        "cz614", "cz615", "cz621", "cz622", "cz623", "cz624", "cz626", "cz627",
        "czjc", "czjm", "czka", "czkr", "czli", "czmo", "czol", "czpa",
        "czpl", "czpr", "czst", "czus", "czvy", "czzl", "fra", "frb",
        "frc", "frd", "fre", "frf", "frg", "frh", "fri", "frj",
        "frk", "frl", "frm", "frn", "fro", "frp", "frq", "frr",
        "frs", "frt", "fru", "frv", "laxn", "lud", "lug", "lul",
        "mrnkc", "nzn", "nzs", "omba", "omsh", "plds", "plkp", "pllb",
        "plld", "pllu", "plma", "plmz", "plop", "plpd", "plpk", "plpm",
        "plsk", "plsl", "plwn", "plwp", "plzp", "tteto", "ttrcm", "ttwto",
        "twkhq", "twtnq", "twtpq", "twtxq",
    };
    static const char* aliases[116] = {
        "cnbj", "cntj", "cnhe", "cnsx", "cnmn", "cnln", "cnjl", "cnhl",
        "cnsh", "cnjs", "cnzj", "cnah", "cnfj", "cnjx", "cnsd", "cnha",
        "cnhb", "cnhn", "cngd", "cngx", "cnhi", "cncq", "cnsc", "cngz",
        "cnyn", "cnxz", "cnsn", "cngs", "cnqh", "cnnx", "cnxj", "cz110",
        "cz111", "cz112", "cz113", "cz114", "cz115", "cz663", "cz632", "cz633",
        "cz634", "cz635", "cz641", "cz642", "cz643", "cz644", "cz646", "cz647",
        "cz31", "cz64", "cz41", "cz52", "cz51", "cz80", "cz71", "cz53",
        "cz32", "cz10", "cz20", "cz42", "cz63", "cz72", "frges", "frnaq",
        "frara", "frbfc", "frbre", "frcvl", "frges", "frcor", "frbfc", "fridf",
        "frocc", "frnaq", "frges", "frocc", "frhdf", "frnor", "frnor", "frpdl",
        "frhdf", "frnaq", "frpac", "frara", "laxs", "lucl", "luec", "luca",
        "mr13", "nzauk", "nzcan", "ombj", "omsj", "pl02", "pl04", "pl08",
        "pl10", "pl06", "pl12", "pl14", "pl16", "pl20", "pl18", "pl22",
        "pl26", "pl24", "pl28", "pl30", "pl32", "tttob", "ttmrc", "tttob",
        "twkhh", "twtnn", "twnwt", "twtxg",
    };
    return SearchReplacement(types, aliases, type);
  }

  if (IsUnicodeKey(key, "tz")) {
    // Same layout as above: sorted |types| with index-aligned |aliases|.
    static const char* types[28] = {
        "aqams", "cnckg", "cnhrb", "cnkhg", "cuba", "egypt",
        "eire", "est", "gmt0", "hongkong", "hst", "iceland",
        "iran", "israel", "jamaica", "japan", "libya", "mst",
        "navajo", "poland", "portugal", "prc", "roc", "rok",
        "turkey", "uct", "usnavajo", "zulu",
    };
    static const char* aliases[28] = {
        "nzakl", "cnsha", "cnsha", "cnurc", "cuhav", "egcai",
        "iedub", "utcw05", "gmt", "hkhkg", "utcw10", "isrey",
        "irthr", "jeruslm", "jmkin", "jptyo", "lytip", "utcw07",
        "usden", "plwaw", "ptlis", "cnsha", "twtpe", "krsel",
        "trist", "utc", "usden", "utc",
    };
    return SearchReplacement(types, aliases, type);
  }

  // No replacements are known for any other key.
  return nullptr;
}

Просмотреть файл

@ -50,15 +50,24 @@ from operator import attrgetter, itemgetter
from zipfile import ZipFile
if sys.version_info.major == 2:
from itertools import ifilter as filter, ifilterfalse as filterfalse, imap as map
from itertools import ifilter as filter, ifilterfalse as filterfalse, imap as map,\
izip_longest as zip_longest
from urllib2 import urlopen, Request as UrlRequest
from urlparse import urlsplit, urlunsplit
else:
from itertools import filterfalse
from itertools import filterfalse, zip_longest
from urllib.request import urlopen, Request as UrlRequest
from urllib.parse import urlsplit, urlunsplit
# From https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    """Split |iterable| into consecutive tuples of |n| items each.

    The last tuple is padded with |fillvalue| when the items run out, e.g.
    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # Passing the *same* iterator |n| times makes zip_longest pull |n|
    # successive items for every tuple it produces.
    shared_iterator = iter(iterable)
    return zip_longest(*((shared_iterator,) * n), fillvalue=fillvalue)
def writeMappingHeader(println, description, source, url):
if type(description) is not list:
description = [description]
@ -383,6 +392,419 @@ function updateGrandfatheredMappings(tag) {
}""".lstrip("\n"))
def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, mappings,
tag_maxlength, description, source, url):
""" Emit a C++ |LanguageTag| member function which looks up a subtag.

|mappings| is either a dictionary (subtag -> replacement) or a plain
collection of subtags. For a dictionary, the emitted function replaces the
subtag in place and returns true when an entry was found. Otherwise the
emitted function only reports whether the subtag is part of the collection.

println -- function used to write one chunk of output
fn_name -- name of the emitted member function
type_name -- C++ type of the emitted function's single parameter
name -- name of that parameter
validate_fn -- name of the C++ function asserted on the parameter on entry
tag_maxlength -- subtag length for which no length check is emitted
description, source, url -- passed on to |writeMappingHeader|
"""
println(u"")
writeMappingHeader(println, description, source, url)
println(u"""
bool js::intl::LanguageTag::{0}({1} {2}) {{
MOZ_ASSERT({3}({2}.range()));
""".format(fn_name, type_name, name, validate_fn).strip())
# Emit a static C++ array named |name| holding |subtags|. |fixed| selects a
# two-dimensional char array (entries of |length| characters plus the
# null-terminator) over an array of char pointers.
def write_array(subtags, name, length, fixed):
if fixed:
println(u" static const char {}[{}][{}] = {{".format(name, len(subtags),
length + 1))
else:
println(u" static const char* {}[{}] = {{".format(name, len(subtags)))
# Emit ten entries per line to not exceed the 80 column line limit.
for entries in grouper(subtags, 10):
# |grouper| pads the last group with None; the padding is skipped here.
entries = (u"\"{}\"".format(tag).rjust(length + 2)
for tag in entries if tag is not None)
println(u" {},".format(u", ".join(entries)))
println(u" };")
# Cleared below once the block for |tag_maxlength| was emitted. That block
# has no length check and returns on every path, so a trailing
# |return false| after it isn't needed.
trailing_return = True
# Sort the subtags by length. That enables using an optimized comparator
# for the binary search, which only performs a single |memcmp| for multiple
# of two subtag lengths.
mappings_keys = mappings.keys() if type(mappings) == dict else mappings
for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
# Omit the length check if the current length is the maximum length.
if length != tag_maxlength:
println(u"""
if ({}.length() == {}) {{
""".format(name, length).rstrip("\n"))
else:
trailing_return = False
println(u"""
{
""".rstrip("\n"))
# The subtags need to be sorted for binary search to work.
subtags = sorted(subtags)
# C++ expression comparing the parameter against a single subtag.
def equals(subtag):
return u"""{}.equalTo("{}")""".format(name, subtag)
# Don't emit a binary search for short lists.
if len(subtags) == 1:
# A single subtag: one comparison.
if type(mappings) == dict:
println(u"""
if ({}) {{
{}.set("{}");
return true;
}}
return false;
""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n"))
else:
println(u"""
return {};
""".format(equals(subtags[0])).strip("\n"))
elif len(subtags) <= 4:
# Up to four subtags: a sequence of comparisons.
if type(mappings) == dict:
for subtag in subtags:
println(u"""
if ({}) {{
{}.set("{}");
return true;
}}
""".format(equals(subtag), name, mappings[subtag]).strip("\n"))
println(u"""
return false;
""".strip("\n"))
else:
cond = (equals(subtag) for subtag in subtags)
# Continuation lines are padded to line up behind "return ".
cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond)
println(u"""
return {};
""".format(cond).strip("\n"))
else:
# Longer lists: emit the sorted subtags as an array and search it.
write_array(subtags, name + "s", length, True)
if type(mappings) == dict:
# The replacements are emitted in the same order as |subtags|, so that
# both arrays can be addressed with the same index.
write_array([mappings[k] for k in subtags], u"aliases", length, False)
println(u"""
if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
{0}.set(ConstCharRange(replacement, strlen(replacement)));
return true;
}}
return false;
""".format(name).rstrip())
else:
println(u"""
return HasReplacement({0}s, {0});
""".format(name).rstrip())
println(u"""
}
""".strip("\n"))
if trailing_return:
println(u"""
return false;""")
println(u"""
}""".lstrip("\n"))
def writeComplexLanguageTagMappingsNative(println, complex_language_mappings,
description, source, url):
""" Emit the C++ function |LanguageTag::performComplexLanguageMappings|.

|complex_language_mappings| maps a deprecated language subtag to a
(language, script, region) triple, where script and region may be None.
The emitted code always replaces the language subtag, but sets the script
resp. region only if the tag doesn't have one already.

println -- function used to write one chunk of output
description, source, url -- passed on to |writeMappingHeader|
"""
println(u"")
writeMappingHeader(println, description, source, url)
println(u"""
void js::intl::LanguageTag::performComplexLanguageMappings() {
MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
""".lstrip())
# Merge duplicate language entries.
#
# For every replacement triple, the first deprecated language (in sorted
# order) creates an empty list; each later deprecated language with the same
# replacement is appended to that list.
language_aliases = {}
for (deprecated_language, (language, script, region)) in (
sorted(complex_language_mappings.items(), key=itemgetter(0))
):
key = (language, script, region)
if key not in language_aliases:
language_aliases[key] = []
else:
language_aliases[key].append(deprecated_language)
first_language = True
for (deprecated_language, (language, script, region)) in (
sorted(complex_language_mappings.items(), key=itemgetter(0))
):
key = (language, script, region)
# Languages recorded as aliases are emitted together with the first
# language that shares their replacement.
if deprecated_language in language_aliases[key]:
continue
if_kind = u"if" if first_language else u"else if"
first_language = False
cond = (u"language().equalTo(\"{}\")".format(lang)
for lang in [deprecated_language] + language_aliases[key])
# Continuation lines are padded to line up behind "if (" resp. "else if (".
cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
println(u"""
{} ({}) {{""".format(if_kind, cond).strip("\n"))
println(u"""
setLanguage("{}");""".format(language).strip("\n"))
# Script and region are only defaults: the emitted code sets them when the
# tag has none.
if script is not None:
println(u"""
if (script().length() == 0) {{
setScript("{}");
}}""".format(script).strip("\n"))
if region is not None:
println(u"""
if (region().length() == 0) {{
setRegion("{}");
}}""".format(region).strip("\n"))
println(u"""
}""".strip("\n"))
println(u"""
}
""".strip("\n"))
def writeComplexRegionTagMappingsNative(println, complex_region_mappings,
description, source, url):
""" Emit the C++ function |LanguageTag::performComplexRegionMappings|.

|complex_region_mappings| maps a deprecated region subtag to a pair
(default, non_default_replacements). |non_default_replacements| contains
(language, script, region) triples: when the tag's language (and script,
unless it is None) matches, the emitted code sets the triple's region;
if no triple matches, it sets |default|.

println -- function used to write one chunk of output
description, source, url -- passed on to |writeMappingHeader|
"""
println(u"")
writeMappingHeader(println, description, source, url)
println(u"""
void js::intl::LanguageTag::performComplexRegionMappings() {
MOZ_ASSERT(IsStructurallyValidLanguageTag(language().range()));
MOZ_ASSERT(IsStructurallyValidRegionTag(region().range()));
""".lstrip())
# |non_default_replacements| is a list and hence not hashable. Convert it
# to a string to get a proper hashable value.
def hash_key(default, non_default_replacements):
return (default, str(sorted(str(v) for v in non_default_replacements)))
# Merge duplicate region entries.
#
# For every distinct set of replacements, the first deprecated region (in
# sorted order) creates an empty list; each later deprecated region with the
# same replacements is appended to that list.
region_aliases = {}
for (deprecated_region, (default, non_default_replacements)) in (
sorted(complex_region_mappings.items(), key=itemgetter(0))
):
key = hash_key(default, non_default_replacements)
if key not in region_aliases:
region_aliases[key] = []
else:
region_aliases[key].append(deprecated_region)
first_region = True
for (deprecated_region, (default, non_default_replacements)) in (
sorted(complex_region_mappings.items(), key=itemgetter(0))
):
key = hash_key(default, non_default_replacements)
# Regions recorded as aliases are emitted together with the first region
# that shares their replacements.
if deprecated_region in region_aliases[key]:
continue
if_kind = u"if" if first_region else u"else if"
first_region = False
cond = (u"region().equalTo(\"{}\")".format(region)
for region in [deprecated_region] + region_aliases[key])
# Continuation lines are padded to line up behind "if (" resp. "else if (".
cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
println(u"""
{} ({}) {{""".format(if_kind, cond).strip("\n"))
# Emit one branch per distinct replacement region, in sorted order.
replacement_regions = sorted({region for (_, _, region) in non_default_replacements})
first_case = True
for replacement_region in replacement_regions:
# All (language, script) pairs which select |replacement_region|, sorted
# by language.
replacement_language_script = sorted(((language, script)
for (language, script, region) in (
non_default_replacements
)
if region == replacement_region),
key=itemgetter(0))
if_kind = u"if" if first_case else u"else if"
first_case = False
# C++ expression testing the language and, if given, the script.
def compare_tags(language, script):
if script is None:
return u"language().equalTo(\"{}\")".format(language)
return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format(
language, script)
cond = (compare_tags(language, script)
for (language, script) in replacement_language_script)
cond = (u" ||\n" + u" " * (4 + len(if_kind) + 2)).join(cond)
println(u"""
{} ({}) {{
setRegion("{}");
}}""".format(if_kind, cond, replacement_region).rstrip().strip("\n"))
# The default replacement applies when no language/script pair matched.
println(u"""
else {{
setRegion("{}");
}}
}}""".format(default).rstrip().strip("\n"))
println(u"""
}
""".strip("\n"))
def writeGrandfatheredMappingsFunctionNative(println, grandfathered_mappings,
description, source, url):
""" Writes a function definition that maps grandfathered language tags.

|grandfathered_mappings| maps a grandfathered tag to its modern form. Each
grandfathered tag must consist of a language subtag and exactly one variant
subtag; the modern form must not contain any variant subtags. Both
conditions are asserted below.

println -- function used to write one chunk of output
description, source, url -- passed on to |writeMappingHeader|
"""
println(u"")
writeMappingHeader(println, description, source, url)
# Emit the function head, including the fast path which leaves all tags
# unchanged that can't be regular grandfathered tags.
println(u"""\
bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
// We're mapping regular grandfathered tags to non-grandfathered form here.
// Other tags remain unchanged.
//
// regular = "art-lojban"
// / "cel-gaulish"
// / "no-bok"
// / "no-nyn"
// / "zh-guoyu"
// / "zh-hakka"
// / "zh-min"
// / "zh-min-nan"
// / "zh-xiang"
//
// Therefore we can quickly exclude most tags by checking every
// |unicode_locale_id| subcomponent for characteristics not shared by any of
// the regular grandfathered (RG) tags:
//
// * Real-world |unicode_language_subtag|s are all two or three letters,
// so don't waste time running a useless |language.length > 3| fast-path.
// * No RG tag has a "script"-looking component.
// * No RG tag has a "region"-looking component.
// * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
// zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
// no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
// that |unicode_locale_id| doesn't support.)
// * No RG tag contains |extensions| or |pu_extensions|.
if (script().length() != 0 ||
region().length() != 0 ||
variants().length() != 1 ||
extensions().length() != 0 ||
privateuse()) {
return true;
}
auto variantEqualTo = [this](const char* variant) {
return strcmp(variants()[0].get(), variant) == 0;
};""")
# From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
#
# Doesn't allow any 'extensions' subtags.
re_unicode_locale_id = re.compile(
r"""
^
# unicode_language_id = unicode_language_subtag
# unicode_language_subtag = alpha{2,3} | alpha{5,8}
(?P<language>[a-z]{2,3}|[a-z]{5,8})
# (sep unicode_script_subtag)?
# unicode_script_subtag = alpha{4}
(?:-(?P<script>[a-z]{4}))?
# (sep unicode_region_subtag)?
# unicode_region_subtag = (alpha{2} | digit{3})
(?:-(?P<region>([a-z]{2}|[0-9]{3})))?
# (sep unicode_variant_subtag)*
# unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
(?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
# pu_extensions?
# pu_extensions = sep [xX] (sep alphanum{1,8})+
(?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))?
$
""", re.IGNORECASE | re.VERBOSE)
# Emit one branch per grandfathered tag, in sorted order.
is_first = True
for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)):
# Check that the grandfathered tag has the shape the emitted fast path
# relies on: language plus exactly one variant and nothing else.
tag_match = re_unicode_locale_id.match(tag)
assert tag_match is not None
tag_language = tag_match.group("language")
assert tag_match.group("script") is None, (
"{} does not contain a script subtag".format(tag))
assert tag_match.group("region") is None, (
"{} does not contain a region subtag".format(tag))
tag_variants = tag_match.group("variants")
assert tag_variants is not None, (
"{} contains a variant subtag".format(tag))
assert tag_match.group("privateuse") is None, (
"{} does not contain a privateuse subtag".format(tag))
# Drop the leading separator of the matched variants.
tag_variant = tag_variants[1:]
assert "-" not in tag_variant, (
"{} contains only a single variant".format(tag))
# Split the modern form into its subtags.
modern_match = re_unicode_locale_id.match(modern)
assert modern_match is not None
modern_language = modern_match.group("language")
modern_script = modern_match.group("script")
modern_region = modern_match.group("region")
modern_variants = modern_match.group("variants")
modern_privateuse = modern_match.group("privateuse")
println(u"""
// {} -> {}
""".format(tag, modern).rstrip())
println(u"""
{}if (language().equalTo("{}") && variantEqualTo("{}")) {{
""".format("" if is_first else "else ",
tag_language,
tag_variant).rstrip().strip("\n"))
is_first = False
println(u"""
setLanguage("{}");
""".format(modern_language).rstrip().strip("\n"))
if modern_script is not None:
println(u"""
setScript("{}");
""".format(modern_script).rstrip().strip("\n"))
if modern_region is not None:
println(u"""
setRegion("{}");
""".format(modern_region).rstrip().strip("\n"))
# The emitted code unconditionally clears the variants, which is only
# correct if the modern form doesn't have any.
assert modern_variants is None, (
"all regular grandfathered tags' modern forms do not contain variant subtags")
println(u"""
clearVariants();
""".rstrip().strip("\n"))
# Duplicating the private-use string can fail in the emitted code, in
# which case the emitted function returns false.
if modern_privateuse is not None:
println(u"""
auto privateuse = DuplicateString(cx, "{}");
if (!privateuse) {{
return false;
}}
setPrivateuse(std::move(privateuse));
""".format(modern_privateuse).rstrip().rstrip("\n"))
println(u"""
return true;
}""".rstrip().strip("\n"))
println(u"""
return true;
}""")
@contextlib.contextmanager
def TemporaryDirectory():
tmpDir = tempfile.mkdtemp()
@ -674,6 +1096,106 @@ def writeCLDRLanguageTagData(println, data, url):
source, url)
def writeCLDRLanguageTagDataNative(println, data, url):
""" Writes the language tag data to the Intl data file.

The output consists of the generated-file warning, the C++ includes, the
helper templates |HasReplacement| and |SearchReplacement|, and then one C++
function per mapping read from |data|.

println -- function used to write one chunk of output
data -- dictionary with the entries "version", "grandfatheredMappings",
"languageMappings", "complexLanguageMappings", "regionMappings", and
"complexRegionMappings"
url -- passed on to the individual writer functions
"""
println(generatedFileWarning)
# Includes and the binary search helpers used by the emitted mapping
# functions.
println(u"""
#include "mozilla/Assertions.h"
#include "mozilla/Range.h"
#include "mozilla/TextUtils.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <type_traits>
#include "builtin/intl/LanguageTag.h"
#include "util/Text.h"
#include "vm/JSContext.h"
using ConstCharRange = mozilla::Range<const char>;
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
const char (&subtags)[Length][TagLength],
const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
MOZ_ASSERT(subtag.length() == TagLength - 1,
"subtag must have the same length as the list of subtags");
const char* ptr = subtag.range().begin().get();
return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
[](const char* a, const char* b) {
return memcmp(a, b, TagLength - 1) < 0;
});
}
template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
const char (&subtags)[Length][TagLength],
const char* (&aliases)[Length],
const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
MOZ_ASSERT(subtag.length() == TagLength - 1,
"subtag must have the same length as the list of subtags");
const char* ptr = subtag.range().begin().get();
auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
[](const char* a, const char* b) {
return memcmp(a, b, TagLength - 1) < 0;
});
if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
return aliases[std::distance(std::begin(subtags), p)];
}
return nullptr;
}
""".rstrip())
source = u"CLDR Supplemental Data, version {}".format(data["version"])
grandfathered_mappings = data["grandfatheredMappings"]
language_mappings = data["languageMappings"]
complex_language_mappings = data["complexLanguageMappings"]
region_mappings = data["regionMappings"]
complex_region_mappings = data["complexRegionMappings"]
# The maximum lengths are passed to |writeMappingsBinarySearch|, which omits
# the length check for subtags of exactly that length.
# unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
language_maxlength = 8
# unicode_region_subtag = (alpha{2} | digit{3}) ;
region_maxlength = 3
# Simple mappings are passed as dictionaries, so the emitted functions
# replace the subtag. For complex mappings only the keys are passed, so the
# emitted functions merely test whether a subtag has a complex mapping.
writeMappingsBinarySearch(println, "languageMapping",
"LanguageSubtag&", "language",
"IsStructurallyValidLanguageTag",
language_mappings, language_maxlength,
"Mappings from language subtags to preferred values.", source, url)
writeMappingsBinarySearch(println, "complexLanguageMapping",
"const LanguageSubtag&", "language",
"IsStructurallyValidLanguageTag",
complex_language_mappings.keys(), language_maxlength,
"Language subtags with complex mappings.", source, url)
writeMappingsBinarySearch(println, "regionMapping",
"RegionSubtag&", "region",
"IsStructurallyValidRegionTag",
region_mappings, region_maxlength,
"Mappings from region subtags to preferred values.", source, url)
writeMappingsBinarySearch(println, "complexRegionMapping",
"const RegionSubtag&", "region",
"IsStructurallyValidRegionTag",
complex_region_mappings.keys(), region_maxlength,
"Region subtags with complex mappings.", source, url)
# The functions which actually apply the complex and grandfathered mappings.
writeComplexLanguageTagMappingsNative(println, complex_language_mappings,
"Language subtags with complex mappings.", source, url)
writeComplexRegionTagMappingsNative(println, complex_region_mappings,
"Region subtags with complex mappings.", source, url)
writeGrandfatheredMappingsFunctionNative(println, grandfathered_mappings,
"Canonicalize grandfathered locale identifiers.", source,
url)
def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
""" Writes the likely-subtags test file. """
@ -886,6 +1408,13 @@ def updateCLDRLangTags(args):
println(u"// Generated by make_intl_data.py. DO NOT EDIT.")
writeCLDRLanguageTagData(println, data, url)
print("Writing Intl data...")
native_out = "LanguageTagGenerated.cpp"
# native_out = os.path.splitext(out)[0] + ".cpp"
with io.open(native_out, mode="w", encoding="utf-8", newline="") as f:
println = partial(print, file=f)
writeCLDRLanguageTagDataNative(println, data, url)
print("Writing Intl test data...")
test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"../../tests/non262/Intl/Locale/likely-subtags-generated.js")
@ -894,7 +1423,7 @@ def updateCLDRLangTags(args):
println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl')||"
u"(!this.Intl.Locale&&!this.hasOwnProperty('addIntlExtras')))")
println(u"// Generated by make_intl_data.py. DO NOT EDIT.")
println(generatedFileWarning)
writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
@ -1780,6 +2309,179 @@ def writeUnicodeExtensionsFile(version, url, mapping, out):
println(u" },")
println(u"};")
with io.open(os.path.splitext(out)[0] + ".cpp", mode="w", encoding="utf-8", newline="") as f:
println = partial(print, file=f)
println(generatedFileWarning)
println(u"// Version: CLDR-{}".format(version))
println(u"// URL: {}".format(url))
println(u"""
#include "mozilla/Assertions.h"
#include "mozilla/Range.h"
#include "mozilla/TextUtils.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include "builtin/intl/LanguageTag.h"
using namespace js::intl::LanguageTagLimits;
using ConstCharRange = mozilla::Range<const char>;
/**
 * Return true if the Unicode extension key |key| matches the string literal
 * |str|. Only literals of exactly |UnicodeKeyLength| characters are accepted.
 *
 * The length of |key| is not checked here; the first |UnicodeKeyLength|
 * characters of |key| are compared.
 */
template <size_t Length>
static inline bool IsUnicodeKey(const ConstCharRange& key,
                                const char (&str)[Length]) {
  // |Length| includes the literal's null-terminator.
  constexpr size_t KeyLength = Length - 1;
  static_assert(KeyLength == UnicodeKeyLength,
                "Unicode extension key is two characters long");

  const char* keyChars = key.begin().get();
  return memcmp(keyChars, str, KeyLength) == 0;
}
/**
 * Return true if the Unicode extension type |type| is equal to the string
 * literal |str|. Only literals longer than |UnicodeKeyLength| characters are
 * accepted.
 */
template <size_t Length>
static inline bool IsUnicodeType(const ConstCharRange& type,
                                 const char (&str)[Length]) {
  // |Length| includes the literal's null-terminator.
  constexpr size_t TypeLength = Length - 1;
  static_assert(TypeLength > UnicodeKeyLength,
                "Unicode extension type contains more than two characters");

  // Strings of different length can never be equal; checking the length first
  // also guarantees that memcmp doesn't read past the end of |type|.
  if (type.length() != TypeLength) {
    return false;
  }
  return memcmp(type.begin().get(), str, TypeLength) == 0;
}
/**
 * Three-way comparison of the null-terminated string |a| against the
 * character range |b|. Characters are compared as unsigned values.
 *
 * Returns a negative number if |a| sorts before |b|, zero if both strings are
 * equal, and a positive number if |a| sorts after |b|.
 *
 * |b| must not contain any null-characters.
 */
static int32_t CompareUnicodeType(const char* a, const ConstCharRange& b) {
#ifdef DEBUG
  auto isNull = [](char c) {
    return c == '\\0';
  };
#endif
  MOZ_ASSERT(std::none_of(b.begin().get(), b.end().get(), isNull),
             "unexpected null-character in string");

  using UnsignedChar = unsigned char;
  for (size_t i = 0; i < b.length(); i++) {
    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
    // we've reached the end of |a|, the below if-statement will always be true.
    // That ensures we don't read past the end of |a|.
    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {
      return r;
    }
  }

  // All characters of |b| matched. Return zero if both strings are equal or a
  // positive number if |b| is a proper prefix of |a|: the longer string must
  // sort after its prefix, otherwise the result is inconsistent with the loop
  // above and sorted tables are no longer partitioned for binary search.
  return int32_t(UnsignedChar(a[b.length()]));
}
/**
 * Binary search for |type| in |types| and return the entry of |aliases| at
 * the matching index, or nullptr if |type| isn't present.
 *
 * |types| and |aliases| are parallel arrays of the same length. The binary
 * search requires |types| to be sorted consistently with CompareUnicodeType.
 */
template <size_t Length>
static inline const char* SearchReplacement(const char* (&types)[Length],
                                            const char* (&aliases)[Length],
                                            const ConstCharRange& type) {
  const auto first = std::begin(types);
  const auto last = std::end(types);

  auto sortsBefore = [](const auto& a, const auto& b) {
    return CompareUnicodeType(a, b) < 0;
  };

  // lower_bound only finds the first entry not sorting before |type|, so an
  // exact match still has to be confirmed.
  const auto found = std::lower_bound(first, last, type, sortsBefore);
  if (found == last || CompareUnicodeType(*found, type) != 0) {
    return nullptr;
  }
  return aliases[std::distance(first, found)];
}
""".rstrip("\n"))
println(u"""
/**
* Mapping from deprecated BCP 47 Unicode extension types to their preferred
* values.
*
* Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
*/
const char* js::intl::LanguageTag::replaceUnicodeExtensionType(
const ConstCharRange& key, const ConstCharRange& type) {
#ifdef DEBUG
static auto isAsciiLowercaseAlphanumeric = [](char c) {
return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
};
static auto isAsciiLowercaseAlphanumericOrDash = [](char c) {
return isAsciiLowercaseAlphanumeric(c) || c == '-';
};
#endif
MOZ_ASSERT(key.length() == UnicodeKeyLength);
MOZ_ASSERT(std::all_of(key.begin().get(), key.end().get(),
isAsciiLowercaseAlphanumeric));
MOZ_ASSERT(type.length() > UnicodeKeyLength);
MOZ_ASSERT(std::all_of(type.begin().get(), type.end().get(),
isAsciiLowercaseAlphanumericOrDash));
""")
def to_hash_key(replacements):
    """ Build a string key identifying a set of type replacements.

    Only the (type, preferred value) pairs contribute to the key, and they are
    sorted first, so two mappings with the same pairs produce the same key
    regardless of iteration order.
    """
    pairs = []
    for (name, info) in replacements.items():
        pairs.append(str((name, info["preferred"])))
    pairs.sort()
    return str(pairs)
def write_array(subtags, name, length):
    """ Print |subtags| as a static C++ array of string literals named |name|.

    Each literal is right-justified to |length| characters plus its two
    quotes, and the number of literals per row is chosen from an 80 column
    budget.
    """
    # Columns taken by the row prefix and by one entry: the padded tag, its
    # two quotes, and the ", " separator.
    prefix_width = len(" ")
    entry_width = length + len('"", ')
    max_entries = (80 - prefix_width) // entry_width

    println(u" static const char* {}[{}] = {{".format(name, len(subtags)))

    for group in grouper(subtags, max_entries):
        quoted = []
        for tag in group:
            # Presumably grouper() pads the last group with None; those
            # placeholders must not be printed.
            if tag is None:
                continue
            quoted.append(u"\"{}\"".format(tag).rjust(length + 2))
        println(u" {},".format(u", ".join(quoted)))

    println(u" };")
# Merge duplicate keys.
key_aliases = {}
for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
hash_key = to_hash_key(replacements)
if hash_key not in key_aliases:
key_aliases[hash_key] = []
else:
key_aliases[hash_key].append(key)
first_key = True
for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
hash_key = to_hash_key(replacements)
if key in key_aliases[hash_key]:
continue
cond = (u"IsUnicodeKey(key, \"{}\")".format(k) for k in [key] + key_aliases[hash_key])
if_kind = u"if" if first_key else u"else if"
cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
println(u"""
{} ({}) {{""".format(if_kind, cond).strip("\n"))
first_key = False
replacements = sorted(replacements.items(), key=itemgetter(0))
if len(replacements) > 4:
types = [t for (t, _) in replacements]
preferred = [r["preferred"] for (_, r) in replacements]
max_len = max(len(k) for k in types + preferred)
write_array(types, "types", max_len)
write_array(preferred, "aliases", max_len)
println(u"""
return SearchReplacement(types, aliases, type);
""".strip("\n"))
else:
for (type, replacement) in replacements:
println(u"""
if (IsUnicodeType(type, "{}")) {{
return "{}";
}}""".format(type, replacement["preferred"]).strip("\n"))
println(u"""
}""".lstrip("\n"))
println(u"""
return nullptr;
}
""".strip("\n"))
def updateUnicodeExtensions(args):
""" Update the UnicodeExtensionsGenerated.js file. """

Просмотреть файл

@ -380,11 +380,14 @@ if CONFIG['ENABLE_INTL_API']:
'builtin/intl/CommonFunctions.cpp',
'builtin/intl/DateTimeFormat.cpp',
'builtin/intl/IntlObject.cpp',
'builtin/intl/LanguageTag.cpp',
'builtin/intl/LanguageTagGenerated.cpp',
'builtin/intl/Locale.cpp',
'builtin/intl/NumberFormat.cpp',
'builtin/intl/PluralRules.cpp',
'builtin/intl/RelativeTimeFormat.cpp',
'builtin/intl/SharedIntlData.cpp',
'builtin/intl/UnicodeExtensionsGenerated.cpp',
]
if CONFIG['MOZ_INSTRUMENTS']: