Bug 1451082 - Update IANA language subtag registry data to version 2018-03-30. r=Waldo

This commit is contained in:
André Bargull 2018-04-25 11:46:14 -07:00
Родитель 268b9d66c5
Коммит 37bc3aaed9
4 изменённых файлов: 781 добавлений и 195 удалений

Просмотреть файл

@ -125,17 +125,16 @@ function getUnicodeExtensions(locale) {
* Parser for BCP 47 language tags.
*
* Returns null if |locale| can't be parsed as a Language-Tag. If the input is
* an irregular grandfathered language tag, the object
* a grandfathered language tag, the object
*
* {
* locale: locale.toLowerCase(),
* locale: locale (normalized to canonical form),
* grandfathered: true,
* }
*
* is returned. Otherwise the returned object has the following structure:
*
* {
* locale: locale.toLowerCase(),
* language: language subtag without extlang / undefined,
* extlang1: first extlang subtag / undefined,
* extlang2: second extlang subtag / undefined,
@ -147,13 +146,12 @@ function getUnicodeExtensions(locale) {
* privateuse: privateuse subtag / undefined,
* }
*
* All language tag subtags are returned in lower-case:
* All language tag subtags are returned in their normalized case:
*
* var langtag = parseLanguageTag("en-Latn-US");
* assertEq("en-latn-us", langtag.locale);
* var langtag = parseLanguageTag("en-latn-us");
* assertEq("en", langtag.language);
* assertEq("latn", langtag.script);
* assertEq("us", langtag.region);
* assertEq("Latn", langtag.script);
* assertEq("US", langtag.region);
*
* Spec: RFC 5646 section 2.1.
*/
@ -307,6 +305,12 @@ function parseLanguageTag(locale) {
// script = 4ALPHA ; ISO 15924 code
if (tokenLength === 4 && token === ALPHA) {
script = tokenStringLower();
// The first character of a script code needs to be capitalized.
// "hans" -> "Hans"
script = callFunction(std_String_toUpperCase, script[0]) +
Substring(script, 1, script.length - 1);
if (!nextToken())
return null;
}
@ -315,6 +319,10 @@ function parseLanguageTag(locale) {
// / 3DIGIT ; UN M.49 code
if ((tokenLength === 2 && token === ALPHA) || (tokenLength === 3 && token === DIGIT)) {
region = tokenStringLower();
// Region codes need to be in upper-case. "bu" -> "BU"
region = callFunction(std_String_toUpperCase, region);
if (!nextToken())
return null;
}
@ -417,12 +425,11 @@ function parseLanguageTag(locale) {
localeLowercase.length - privateuseStart);
}
// Return if the complete input was successfully parsed. That means it is
// either a langtag or privateuse-only language tag, or it is a regular
// grandfathered language tag.
if (token === NONE) {
// Return if the complete input was successfully parsed and it is not a
// regular grandfathered language tag. That means it is either a langtag
// or privateuse-only language tag.
if (token === NONE && !hasOwn(localeLowercase, grandfatheredMappings)) {
return {
locale: localeLowercase,
language,
extlang1,
extlang2,
@ -443,76 +450,48 @@ function parseLanguageTag(locale) {
// For example we need to reject "i-ha\u212A" (U+212A KELVIN SIGN) even
// though its lower-case form "i-hak" matches a grandfathered language
// tag.
do {
while (token !== NONE) {
if (!nextToken())
return null;
} while (token !== NONE);
}
// grandfathered = irregular ; non-redundant tags registered
// / regular ; during the RFC 3066 era
switch (localeLowercase) {
#ifdef DEBUG
// regular = "art-lojban" ; these tags match the 'langtag'
// / "cel-gaulish" ; production, but their subtags
// / "no-bok" ; are not extended language
// / "no-nyn" ; or variant subtags: their meaning
// / "zh-guoyu" ; is defined by their registration
// / "zh-hakka" ; and all of these are deprecated
// / "zh-min" ; in favor of a more modern
// / "zh-min-nan" ; subtag or sequence of subtags
// / "zh-xiang"
case "art-lojban":
case "cel-gaulish":
case "no-bok":
case "no-nyn":
case "zh-guoyu":
case "zh-hakka":
case "zh-min":
case "zh-min-nan":
case "zh-xiang":
assert(false, "regular grandfathered tags should have been matched above");
#endif /* DEBUG */
// irregular = "en-GB-oed" ; irregular tags do not match
// / "i-ami" ; the 'langtag' production and
// / "i-bnn" ; would not otherwise be
// / "i-default" ; considered 'well-formed'
// / "i-enochian" ; These tags are all valid,
// / "i-hak" ; but most are deprecated
// / "i-klingon" ; in favor of more modern
// / "i-lux" ; subtags or subtag
// / "i-mingo" ; combination
// / "i-navajo"
// / "i-pwn"
// / "i-tao"
// / "i-tay"
// / "i-tsu"
// / "sgn-BE-FR"
// / "sgn-BE-NL"
// / "sgn-CH-DE"
case "en-gb-oed":
case "i-ami":
case "i-bnn":
case "i-default":
case "i-enochian":
case "i-hak":
case "i-klingon":
case "i-lux":
case "i-mingo":
case "i-navajo":
case "i-pwn":
case "i-tao":
case "i-tay":
case "i-tsu":
case "sgn-be-fr":
case "sgn-be-nl":
case "sgn-ch-de":
return { locale: localeLowercase, grandfathered: true };
default:
return null;
// irregular = "en-GB-oed" ; irregular tags do not match
// / "i-ami" ; the 'langtag' production and
// / "i-bnn" ; would not otherwise be
// / "i-default" ; considered 'well-formed'
// / "i-enochian" ; These tags are all valid,
// / "i-hak" ; but most are deprecated
// / "i-klingon" ; in favor of more modern
// / "i-lux" ; subtags or subtag
// / "i-mingo" ; combination
// / "i-navajo"
// / "i-pwn"
// / "i-tao"
// / "i-tay"
// / "i-tsu"
// / "sgn-BE-FR"
// / "sgn-BE-NL"
// / "sgn-CH-DE"
// regular = "art-lojban" ; these tags match the 'langtag'
// / "cel-gaulish" ; production, but their subtags
// / "no-bok" ; are not extended language
// / "no-nyn" ; or variant subtags: their meaning
// / "zh-guoyu" ; is defined by their registration
// / "zh-hakka" ; and all of these are deprecated
// / "zh-min" ; in favor of a more modern
// / "zh-min-nan" ; subtag or sequence of subtags
// / "zh-xiang"
if (hasOwn(localeLowercase, grandfatheredMappings)) {
return {
locale: grandfatheredMappings[localeLowercase],
grandfathered: true
};
}
return null;
#undef NONE
#undef ALPHA
#undef DIGIT
@ -560,16 +539,12 @@ function IsStructurallyValidLanguageTag(locale) {
function CanonicalizeLanguageTagFromObject(localeObj) {
assert(IsObject(localeObj), "CanonicalizeLanguageTagFromObject");
var {locale} = localeObj;
assert(locale === callFunction(std_String_toLowerCase, locale),
"expected lower-case form for locale string");
// Handle grandfathered language tags.
if (hasOwn("grandfathered", localeObj))
return localeObj.locale;
// Handle mappings for complete tags.
if (hasOwn(locale, langTagMappings))
return langTagMappings[locale];
assert(!hasOwn("grandfathered", localeObj),
"grandfathered tags should be mapped completely");
// Update mappings for complete tags.
updateLangTagMappings(localeObj);
var {
language,
@ -630,25 +605,25 @@ function CanonicalizeLanguageTagFromObject(localeObj) {
if (extlang3)
canonical += "-" + extlang3;
// No script replacements are currently present, so append as is.
if (script) {
// The first character of a script code needs to be capitalized.
// "hans" -> "Hans"
script = callFunction(std_String_toUpperCase, script[0]) +
Substring(script, 1, script.length - 1);
// No script replacements are currently present, so append as is.
assert(script.length === 4 &&
script ===
callFunction(std_String_toUpperCase, script[0]) +
callFunction(std_String_toLowerCase, Substring(script, 1, script.length - 1)),
"script must be [A-Z][a-z]{3}");
canonical += "-" + script;
}
if (region) {
// Region codes need to be in upper-case. "bu" -> "BU"
region = callFunction(std_String_toUpperCase, region);
// Replace deprecated subtags with their preferred values.
// "BU" -> "MM"
if (hasOwn(region, regionMappings))
region = regionMappings[region];
assert((2 <= region.length && region.length <= 3) &&
region === callFunction(std_String_toUpperCase, region),
"region must be [A-Z]{2} or [0-9]{3}");
canonical += "-" + region;
}
@ -733,9 +708,9 @@ function ValidateAndCanonicalizeLanguageTag(locale) {
// The language subtag is canonicalized to lower case.
locale = callFunction(std_String_toLowerCase, locale);
// langTagMappings doesn't contain any 2*3ALPHA keys, so we don't need
// to check for possible replacements in this map.
assert(!hasOwn(locale, langTagMappings), "langTagMappings contains no 2*3ALPHA mappings");
// updateLangTagMappings doesn't modify tags containing only
// |language| subtags, so we don't need to call it for possible
// replacements.
// Replace deprecated subtags with their preferred values.
locale = hasOwn(locale, languageMappings)

Просмотреть файл

@ -1,9 +1,321 @@
// Generated by make_intl_data.py. DO NOT EDIT.
/* eslint-disable complexity */
// Mappings from complete tags to preferred values.
// Derived from IANA Language Subtag Registry, file date 2018-03-20.
// Derived from IANA Language Subtag Registry, file date 2018-03-30.
// https://www.iana.org/assignments/language-subtag-registry
var langTagMappings = {
/**
 * Rewrites a parsed language tag in place so that deprecated variant and
 * redundant tags use their preferred values.
 *
 * Mappings applied:
 *   hy-arevela             -> hy
 *   hy-arevmda             -> hyw
 *   ja-Latn-hepburn-heploc -> ja-Latn-alalc97
 *   sgn-<REGION>           -> dedicated sign-language code (exact match only)
 *
 * @param {Object} tag - Non-grandfathered language tag object; its
 *     |language|, |script|, |region|, |extlang1..3|, |variants|,
 *     |extensions| and |privateuse| properties are read and |language|,
 *     |region| and |variants| may be overwritten.
 */
function updateLangTagMappings(tag) {
    assert(IsObject(tag), "tag is an object");
    assert(!hasOwn("grandfathered", tag), "tag is not a grandfathered tag");

    var language = tag.language;

    if (language === "hy") {
        // Only the first matching variant is handled: "arevela" takes
        // precedence over "arevmda" when both are present.
        var obsolete;
        if (tag.variants.length >= 1 &&
            callFunction(ArrayIndexOf, tag.variants, "arevela") > -1)
        {
            // hy-arevela -> hy
            obsolete = "arevela";
        } else if (tag.variants.length >= 1 &&
                   callFunction(ArrayIndexOf, tag.variants, "arevmda") > -1)
        {
            // hy-arevmda -> hyw
            obsolete = "arevmda";
            tag.language = "hyw";
        }
        if (obsolete === undefined)
            return;

        // Build a fresh array holding every variant except the obsolete one.
        var remaining = [];
        for (var h = 0; h < tag.variants.length; h++) {
            var hyVariant = tag.variants[h];
            if (hyVariant !== obsolete)
                _DefineDataProperty(remaining, remaining.length, hyVariant);
        }
        tag.variants = remaining;
        return;
    }

    if (language === "ja") {
        // ja-Latn-hepburn-heploc -> ja-Latn-alalc97
        if (tag.script !== "Latn" || tag.variants.length < 2)
            return;

        // "heploc" only counts when it appears after "hepburn".
        var hepburnAt = callFunction(ArrayIndexOf, tag.variants, "hepburn");
        if (hepburnAt < 0)
            return;
        if (callFunction(ArrayIndexOf, tag.variants, "heploc", hepburnAt + 1) < 0)
            return;

        var retained = [];
        for (var j = 0; j < tag.variants.length; j++) {
            var jaVariant = tag.variants[j];
            if (jaVariant !== "hepburn" && jaVariant !== "heploc")
                _DefineDataProperty(retained, retained.length, jaVariant);
        }
        // Append the replacement variant unless the tag already had it.
        if (callFunction(ArrayIndexOf, retained, "alalc97") < 0)
            _DefineDataProperty(retained, retained.length, "alalc97");
        tag.variants = retained;
        return;
    }

    if (language === "sgn") {
        // Look up the replacement language for the region first; the
        // remaining subtags are only inspected when the region is known.
        var signLanguage;
        switch (tag.region) {
          case "BR": signLanguage = "bzs"; break; // sgn-BR -> bzs
          case "CO": signLanguage = "csn"; break; // sgn-CO -> csn
          case "DE": signLanguage = "gsg"; break; // sgn-DE -> gsg
          case "DK": signLanguage = "dsl"; break; // sgn-DK -> dsl
          case "ES": signLanguage = "ssp"; break; // sgn-ES -> ssp
          case "FR": signLanguage = "fsl"; break; // sgn-FR -> fsl
          case "GB": signLanguage = "bfi"; break; // sgn-GB -> bfi
          case "GR": signLanguage = "gss"; break; // sgn-GR -> gss
          case "IE": signLanguage = "isg"; break; // sgn-IE -> isg
          case "IT": signLanguage = "ise"; break; // sgn-IT -> ise
          case "JP": signLanguage = "jsl"; break; // sgn-JP -> jsl
          case "MX": signLanguage = "mfs"; break; // sgn-MX -> mfs
          case "NI": signLanguage = "ncs"; break; // sgn-NI -> ncs
          case "NL": signLanguage = "dse"; break; // sgn-NL -> dse
          case "NO": signLanguage = "nsl"; break; // sgn-NO -> nsl
          case "PT": signLanguage = "psr"; break; // sgn-PT -> psr
          case "SE": signLanguage = "swl"; break; // sgn-SE -> swl
          case "US": signLanguage = "ase"; break; // sgn-US -> ase
          case "ZA": signLanguage = "sfs"; break; // sgn-ZA -> sfs
          default:
            return;
        }

        // Redundant tags require an exact match: no other subtags allowed.
        if (tag.extlang1 === undefined &&
            tag.extlang2 === undefined &&
            tag.extlang3 === undefined &&
            tag.script === undefined &&
            tag.variants.length === 0 &&
            tag.extensions.length === 0 &&
            tag.privateuse === undefined)
        {
            tag.language = signLanguage;
            tag.region = undefined;
        }
    }
}
/* eslint-enable complexity */
// Mappings from grandfathered tags to preferred values.
// Derived from IANA Language Subtag Registry, file date 2018-03-30.
// https://www.iana.org/assignments/language-subtag-registry
var grandfatheredMappings = {
"art-lojban": "jbo",
"cel-gaulish": "cel-gaulish",
"en-gb-oed": "en-GB-oxendict",
@ -20,46 +332,20 @@ var langTagMappings = {
"i-tao": "tao",
"i-tay": "tay",
"i-tsu": "tsu",
"ja-latn-hepburn-heploc": "ja-Latn-alalc97",
"no-bok": "nb",
"no-nyn": "nn",
"sgn-be-fr": "sfb",
"sgn-be-nl": "vgt",
"sgn-br": "bzs",
"sgn-ch-de": "sgg",
"sgn-co": "csn",
"sgn-de": "gsg",
"sgn-dk": "dsl",
"sgn-es": "ssp",
"sgn-fr": "fsl",
"sgn-gb": "bfi",
"sgn-gr": "gss",
"sgn-ie": "isg",
"sgn-it": "ise",
"sgn-jp": "jsl",
"sgn-mx": "mfs",
"sgn-ni": "ncs",
"sgn-nl": "dse",
"sgn-no": "nsl",
"sgn-pt": "psr",
"sgn-se": "swl",
"sgn-us": "ase",
"sgn-za": "sfs",
"zh-cmn": "cmn",
"zh-cmn-hans": "cmn-Hans",
"zh-cmn-hant": "cmn-Hant",
"zh-gan": "gan",
"zh-guoyu": "cmn",
"zh-hakka": "hak",
"zh-min": "zh-min",
"zh-min-nan": "nan",
"zh-wuu": "wuu",
"zh-xiang": "hsn",
"zh-yue": "yue",
};
// Mappings from language subtags to preferred values.
// Derived from IANA Language Subtag Registry, file date 2018-03-20.
// Derived from IANA Language Subtag Registry, file date 2018-03-30.
// https://www.iana.org/assignments/language-subtag-registry
var languageMappings = {
"aam": "aas",
@ -143,7 +429,7 @@ var languageMappings = {
};
// Mappings from region subtags to preferred values.
// Derived from IANA Language Subtag Registry, file date 2018-03-20.
// Derived from IANA Language Subtag Registry, file date 2018-03-30.
// https://www.iana.org/assignments/language-subtag-registry
var regionMappings = {
"BU": "MM",
@ -158,7 +444,7 @@ var regionMappings = {
// All current deprecated extlang subtags have the form `<prefix>-<extlang>`
// and their preferred value is exactly equal to `<extlang>`. So each key in
// extlangMappings acts both as the extlang subtag and its preferred value.
// Derived from IANA Language Subtag Registry, file date 2018-03-20.
// Derived from IANA Language Subtag Registry, file date 2018-03-30.
// https://www.iana.org/assignments/language-subtag-registry
var extlangMappings = {
"aao": "ar",

Просмотреть файл

@ -69,30 +69,34 @@ def readRegistryRecord(registry):
yield record
return
def readRegistry(registry):
""" Reads IANA Language Subtag Registry and extracts information for Intl.js.
Information extracted:
- langTagMappings: mappings from complete language tags to preferred
- grandfatheredMappings: mappings from grandfathered tags to preferred
complete language tags
- redundantMappings: mappings from redundant tags to preferred complete
language tags
- languageMappings: mappings from language subtags to preferred subtags
- regionMappings: mappings from region subtags to preferred subtags
- variantMappings: mappings from complete language tags to preferred
complete language tags
- extlangMappings: mappings from extlang subtags to preferred subtags,
with prefix to be removed
Returns these four mappings as dictionaries, along with the registry's
Returns these six mappings as dictionaries, along with the registry's
file date.
We also check that extlang mappings don't generate preferred values
which in turn are subject to language subtag mappings, so that
CanonicalizeLanguageTag can process subtags sequentially.
"""
langTagMappings = {}
grandfatheredMappings = {}
redundantMappings = {}
languageMappings = {}
regionMappings = {}
variantMappings = {}
extlangMappings = {}
languageSubtags = set()
extlangSubtags = set()
extlangSubtags = []
for record in readRegistryRecord(registry):
if "File-Date" in record:
@ -103,23 +107,22 @@ def readRegistry(registry):
# Grandfathered tags don't use standard syntax, so
# CanonicalizeLanguageTag expects the mapping table to provide
# the final form for all.
# For langTagMappings, keys must be in lower case; values in
# For grandfatheredMappings, keys must be in lower case; values in
# the case used in the registry.
tag = record["Tag"]
if "Preferred-Value" in record:
langTagMappings[tag.lower()] = record["Preferred-Value"]
grandfatheredMappings[tag.lower()] = record["Preferred-Value"]
else:
langTagMappings[tag.lower()] = tag
grandfatheredMappings[tag.lower()] = tag
elif record["Type"] == "redundant":
# For langTagMappings, keys must be in lower case; values in
# the case used in the registry.
# For redundantMappings, keys and values must be in the case used
# in the registry.
if "Preferred-Value" in record:
langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
redundantMappings[record["Tag"]] = record["Preferred-Value"]
elif record["Type"] == "language":
# For languageMappings, keys and values must be in the case used
# in the registry.
subtag = record["Subtag"]
languageSubtags.add(subtag)
if "Preferred-Value" in record:
# The 'Prefix' field is not allowed for language records.
# https://tools.ietf.org/html/rfc5646#section-3.1.2
@ -139,21 +142,19 @@ def readRegistry(registry):
# The registry currently doesn't contain mappings for scripts.
raise Exception("Unexpected mapping for script subtags")
elif record["Type"] == "variant":
subtag = record["Subtag"]
# For variantMappings, keys and values must be in the case used in
# the registry.
if "Preferred-Value" in record:
if subtag == "heploc":
# The entry for heploc is unique in its complexity; handle
# it as special case below.
continue
# The registry currently doesn't contain mappings for variants,
# except for heploc which is already handled above.
raise Exception("Unexpected mapping for variant subtags")
if "Prefix" not in record:
raise Exception("Unexpected mapping for variant subtags")
tag = "{}-{}".format(record["Prefix"], record["Subtag"])
variantMappings[tag] = record["Preferred-Value"]
elif record["Type"] == "extlang":
# For extlangMappings, keys must be in the case used in the
# registry; values are records with the preferred value and the
# prefix to be removed.
subtag = record["Subtag"]
extlangSubtags.add(subtag)
extlangSubtags.append(subtag)
if "Preferred-Value" in record:
preferred = record["Preferred-Value"]
# The 'Preferred-Value' and 'Subtag' fields MUST be identical.
@ -173,57 +174,349 @@ def readRegistry(registry):
raise Exception("Conflict: extlang with lang mapping: " + extlang)
# Special case for heploc.
langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"
assert variantMappings["ja-Latn-hepburn-heploc"] == "alalc97"
variantMappings["ja-Latn-hepburn-heploc"] = "ja-Latn-alalc97"
# ValidateAndCanonicalizeLanguageTag in CommonFunctions.js expects
# langTagMappings contains no 2*3ALPHA.
assert all(len(lang) > 3 for lang in langTagMappings.iterkeys())
# redundantMappings contains no 2*3ALPHA.
assert all(len(lang) > 3 for lang in redundantMappings.iterkeys())
return {"fileDate": fileDate,
"langTagMappings": langTagMappings,
"grandfatheredMappings": grandfatheredMappings,
"redundantMappings": redundantMappings,
"languageMappings": languageMappings,
"regionMappings": regionMappings,
"variantMappings": variantMappings,
"extlangMappings": extlangMappings}
def writeMappingsVar(intlData, dict, name, description, fileDate, url):
""" Writes a variable definition with a mapping table to file intlData.
Writes the contents of dictionary dict to file intlData with the given
variable name and a comment with description, fileDate, and URL.
"""
intlData.write("\n")
def writeMappingHeader(println, description, fileDate, url):
if type(description) is not list:
description = [description]
for desc in description:
intlData.write("// {0}\n".format(desc))
intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
intlData.write("// {0}\n".format(url))
intlData.write("var {0} = {{\n".format(name))
keys = sorted(dict)
for key in keys:
if isinstance(dict[key], basestring):
value = '"{0}"'.format(dict[key])
println(u"// {0}".format(desc))
println(u"// Derived from IANA Language Subtag Registry, file date {0}.".format(fileDate))
println(u"// {0}".format(url))
def writeMappingsVar(println, mapping, name, description, fileDate, url):
""" Writes a variable definition with a mapping table.
Writes the contents of dictionary |mapping| through the |println|
function with the given variable name and a comment with description,
fileDate, and URL.
"""
println(u"")
writeMappingHeader(println, description, fileDate, url)
println(u"var {0} = {{".format(name))
for key in sorted(mapping):
if isinstance(mapping[key], basestring):
value = '"{0}"'.format(mapping[key])
else:
preferred = dict[key]["preferred"]
prefix = dict[key]["prefix"]
preferred = mapping[key]["preferred"]
prefix = mapping[key]["prefix"]
if key != preferred:
raise Exception("Expected '{0}' matches preferred locale '{1}'".format(key, preferred))
value = '"{0}"'.format(prefix)
intlData.write(' "{0}": {1},\n'.format(key, value))
intlData.write("};\n")
println(u' "{0}": {1},'.format(key, value))
println(u"};")
def writeMappingsFunction(println, variantMappings, redundantMappings, extlangMappings, description, fileDate, url):
""" Writes a function definition which performs language tag mapping.
def writeLanguageTagData(intlData, fileDate, url, langTagMappings, languageMappings,
regionMappings, extlangMappings):
Processes the contents of dictionaries |variantMappings| and
|redundantMappings| through the |println| function with the given
function name and a comment with description, fileDate, and URL.
"""
class Subtag:
    # Kinds of subtag that splitSubtags can classify; Invalid is a sentinel
    # that never results from classification.
    Language, ExtLang, Script, Region, Variant = range(5)
    Invalid = -1

def splitSubtags(tag):
    """ Yields a (kind, subtag) pair for every "-"-separated subtag of |tag|.

        |kind| is one of the Subtag constants. Classification follows the
        langtag productions quoted in the comments below: the first 2-8
        letter subtag is the language, later 3-letter subtags are extlangs,
        and so on.

        Raises AssertionError naming |tag| when a subtag matches none of the
        known productions.
    """
    seenLanguage = False
    for subtag in tag.split("-"):
        # language = 2*3ALPHA / 4ALPHA / 5*8ALPHA
        if len(subtag) in range(2, 8+1) and subtag.isalpha() and not seenLanguage:
            seenLanguage = True
            kind = Subtag.Language

        # extlang = 3ALPHA
        elif len(subtag) == 3 and subtag.isalpha() and seenLanguage:
            kind = Subtag.ExtLang

        # script = 4ALPHA
        elif len(subtag) == 4 and subtag.isalpha():
            kind = Subtag.Script

        # region = 2ALPHA / 3DIGIT
        elif ((len(subtag) == 2 and subtag.isalpha()) or
              (len(subtag) == 3 and subtag.isdigit())):
            kind = Subtag.Region

        # variant = 5*8alphanum / (DIGIT 3alphanum)
        elif ((len(subtag) in range(5, 8+1) and subtag.isalnum()) or
              (len(subtag) == 4 and subtag[0].isdigit() and subtag[1:].isalnum())):
            kind = Subtag.Variant

        else:
            # Report the tag being split; no variable named |key| exists in
            # this scope, so referring to it raised NameError instead.
            assert False, "unexpected language tag '{}'".format(tag)

        yield (kind, subtag)
def language(tag):
    """ Returns the language subtag of |tag|, which must be its first subtag. """
    firstKind, firstSubtag = next(splitSubtags(tag))
    assert firstKind == Subtag.Language
    return firstSubtag
def variants(tag):
    """ Returns the variant subtags of |tag| as a list, in order of appearance. """
    found = []
    for (kind, subtag) in splitSubtags(tag):
        if kind == Subtag.Variant:
            found.append(subtag)
    return found
def emitCompare(tag, preferred, isFirstLanguageTag):
# Emits, through |println|, one JavaScript `if` / `else if` block that tests
# whether the runtime `tag` object matches the language tag |tag| and, when it
# does, rewrites its properties to the subtags of |preferred|.
#
# tag: complete language tag to match (string).
# preferred: complete language tag to map to (string).
# isFirstLanguageTag: when true an `if` is emitted, otherwise an `else if`.
#
# Reads |redundantMappings| from the enclosing scope to decide whether the
# emitted condition must be an exact match.
def println_indent(level, *args):
# NOTE(review): the "- 1" presumably compensates for a separator that
# |println| inserts between its arguments -- confirm against println.
println(u" " * (4 * level - 1), *args)
println2 = partial(println_indent, 2)
println3 = partial(println_indent, 3)
def maybeNext(it):
# Like next(it), but yields an (Invalid, "") pair once |it| is exhausted so
# callers can keep comparing kinds without handling StopIteration.
dummy = (Subtag.Invalid, "")
return next(it, dummy)
# Add a comment for the language tag mapping.
println2(u"// {} -> {}".format(tag, preferred))
# Compare the input language tag with the current language tag.
cond = []
extlangIndex = 1
lastVariant = None
for (kind, subtag) in splitSubtags(tag):
if kind == Subtag.Language:
# No condition is generated for the language subtag itself.
continue
if kind == Subtag.ExtLang:
assert extlangIndex in [1, 2, 3],\
"Language-Tag permits no more than three extlang subtags"
cond.append('tag.extlang{} === "{}"'.format(extlangIndex, subtag))
extlangIndex += 1
elif kind == Subtag.Script:
cond.append('tag.script === "{}"'.format(subtag))
elif kind == Subtag.Region:
cond.append('tag.region === "{}"'.format(subtag))
else:
assert kind == Subtag.Variant
if lastVariant is None:
# First variant: require enough variants overall, then its presence.
cond.append("tag.variants.length >= {}".format(len(variants(tag))))
cond.append('callFunction(ArrayIndexOf, tag.variants, "{}") > -1'.format(subtag))
else:
# Later variants must be found after the position of the previous one.
cond.append('callFunction(ArrayIndexOf, tag.variants, "{}", callFunction(ArrayIndexOf, tag.variants, "{}") + 1) > -1'.format(subtag, lastVariant))
lastVariant = subtag
# Require exact matches for redundant language tags.
if tag in redundantMappings:
tag_it = splitSubtags(tag)
tag_next = partial(maybeNext, tag_it)
(tag_kind, _) = tag_next()
assert tag_kind == Subtag.Language
(tag_kind, _) = tag_next()
subtags = ([(Subtag.ExtLang, "extlang{}".format(i)) for i in range(1, 3+1)] +
[(Subtag.Script, "script"), (Subtag.Region, "region")])
# Every extlang/script/region slot that |tag| does not use must be
# undefined on the runtime tag.
for kind, prop_name in subtags:
if tag_kind == kind:
(tag_kind, _) = tag_next()
else:
cond.append("tag.{} === undefined".format(prop_name))
cond.append("tag.variants.length === {}".format(len(variants(tag))))
while tag_kind == Subtag.Variant:
(tag_kind, _) = tag_next()
cond.append("tag.extensions.length === 0")
cond.append("tag.privateuse === undefined")
assert list(tag_it) == [], "unhandled tag subtags"
# Emit either:
#
# if (cond) {
#
# or:
#
# if (cond_1 &&
# cond_2 &&
# ...
# cond_n)
# {
#
# depending on the number of conditions.
ifOrElseIf = "if" if isFirstLanguageTag else "else if"
assert len(cond) > 0, "expect at least one subtag condition"
if len(cond) == 1:
println2(u"{} ({}) {{".format(ifOrElseIf, cond[0]))
else:
println2(u"{} ({} &&".format(ifOrElseIf, cond[0]))
# Continuation lines are padded by len(ifOrElseIf) + 2 characters, the
# width of the keyword plus " (".
for c in cond[1:-1]:
println2(u"{}{} &&".format(" " * (len(ifOrElseIf) + 2), c))
println2(u"{}{})".format(" " * (len(ifOrElseIf) + 2), cond[-1]))
println2(u"{")
# Iterate over all subtags of |tag| and |preferred| and update |tag|
# with |preferred| in the process. |tag| is modified in-place to use
# the preferred values.
tag_it = splitSubtags(tag)
tag_next = partial(maybeNext, tag_it)
(tag_kind, tag_subtag) = tag_next()
preferred_it = splitSubtags(preferred)
preferred_next = partial(maybeNext, preferred_it)
(preferred_kind, preferred_subtag) = preferred_next()
# Update the language subtag.
assert tag_kind == Subtag.Language and preferred_kind == Subtag.Language
if tag_subtag != preferred_subtag:
println3(u'tag.language = "{}";'.format(preferred_subtag))
(tag_kind, tag_subtag) = tag_next()
(preferred_kind, preferred_subtag) = preferred_next()
# Remove any extlang subtags per RFC 5646, 4.5:
# 'The canonical form contains no 'extlang' subtags.'
# https://tools.ietf.org/html/rfc5646#section-4.5
assert preferred_kind != Subtag.ExtLang
extlangIndex = 1
while tag_kind == Subtag.ExtLang:
assert extlangIndex in [1, 2, 3],\
"Language-Tag permits no more than three extlang subtags"
println3(u"tag.extlang{} = undefined;".format(extlangIndex))
extlangIndex += 1
(tag_kind, tag_subtag) = tag_next()
# Update the script and region subtags.
for kind, prop_name in [(Subtag.Script, "script"), (Subtag.Region, "region")]:
if tag_kind == kind and preferred_kind == kind:
# Present on both sides: assign only when the values differ.
if tag_subtag != preferred_subtag:
println3(u'tag.{} = "{}";'.format(prop_name, preferred_subtag))
(tag_kind, tag_subtag) = tag_next()
(preferred_kind, preferred_subtag) = preferred_next()
elif tag_kind == kind:
# Present only in |tag|: clear the property.
println3(u"tag.{} = undefined;".format(prop_name))
(tag_kind, tag_subtag) = tag_next()
elif preferred_kind == kind:
# Present only in |preferred|: set the property.
println3(u'tag.{} = "{}";'.format(prop_name, preferred_subtag))
(preferred_kind, preferred_subtag) = preferred_next()
# Update variant subtags.
if tag_kind == Subtag.Variant or preferred_kind == Subtag.Variant:
# JS doesn't provide an easy way to remove elements from an array
# which doesn't trigger Symbol.species, so we need to create a new
# array and copy all elements.
println3(u"var newVariants = [];")
# Copy all variant subtags, ignoring those which should be removed.
println3(u"for (var i = 0; i < tag.variants.length; i++) {")
println3(u" var variant = tag.variants[i];")
while tag_kind == Subtag.Variant:
println3(u' if (variant === "{}")'.format(tag_subtag))
println3(u" continue;")
(tag_kind, tag_subtag) = tag_next()
println3(u" _DefineDataProperty(newVariants, newVariants.length, variant);")
println3(u"}")
# Add the new variants, unless already present.
while preferred_kind == Subtag.Variant:
println3(u'if (callFunction(ArrayIndexOf, newVariants, "{}") < 0)'.format(preferred_subtag))
println3(u' _DefineDataProperty(newVariants, newVariants.length, "{}");'.format(preferred_subtag))
(preferred_kind, preferred_subtag) = preferred_next()
# Update the property.
println3(u"tag.variants = newVariants;")
# Ensure both language tags were completely processed.
assert list(tag_it) == [], "unhandled tag subtags"
assert list(preferred_it) == [], "unhandled preferred subtags"
println2(u"}")
# Remove mappings for redundant language tags which are from our point of
# view, wait for it, redundant, because there is an equivalent extlang
# mapping.
#
# For example this entry for the redundant tag "zh-cmn":
#
# Type: redundant
# Tag: zh-cmn
# Preferred-Value: cmn
#
# Can also be expressed through the extlang mapping for "cmn":
#
# Type: extlang
# Subtag: cmn
# Preferred-Value: cmn
# Prefix: zh
#
def hasExtlangMapping(tag, preferred):
    """ Test whether the mapping `tag` -> `preferred` is already expressed by
        an entry in the extlang mapping table. """
    # Pull the first two subtags off the tag: its language subtag and
    # whatever follows it.
    remaining_tag = splitSubtags(tag)
    (_, language_subtag) = next(remaining_tag)
    (second_kind, second_subtag) = next(remaining_tag)

    # Only the leading language subtag of the preferred value is needed here.
    remaining_preferred = splitSubtags(preferred)
    (_, preferred_language) = next(remaining_preferred)

    # A tag whose second subtag is not an extlang cannot be covered by the
    # extlang table.
    is_extlang_tag = second_kind == Subtag.ExtLang
    if not is_extlang_tag:
        return False

    # The extlang table must hold an equivalent entry for this extlang.
    equivalent_entry = (second_subtag, {"preferred": preferred_language, "prefix": language_subtag})
    if equivalent_entry not in extlangMappings.items():
        return False

    # Any trailing subtags, if present, have to be the same on both sides.
    return list(remaining_tag) == list(remaining_preferred)
# Create a single mapping for variant and redundant tags, ignoring the
# entries which are also covered through extlang mappings.
langTagMappings = {tag: preferred
for mapping in [variantMappings, redundantMappings]
for (tag, preferred) in mapping.items()
if not hasExtlangMapping(tag, preferred)}
println(u"")
println(u"/* eslint-disable complexity */")
writeMappingHeader(println, description, fileDate, url)
println(u"function updateLangTagMappings(tag) {")
println(u' assert(IsObject(tag), "tag is an object");')
println(u' assert(!hasOwn("grandfathered", tag), "tag is not a grandfathered tag");')
println(u"")
# Switch on the language subtag.
println(u" switch (tag.language) {")
for lang in sorted(set(language(tag) for tag in langTagMappings)):
println(u' case "{}":'.format(lang))
isFirstLanguageTag = True
for tag in sorted(tag for tag in langTagMappings if language(tag) == lang):
assert not isinstance(langTagMappings[tag], dict),\
"only supports complete language tags"
emitCompare(tag, langTagMappings[tag], isFirstLanguageTag)
isFirstLanguageTag = False
println(u" break;")
println(u" }")
println(u"}")
println(u"/* eslint-enable complexity */")
def writeLanguageTagData(println, data, url):
""" Writes the language tag data to the Intl data file. """
writeMappingsVar(intlData, langTagMappings, "langTagMappings",
"Mappings from complete tags to preferred values.", fileDate, url)
writeMappingsVar(intlData, languageMappings, "languageMappings",
fileDate = data["fileDate"]
grandfatheredMappings = data["grandfatheredMappings"]
redundantMappings = data["redundantMappings"]
languageMappings = data["languageMappings"]
regionMappings = data["regionMappings"]
variantMappings = data["variantMappings"]
extlangMappings = data["extlangMappings"]
writeMappingsFunction(println, variantMappings, redundantMappings, extlangMappings,
"Mappings from complete tags to preferred values.", fileDate, url)
writeMappingsVar(println, grandfatheredMappings, "grandfatheredMappings",
"Mappings from grandfathered tags to preferred values.", fileDate, url)
writeMappingsVar(println, languageMappings, "languageMappings",
"Mappings from language subtags to preferred values.", fileDate, url)
writeMappingsVar(intlData, regionMappings, "regionMappings",
writeMappingsVar(println, regionMappings, "regionMappings",
"Mappings from region subtags to preferred values.", fileDate, url)
writeMappingsVar(intlData, extlangMappings, "extlangMappings",
writeMappingsVar(println, extlangMappings, "extlangMappings",
["Mappings from extlang subtags to preferred values.",
"All current deprecated extlang subtags have the form `<prefix>-<extlang>`",
"and their preferred value is exactly equal to `<extlang>`. So each key in",
@ -256,17 +549,13 @@ def updateLangTags(args):
print("Processing IANA Language Subtag Registry...")
with closing(registry) as reg:
data = readRegistry(reg)
fileDate = data["fileDate"]
langTagMappings = data["langTagMappings"]
languageMappings = data["languageMappings"]
regionMappings = data["regionMappings"]
extlangMappings = data["extlangMappings"]
print("Writing Intl data...")
with codecs.open(out, "w", encoding="utf-8") as intlData:
intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
writeLanguageTagData(intlData, fileDate, url, langTagMappings, languageMappings,
regionMappings, extlangMappings)
with io.open(out, mode="w", encoding="utf-8", newline="") as f:
println = partial(print, file=f)
println(u"// Generated by make_intl_data.py. DO NOT EDIT.")
writeLanguageTagData(println, data, url)
def flines(filepath, encoding="utf-8"):
""" Open filepath and iterate over its content. """
@ -746,11 +1035,11 @@ def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignor
println(u"// Format:")
println(u'// "LinkName", "Target" // ICU-Target [time zone file]')
println(u"struct LinkAndTarget");
println(u"{");
println(u" const char* const link;");
println(u" const char* const target;");
println(u"};");
println(u"struct LinkAndTarget")
println(u"{")
println(u" const char* const link;")
println(u" const char* const target;")
println(u"};")
println(u"")
println(u"const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
for (zone, target, icuTarget) in incorrectLinks:

Просмотреть файл

@ -0,0 +1,36 @@
// |reftest| skip-if(!this.hasOwnProperty("Intl"))
const languageTags = {
// The preferred value of "hy-arevela" is "hy".
"hy-arevela": "hy",
"hy-Armn-arevela": "hy-Armn",
"hy-AM-arevela": "hy-AM",
"hy-arevela-fonipa": "hy-fonipa",
"hy-fonipa-arevela": "hy-fonipa",
// The preferred value of "hy-arevmda" is "hyw".
"hy-arevmda": "hyw",
"hy-Armn-arevmda": "hyw-Armn",
"hy-AM-arevmda": "hyw-AM",
"hy-arevmda-fonipa": "hyw-fonipa",
"hy-fonipa-arevmda": "hyw-fonipa",
// The preferred value of "ja-Latn-hepburn-heploc" is "ja-Latn-alalc97".
"ja-Latn-hepburn-heploc": "ja-Latn-alalc97",
"ja-Latn-JP-hepburn-heploc": "ja-Latn-JP-alalc97",
// Ensure we don't emit "alalc97" when it is already present.
"ja-Latn-alalc97-hepburn-heploc": "ja-Latn-alalc97",
"ja-Latn-hepburn-alalc97-heploc": "ja-Latn-alalc97",
"ja-Latn-hepburn-heploc-alalc97": "ja-Latn-alalc97",
// No replacement when "heploc" appears before "hepburn".
"ja-Latn-heploc-hepburn": "ja-Latn-heploc-hepburn",
};
for (let [tag, canonical] of Object.entries(languageTags)) {
assertEq(Intl.getCanonicalLocales(tag)[0], canonical);
}
if (typeof reportCompare === "function")
reportCompare(0, 0);