go.text/locale: Separated Macro canonicalization into Legacy and

Macro groups, as defined by CLDR. The Legacy cases are now hard coded.
This allows us to handle sh -> sr-Latn without introducing a new data type
just for this case. The set of legacy translations is unlikely to change,
but maketables.go now checks and fails if the set changes.
Also introduced Default CanonType in preparation for adding tag maximization
and minimization.

Further changes: deviating from CLDR in a few places to not have to deal with
legacy choice. CLDR is likely to head in this direction as well, so it
prevents incompatibilities down the road.
Added CLDR option to force strict compliance to CLDR.

Mapping "mo" to "ro-MD" instead of "ro".  In cases where ID is used as a
locale, preserving this piece of information may be important. It is up
to the matching code to establish that "ro" and "ro-MD" are mutually
intelligible.

R=r
CC=golang-dev
https://golang.org/cl/12903045
This commit is contained in:
Marcel van Lohuizen 2013-08-21 11:15:02 +02:00
Родитель 4980de1c40
Коммит 5059ed55b5
6 изменённых файлов: 119 добавлений и 22 удалений

Просмотреть файл

@ -16,20 +16,30 @@ func ExampleID_Canonicalize() {
fmt.Printf("BCP47(%s) -> %s\n", id, l)
l, _ = loc.Canonicalize(locale.Macro)
fmt.Printf("Macro(%s) -> %s\n", id, l)
l, _ = loc.Canonicalize(locale.All)
fmt.Printf("All(%s) -> %s\n", id, l)
}
p("en-Latn")
p("sh")
p("zh-cmn")
p("bjd")
p("iw-Latn-fonipa-u-cu-usd")
// Output:
// BCP47(en-Latn) -> en
// Macro(en-Latn) -> en-Latn
// All(en-Latn) -> en
// BCP47(sh) -> sh
// Macro(sh) -> sh
// All(sh) -> sr-Latn
// BCP47(zh-cmn) -> cmn
// Macro(zh-cmn) -> zh
// All(zh-cmn) -> zh
// BCP47(bjd) -> drl
// Macro(bjd) -> bjd
// All(bjd) -> drl
// BCP47(iw-Latn-fonipa-u-cu-usd) -> he-Latn-fonipa-u-cu-usd
// Macro(iw-Latn-fonipa-u-cu-usd) -> iw-Latn-fonipa-u-cu-usd
// All(iw-Latn-fonipa-u-cu-usd) -> he-Latn-fonipa-u-cu-usd
}
func ExampleID_Language() {

Просмотреть файл

@ -51,7 +51,7 @@ type ID struct {
// In most cases, locale IDs should be created using this method.
func Make(id string) ID {
loc, _ := Parse(id)
loc, _ = loc.Canonicalize(All)
loc, _ = loc.Canonicalize(Default)
return loc
}
@ -85,12 +85,18 @@ const (
Deprecated CanonType = 1 << iota
// Remove redundant scripts.
SuppressScript
// Map the dominant language of macro language group to the macro language identifier.
// Normalize legacy encodings, as defined by CLDR.
Legacy
// Map the dominant language of a macro language group to the macro language identifier.
// For example cmn -> zh.
Macro
// The CLDR flag should be used if full compatibility with CLDR is required. There are
// a few cases where locale.ID may differ from CLDR.
CLDR
// All canonicalizations prescribed by BCP 47.
BCP47 = Deprecated | SuppressScript
All = BCP47 | Macro
BCP47 = Deprecated | SuppressScript
All = BCP47 | Legacy | Macro
Default = All
// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
)
@ -104,19 +110,57 @@ func (loc ID) Canonicalize(t CanonType) (ID, error) {
changed = true
}
}
if t&Legacy != 0 {
// We hard code this set as it is very small, unlikely to change and requires some
// handling that does not fit elsewhere.
switch loc.lang {
case lang_no:
if t&CLDR != 0 {
loc.lang = lang_nb
changed = true
}
case lang_tl:
loc.lang = lang_fil
changed = true
case lang_sh:
if loc.script == 0 {
loc.script = scrLatn
}
loc.lang = lang_sr
changed = true
}
}
if t&Deprecated != 0 {
l := normLang(langOldMap[:], loc.lang)
if l != loc.lang {
// CLDR maps "mo" to "ro". This mapping loses the piece of information
// that "mo" very likely implies the region "MD". This may be important
// for applications that insist on making a difference between these
// two language codes.
if loc.lang == lang_mo && loc.region == 0 && t&CLDR == 0 {
loc.region = regMD
}
changed = true
loc.lang = l
}
loc.lang = l
}
if t&Macro != 0 {
l := normLang(langMacroMap[:], loc.lang)
// We deviate here from CLDR. The mapping "nb" -> "no" qualifies as a typical
// Macro language mapping. However, for legacy reasons, CLDR maps "no",
// the macro language code for Norwegian, to the dominant variant "nb".
// This change is currently under consideration for CLDR as well.
// See http://unicode.org/cldr/trac/ticket/2698 and also
// http://unicode.org/cldr/trac/ticket/1790 for some of the practical
// implications.
// TODO: this code could be removed if CLDR adopts this change.
if l == lang_nb && t&CLDR == 0 {
l = lang_no
}
if l != loc.lang {
changed = true
loc.lang = l
}
loc.lang = l
}
if changed && loc.str != nil {
loc.remakeString()

Просмотреть файл

@ -266,3 +266,38 @@ func TestParseCurrency(t *testing.T) {
}
}
}
// TestCanonicalize parses each input tag, canonicalizes it with the given
// CanonType option set, and compares the resulting string form against the
// expected tag. Cases cover SuppressScript, Legacy, Macro and Deprecated, both
// with and without the CLDR flag.
func TestCanonicalize(t *testing.T) {
	// TODO: do a full test using CLDR data in a separate regression test.
	type canonCase struct {
		in, out string
		option  CanonType
	}
	cases := []canonCase{
		// Script suppression.
		{"en-Latn", "en", SuppressScript},
		{"sr-Cyrl", "sr-Cyrl", SuppressScript},
		// Hard-coded legacy mappings.
		{"sh", "sr-Latn", Legacy},
		{"sh-HR", "sr-Latn-HR", Legacy},
		{"sh-Cyrl-HR", "sr-Cyrl-HR", Legacy},
		{"tl", "fil", Legacy},
		{"no", "no", Legacy},
		{"no", "nb", Legacy | CLDR},
		{"cmn", "cmn", Legacy},
		// Macro language mappings.
		{"cmn", "zh", Macro},
		{"yue", "yue", Macro},
		{"nb", "no", Macro},
		{"nb", "nb", Macro | CLDR},
		{"no", "no", Macro},
		{"no", "no", Macro | CLDR},
		// Deprecated language codes.
		{"iw", "he", Deprecated},
		{"iw", "he", Deprecated | CLDR},
		{"mo", "ro-MD", Deprecated},
		{"mo", "ro", Deprecated | CLDR},
	}
	for idx := range cases {
		tc := cases[idx]
		// Errors are deliberately discarded here, as in the rest of this test:
		// only the resulting string form is compared.
		id, _ := Parse(tc.in)
		id, _ = id.Canonicalize(tc.option)
		if got := id.String(); got != tc.out {
			t.Errorf("%d:%s: was %s; want %s", idx, tc.in, got, tc.out)
		}
	}
}

Просмотреть файл

@ -109,10 +109,10 @@ func TestLangID(t *testing.T) {
{id: "gsw", bcp47: "gsw", iso3: "gsw"},
{id: "gSW", bcp47: "gsw", iso3: "gsw"},
{id: "und", bcp47: "und", iso3: "und"},
{id: "sh", bcp47: "sh", iso3: "hbs", norm: "sr"},
{id: "hbs", bcp47: "sh", iso3: "hbs", norm: "sr"},
{id: "no", bcp47: "no", iso3: "nor", norm: "nb"},
{id: "nor", bcp47: "no", iso3: "nor", norm: "nb"},
{id: "sh", bcp47: "sh", iso3: "hbs", norm: "sh"},
{id: "hbs", bcp47: "sh", iso3: "hbs", norm: "sh"},
{id: "no", bcp47: "no", iso3: "nor", norm: "no"},
{id: "nor", bcp47: "no", iso3: "nor", norm: "no"},
{id: "cmn", bcp47: "cmn", iso3: "cmn", norm: "zh"},
}
for i, tt := range tests {

Просмотреть файл

@ -673,7 +673,7 @@ func (b *builder) writeLanguage() {
meta := b.supp.Metadata
b.writeConst("nonCanonicalUnd", b.lang.index("und"))
b.writeConsts("lang_", b.lang.index, "de", "en")
b.writeConsts("lang_", b.lang.index, "de", "en", "fil", "mo", "nb", "no", "sh", "sr", "tl")
b.writeConst("langPrivateStart", b.langIndex("qaa"))
b.writeConst("langPrivateEnd", b.langIndex("qtz"))
@ -705,10 +705,13 @@ func (b *builder) writeLanguage() {
lang.updateLater(a.Replacement, a.Type)
}
} else if len(a.Type) <= 3 {
if a.Reason != "deprecated" {
if a.Reason == "macrolanguage" {
langMacroMap.add(a.Type)
langMacroMap.updateLater(a.Type, repl)
println(a.Type, repl)
} else if a.Reason == "deprecated" {
// handled elsewhere
} else if l := a.Type; !(l == "sh" || l == "no" || l == "tl") {
log.Fatalf("new %s alias: %s", a.Reason, a.Type)
}
} else {
legacyTag[strings.Replace(a.Type, "_", "-", -1)] = repl
@ -829,7 +832,7 @@ func parseM49(s string) uint16 {
}
func (b *builder) writeRegion() {
b.writeConsts("reg", b.region.index, "US", "ZZ", "XA", "XC")
b.writeConsts("reg", b.region.index, "MD", "US", "ZZ", "XA", "XC")
isoOffset := b.region.index("AA")
m49map := make([]uint16, len(b.region.slice()))

Просмотреть файл

@ -7,8 +7,15 @@ package locale
const nonCanonicalUnd = 415
const (
lang_de = 82
lang_en = 97
lang_de = 82
lang_en = 97
lang_fil = 107
lang_mo = 266
lang_nb = 279
lang_no = 290
lang_sh = 346
lang_sr = 362
lang_tl = 392
)
const langPrivateStart = 11260
@ -312,11 +319,8 @@ var langOldMap = [27]fromTo{
}
// langMacroMap maps languages to their macro language replacement, if applicable.
// Size: 260 bytes, 65 elements
var langMacroMap = [65]fromTo{
{from: 0x122, to: 0x117},
{from: 0x15a, to: 0x16a},
{from: 0x188, to: 0x6b},
// Size: 248 bytes, 62 elements
var langMacroMap = [62]fromTo{
{from: 0x195, to: 0x8},
{from: 0x2ba, to: 0x1b3b},
{from: 0x2ec, to: 0x169},
@ -484,6 +488,7 @@ var suppressScript = [444]uint8{
}
const (
regMD = 181
regUS = 293
regZZ = 339
regXA = 305
@ -1924,4 +1929,4 @@ var regionInclusionNext = [75]uint8{
25, 74, 62,
}
// Size: 14.6K (14907 bytes); Check: 1BCAE8F
// Size: 14.5K (14895 bytes); Check: 1A91D521