зеркало из https://github.com/golang/text.git
go.text/locale: Separated Macro canonicalization into Legacy and
Macro groups, as defined by CLDR. The Legacy cases are now hard coded. This allows us to handle sh -> sr-Latn without introducing a new data type just for this case. The set of legacy translations is unlikely to change, but maketables.go now checks and fails if the set changes. Also introduced Default CanonType in preparation for adding tag maximization and minimization. Further changes: deviating from CLDR in a few places to not have to deal with legacy choice. CLDR is likely to head in this direction as well, so it prevents incompatibilities down the road. Added CLDR option to force strict compliance to CLDR. Mapping "mo" to "ro-MD" instead of "ro". In cases where ID is used as a locale, preserving this piece of information may be important. It is up to the matching code to establish that "ro" and "ro-MD" are mutually intelligible. R=r CC=golang-dev https://golang.org/cl/12903045
This commit is contained in:
Родитель
4980de1c40
Коммит
5059ed55b5
|
@ -16,20 +16,30 @@ func ExampleID_Canonicalize() {
|
|||
fmt.Printf("BCP47(%s) -> %s\n", id, l)
|
||||
l, _ = loc.Canonicalize(locale.Macro)
|
||||
fmt.Printf("Macro(%s) -> %s\n", id, l)
|
||||
l, _ = loc.Canonicalize(locale.All)
|
||||
fmt.Printf("All(%s) -> %s\n", id, l)
|
||||
}
|
||||
p("en-Latn")
|
||||
p("sh")
|
||||
p("zh-cmn")
|
||||
p("bjd")
|
||||
p("iw-Latn-fonipa-u-cu-usd")
|
||||
// Output:
|
||||
// BCP47(en-Latn) -> en
|
||||
// Macro(en-Latn) -> en-Latn
|
||||
// All(en-Latn) -> en
|
||||
// BCP47(sh) -> sh
|
||||
// Macro(sh) -> sh
|
||||
// All(sh) -> sr-Latn
|
||||
// BCP47(zh-cmn) -> cmn
|
||||
// Macro(zh-cmn) -> zh
|
||||
// All(zh-cmn) -> zh
|
||||
// BCP47(bjd) -> drl
|
||||
// Macro(bjd) -> bjd
|
||||
// All(bjd) -> drl
|
||||
// BCP47(iw-Latn-fonipa-u-cu-usd) -> he-Latn-fonipa-u-cu-usd
|
||||
// Macro(iw-Latn-fonipa-u-cu-usd) -> iw-Latn-fonipa-u-cu-usd
|
||||
// All(iw-Latn-fonipa-u-cu-usd) -> he-Latn-fonipa-u-cu-usd
|
||||
}
|
||||
|
||||
func ExampleID_Language() {
|
||||
|
|
|
@ -51,7 +51,7 @@ type ID struct {
|
|||
// In most cases, locale IDs should be created using this method.
|
||||
func Make(id string) ID {
|
||||
loc, _ := Parse(id)
|
||||
loc, _ = loc.Canonicalize(All)
|
||||
loc, _ = loc.Canonicalize(Default)
|
||||
return loc
|
||||
}
|
||||
|
||||
|
@ -85,12 +85,18 @@ const (
|
|||
Deprecated CanonType = 1 << iota
|
||||
// Remove redundant scripts.
|
||||
SuppressScript
|
||||
// Map the dominant language of macro language group to the macro language identifier.
|
||||
// Normalize legacy encodings, as defined by CLDR.
|
||||
Legacy
|
||||
// Map the dominant language of a macro language group to the macro language identifier.
|
||||
// For example cmn -> zh.
|
||||
Macro
|
||||
// The CLDR flag should be used if full compatibility with CLDR is required. There are
|
||||
// a few cases where locale.ID may differ from CLDR.
|
||||
CLDR
|
||||
// All canonicalizations prescribed by BCP 47.
|
||||
BCP47 = Deprecated | SuppressScript
|
||||
All = BCP47 | Macro
|
||||
BCP47 = Deprecated | SuppressScript
|
||||
All = BCP47 | Legacy | Macro
|
||||
Default = All
|
||||
|
||||
// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
|
||||
)
|
||||
|
@ -104,19 +110,57 @@ func (loc ID) Canonicalize(t CanonType) (ID, error) {
|
|||
changed = true
|
||||
}
|
||||
}
|
||||
if t&Legacy != 0 {
|
||||
// We hard code this set as it is very small, unlikely to change and requires some
|
||||
// handling that does not fit elsewhere.
|
||||
switch loc.lang {
|
||||
case lang_no:
|
||||
if t&CLDR != 0 {
|
||||
loc.lang = lang_nb
|
||||
changed = true
|
||||
}
|
||||
case lang_tl:
|
||||
loc.lang = lang_fil
|
||||
changed = true
|
||||
case lang_sh:
|
||||
if loc.script == 0 {
|
||||
loc.script = scrLatn
|
||||
}
|
||||
loc.lang = lang_sr
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
if t&Deprecated != 0 {
|
||||
l := normLang(langOldMap[:], loc.lang)
|
||||
if l != loc.lang {
|
||||
// CLDR maps "mo" to "ro". This mapping loses the piece of information
|
||||
// that "mo" very likely implies the region "MD". This may be important
|
||||
// for applications that insist on making a difference between these
|
||||
// two language codes.
|
||||
if loc.lang == lang_mo && loc.region == 0 && t&CLDR == 0 {
|
||||
loc.region = regMD
|
||||
}
|
||||
changed = true
|
||||
loc.lang = l
|
||||
}
|
||||
loc.lang = l
|
||||
}
|
||||
if t&Macro != 0 {
|
||||
l := normLang(langMacroMap[:], loc.lang)
|
||||
// We deviate here from CLDR. The mapping "nb" -> "no" qualifies as a typical
|
||||
// Macro language mapping. However, for legacy reasons, CLDR maps "no",
|
||||
// the macro language code for Norwegian, to the dominant variant "nb".
|
||||
// This change is currently under consideration for CLDR as well.
|
||||
// See http://unicode.org/cldr/trac/ticket/2698 and also
|
||||
// http://unicode.org/cldr/trac/ticket/1790 for some of the practical
|
||||
// implications.
|
||||
// TODO: this code could be removed if CLDR adopts this change.
|
||||
if l == lang_nb && t&CLDR == 0 {
|
||||
l = lang_no
|
||||
}
|
||||
if l != loc.lang {
|
||||
changed = true
|
||||
loc.lang = l
|
||||
}
|
||||
loc.lang = l
|
||||
}
|
||||
if changed && loc.str != nil {
|
||||
loc.remakeString()
|
||||
|
|
|
@ -266,3 +266,38 @@ func TestParseCurrency(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanonicalize(t *testing.T) {
|
||||
// TODO: do a full test using CLDR data in a separate regression test.
|
||||
tests := []struct {
|
||||
in, out string
|
||||
option CanonType
|
||||
}{
|
||||
{"en-Latn", "en", SuppressScript},
|
||||
{"sr-Cyrl", "sr-Cyrl", SuppressScript},
|
||||
{"sh", "sr-Latn", Legacy},
|
||||
{"sh-HR", "sr-Latn-HR", Legacy},
|
||||
{"sh-Cyrl-HR", "sr-Cyrl-HR", Legacy},
|
||||
{"tl", "fil", Legacy},
|
||||
{"no", "no", Legacy},
|
||||
{"no", "nb", Legacy | CLDR},
|
||||
{"cmn", "cmn", Legacy},
|
||||
{"cmn", "zh", Macro},
|
||||
{"yue", "yue", Macro},
|
||||
{"nb", "no", Macro},
|
||||
{"nb", "nb", Macro | CLDR},
|
||||
{"no", "no", Macro},
|
||||
{"no", "no", Macro | CLDR},
|
||||
{"iw", "he", Deprecated},
|
||||
{"iw", "he", Deprecated | CLDR},
|
||||
{"mo", "ro-MD", Deprecated},
|
||||
{"mo", "ro", Deprecated | CLDR},
|
||||
}
|
||||
for i, tt := range tests {
|
||||
in, _ := Parse(tt.in)
|
||||
in, _ = in.Canonicalize(tt.option)
|
||||
if in.String() != tt.out {
|
||||
t.Errorf("%d:%s: was %s; want %s", i, tt.in, in.String(), tt.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -109,10 +109,10 @@ func TestLangID(t *testing.T) {
|
|||
{id: "gsw", bcp47: "gsw", iso3: "gsw"},
|
||||
{id: "gSW", bcp47: "gsw", iso3: "gsw"},
|
||||
{id: "und", bcp47: "und", iso3: "und"},
|
||||
{id: "sh", bcp47: "sh", iso3: "hbs", norm: "sr"},
|
||||
{id: "hbs", bcp47: "sh", iso3: "hbs", norm: "sr"},
|
||||
{id: "no", bcp47: "no", iso3: "nor", norm: "nb"},
|
||||
{id: "nor", bcp47: "no", iso3: "nor", norm: "nb"},
|
||||
{id: "sh", bcp47: "sh", iso3: "hbs", norm: "sh"},
|
||||
{id: "hbs", bcp47: "sh", iso3: "hbs", norm: "sh"},
|
||||
{id: "no", bcp47: "no", iso3: "nor", norm: "no"},
|
||||
{id: "nor", bcp47: "no", iso3: "nor", norm: "no"},
|
||||
{id: "cmn", bcp47: "cmn", iso3: "cmn", norm: "zh"},
|
||||
}
|
||||
for i, tt := range tests {
|
||||
|
|
|
@ -673,7 +673,7 @@ func (b *builder) writeLanguage() {
|
|||
meta := b.supp.Metadata
|
||||
|
||||
b.writeConst("nonCanonicalUnd", b.lang.index("und"))
|
||||
b.writeConsts("lang_", b.lang.index, "de", "en")
|
||||
b.writeConsts("lang_", b.lang.index, "de", "en", "fil", "mo", "nb", "no", "sh", "sr", "tl")
|
||||
b.writeConst("langPrivateStart", b.langIndex("qaa"))
|
||||
b.writeConst("langPrivateEnd", b.langIndex("qtz"))
|
||||
|
||||
|
@ -705,10 +705,13 @@ func (b *builder) writeLanguage() {
|
|||
lang.updateLater(a.Replacement, a.Type)
|
||||
}
|
||||
} else if len(a.Type) <= 3 {
|
||||
if a.Reason != "deprecated" {
|
||||
if a.Reason == "macrolanguage" {
|
||||
langMacroMap.add(a.Type)
|
||||
langMacroMap.updateLater(a.Type, repl)
|
||||
println(a.Type, repl)
|
||||
} else if a.Reason == "deprecated" {
|
||||
// handled elsewhere
|
||||
} else if l := a.Type; !(l == "sh" || l == "no" || l == "tl") {
|
||||
log.Fatalf("new %s alias: %s", a.Reason, a.Type)
|
||||
}
|
||||
} else {
|
||||
legacyTag[strings.Replace(a.Type, "_", "-", -1)] = repl
|
||||
|
@ -829,7 +832,7 @@ func parseM49(s string) uint16 {
|
|||
}
|
||||
|
||||
func (b *builder) writeRegion() {
|
||||
b.writeConsts("reg", b.region.index, "US", "ZZ", "XA", "XC")
|
||||
b.writeConsts("reg", b.region.index, "MD", "US", "ZZ", "XA", "XC")
|
||||
|
||||
isoOffset := b.region.index("AA")
|
||||
m49map := make([]uint16, len(b.region.slice()))
|
||||
|
|
|
@ -7,8 +7,15 @@ package locale
|
|||
const nonCanonicalUnd = 415
|
||||
|
||||
const (
|
||||
lang_de = 82
|
||||
lang_en = 97
|
||||
lang_de = 82
|
||||
lang_en = 97
|
||||
lang_fil = 107
|
||||
lang_mo = 266
|
||||
lang_nb = 279
|
||||
lang_no = 290
|
||||
lang_sh = 346
|
||||
lang_sr = 362
|
||||
lang_tl = 392
|
||||
)
|
||||
|
||||
const langPrivateStart = 11260
|
||||
|
@ -312,11 +319,8 @@ var langOldMap = [27]fromTo{
|
|||
}
|
||||
|
||||
// langMacroMap maps languages to their macro language replacement, if applicable.
|
||||
// Size: 260 bytes, 65 elements
|
||||
var langMacroMap = [65]fromTo{
|
||||
{from: 0x122, to: 0x117},
|
||||
{from: 0x15a, to: 0x16a},
|
||||
{from: 0x188, to: 0x6b},
|
||||
// Size: 248 bytes, 62 elements
|
||||
var langMacroMap = [62]fromTo{
|
||||
{from: 0x195, to: 0x8},
|
||||
{from: 0x2ba, to: 0x1b3b},
|
||||
{from: 0x2ec, to: 0x169},
|
||||
|
@ -484,6 +488,7 @@ var suppressScript = [444]uint8{
|
|||
}
|
||||
|
||||
const (
|
||||
regMD = 181
|
||||
regUS = 293
|
||||
regZZ = 339
|
||||
regXA = 305
|
||||
|
@ -1924,4 +1929,4 @@ var regionInclusionNext = [75]uint8{
|
|||
25, 74, 62,
|
||||
}
|
||||
|
||||
// Size: 14.6K (14907 bytes); Check: 1BCAE8F
|
||||
// Size: 14.5K (14895 bytes); Check: 1A91D521
|
||||
|
|
Загрузка…
Ссылка в новой задаче