зеркало из https://github.com/golang/text.git
308 строки
8.7 KiB
Go
308 строки
8.7 KiB
Go
// Copyright 2013 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
//go:build ignore
|
|
|
|
// Language tag table generator.
|
|
// Data read from the web.
|
|
|
|
package main
|
|
|
|
import (
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"golang.org/x/text/internal/gen"
|
|
"golang.org/x/text/internal/language"
|
|
"golang.org/x/text/unicode/cldr"
|
|
)
|
|
|
|
var (
|
|
test = flag.Bool("test",
|
|
false,
|
|
"test existing tables; can be used to compare web data with package data.")
|
|
outputFile = flag.String("output",
|
|
"tables.go",
|
|
"output file for generated tables")
|
|
)
|
|
|
|
func main() {
|
|
gen.Init()
|
|
|
|
w := gen.NewCodeWriter()
|
|
defer w.WriteGoFile("tables.go", "language")
|
|
|
|
b := newBuilder(w)
|
|
gen.WriteCLDRVersion(w)
|
|
|
|
b.writeConstants()
|
|
b.writeMatchData()
|
|
}
|
|
|
|
type builder struct {
|
|
w *gen.CodeWriter
|
|
hw io.Writer // MultiWriter for w and w.Hash
|
|
data *cldr.CLDR
|
|
supp *cldr.SupplementalData
|
|
}
|
|
|
|
func (b *builder) langIndex(s string) uint16 {
|
|
return uint16(language.MustParseBase(s))
|
|
}
|
|
|
|
func (b *builder) regionIndex(s string) int {
|
|
return int(language.MustParseRegion(s))
|
|
}
|
|
|
|
func (b *builder) scriptIndex(s string) int {
|
|
return int(language.MustParseScript(s))
|
|
}
|
|
|
|
func newBuilder(w *gen.CodeWriter) *builder {
|
|
r := gen.OpenCLDRCoreZip()
|
|
defer r.Close()
|
|
d := &cldr.Decoder{}
|
|
data, err := d.DecodeZip(r)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
b := builder{
|
|
w: w,
|
|
hw: io.MultiWriter(w, w.Hash),
|
|
data: data,
|
|
supp: data.Supplemental(),
|
|
}
|
|
return &b
|
|
}
|
|
|
|
// writeConsts computes f(v) for all v in values and writes the results
|
|
// as constants named _v to a single constant block.
|
|
func (b *builder) writeConsts(f func(string) int, values ...string) {
|
|
fmt.Fprintln(b.w, "const (")
|
|
for _, v := range values {
|
|
fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
|
|
}
|
|
fmt.Fprintln(b.w, ")")
|
|
}
|
|
|
|
// TODO: region inclusion data will probably not be used in future matchers.
|
|
|
|
// langConsts lists the language tags for which writeConstants emits a
// constant named _<tag>. The order here is the order of the generated
// constants.
var langConsts = []string{
	"de",
	"en",
	"fr",
	"it",
	"mo",
	"no",
	"nb",
	"pt",
	"sh",
	"mul",
	"und",
}
|
|
|
|
// scriptConsts lists the script tags for which writeConstants emits a
// constant named _<tag>. The order here is the order of the generated
// constants.
var scriptConsts = []string{
	"Latn",
	"Hani",
	"Hans",
	"Hant",
	"Qaaa",
	"Qaai",
	"Qabx",
	"Zinh",
	"Zyyy",
	"Zzzz",
}
|
|
|
|
// regionConsts lists the region tags for which writeConstants emits a
// constant named _<tag>. The order here is the order of the generated
// constants.
var regionConsts = []string{
	"001",
	"419",
	"BR",
	"CA",
	"ES",
	"GB",
	"MD",
	"PT",
	"UK",
	"US",
	"ZZ",
	"XA",
	"XC",
	"XK", // Unofficial tag for Kosovo.
}
|
|
|
|
func (b *builder) writeConstants() {
|
|
b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
|
|
b.writeConsts(b.regionIndex, regionConsts...)
|
|
b.writeConsts(b.scriptIndex, scriptConsts...)
|
|
}
|
|
|
|
// mutualIntelligibility is one entry of the generated matchLang table: a pair
// of base languages together with their matching distance.
type mutualIntelligibility struct {
	// want is the language index of the desired language.
	want uint16
	// have is the language index of the supported language.
	have uint16
	// distance is the matching distance taken from the CLDR data.
	distance uint8
	// oneway reports whether the entry applies only from want to have.
	oneway bool
}
|
|
|
|
// scriptIntelligibility is one entry of the generated matchScript table: a
// desired and a supported language-script pair together with their matching
// distance. Entries are always oneway.
type scriptIntelligibility struct {
	// wantLang is the language index of the desired language.
	wantLang uint16
	// haveLang is the language index of the supported language.
	haveLang uint16
	// wantScript is the script index of the desired script.
	wantScript uint8
	// haveScript is the script index of the supported script.
	haveScript uint8
	// distance is the matching distance taken from the CLDR data.
	distance uint8
}
|
|
|
|
// regionIntelligibility is one entry of the generated matchRegion table.
// Entries are always twoway.
type regionIntelligibility struct {
	// lang is the compact language id.
	lang uint16
	// script is the script index; 0 means any.
	script uint8
	// group is the region group index; 0 means any. If bit 7 is set it means
	// inverse.
	group uint8
	// distance is the matching distance taken from the CLDR data.
	distance uint8
}
|
|
|
|
// writeMatchData writes tables with languages and scripts for which there is
|
|
// mutual intelligibility. The data is based on CLDR's languageMatching data.
|
|
// Note that we use a different algorithm than the one defined by CLDR and that
|
|
// we slightly modify the data. For example, we convert scores to confidence levels.
|
|
// We also drop all region-related data as we use a different algorithm to
|
|
// determine region equivalence.
|
|
func (b *builder) writeMatchData() {
|
|
lm := b.supp.LanguageMatching.LanguageMatches
|
|
cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
|
|
|
|
regionHierarchy := map[string][]string{}
|
|
for _, g := range b.supp.TerritoryContainment.Group {
|
|
regions := strings.Split(g.Contains, " ")
|
|
regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
|
|
}
|
|
// Regions start at 1, so the slice must be one larger than the number of
|
|
// regions.
|
|
regionToGroups := make([]uint8, language.NumRegions+1)
|
|
|
|
idToIndex := map[string]uint8{}
|
|
for i, mv := range lm[0].MatchVariable {
|
|
if i > 6 {
|
|
log.Fatalf("Too many groups: %d", i)
|
|
}
|
|
idToIndex[mv.Id] = uint8(i + 1)
|
|
// TODO: also handle '-'
|
|
for _, r := range strings.Split(mv.Value, "+") {
|
|
todo := []string{r}
|
|
for k := 0; k < len(todo); k++ {
|
|
r := todo[k]
|
|
regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
|
|
todo = append(todo, regionHierarchy[r]...)
|
|
}
|
|
}
|
|
}
|
|
b.w.WriteVar("regionToGroups", regionToGroups)
|
|
|
|
// maps language id to in- and out-of-group region.
|
|
paradigmLocales := [][3]uint16{}
|
|
locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
|
|
for i := 0; i < len(locales); i += 2 {
|
|
x := [3]uint16{}
|
|
for j := 0; j < 2; j++ {
|
|
pc := strings.SplitN(locales[i+j], "-", 2)
|
|
x[0] = b.langIndex(pc[0])
|
|
if len(pc) == 2 {
|
|
x[1+j] = uint16(b.regionIndex(pc[1]))
|
|
}
|
|
}
|
|
paradigmLocales = append(paradigmLocales, x)
|
|
}
|
|
b.w.WriteVar("paradigmLocales", paradigmLocales)
|
|
|
|
b.w.WriteType(mutualIntelligibility{})
|
|
b.w.WriteType(scriptIntelligibility{})
|
|
b.w.WriteType(regionIntelligibility{})
|
|
|
|
matchLang := []mutualIntelligibility{}
|
|
matchScript := []scriptIntelligibility{}
|
|
matchRegion := []regionIntelligibility{}
|
|
// Convert the languageMatch entries in lists keyed by desired language.
|
|
for _, m := range lm[0].LanguageMatch {
|
|
// Different versions of CLDR use different separators.
|
|
desired := strings.Replace(m.Desired, "-", "_", -1)
|
|
supported := strings.Replace(m.Supported, "-", "_", -1)
|
|
d := strings.Split(desired, "_")
|
|
s := strings.Split(supported, "_")
|
|
if len(d) != len(s) {
|
|
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
|
|
continue
|
|
}
|
|
distance, _ := strconv.ParseInt(m.Distance, 10, 8)
|
|
switch len(d) {
|
|
case 2:
|
|
if desired == supported && desired == "*_*" {
|
|
continue
|
|
}
|
|
// language-script pair.
|
|
matchScript = append(matchScript, scriptIntelligibility{
|
|
wantLang: uint16(b.langIndex(d[0])),
|
|
haveLang: uint16(b.langIndex(s[0])),
|
|
wantScript: uint8(b.scriptIndex(d[1])),
|
|
haveScript: uint8(b.scriptIndex(s[1])),
|
|
distance: uint8(distance),
|
|
})
|
|
if m.Oneway != "true" {
|
|
matchScript = append(matchScript, scriptIntelligibility{
|
|
wantLang: uint16(b.langIndex(s[0])),
|
|
haveLang: uint16(b.langIndex(d[0])),
|
|
wantScript: uint8(b.scriptIndex(s[1])),
|
|
haveScript: uint8(b.scriptIndex(d[1])),
|
|
distance: uint8(distance),
|
|
})
|
|
}
|
|
case 1:
|
|
if desired == supported && desired == "*" {
|
|
continue
|
|
}
|
|
if distance == 1 {
|
|
// nb == no is already handled by macro mapping. Check there
|
|
// really is only this case.
|
|
if d[0] != "no" || s[0] != "nb" {
|
|
log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
|
|
}
|
|
continue
|
|
}
|
|
// TODO: consider dropping oneway field and just doubling the entry.
|
|
matchLang = append(matchLang, mutualIntelligibility{
|
|
want: uint16(b.langIndex(d[0])),
|
|
have: uint16(b.langIndex(s[0])),
|
|
distance: uint8(distance),
|
|
oneway: m.Oneway == "true",
|
|
})
|
|
case 3:
|
|
if desired == supported && desired == "*_*_*" {
|
|
continue
|
|
}
|
|
if desired != supported {
|
|
// This is now supported by CLDR, but only one case, which
|
|
// should already be covered by paradigm locales. For instance,
|
|
// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
|
|
// testdata/CLDRLocaleMatcherTest.txt tests this.
|
|
if supported != "en_*_GB" {
|
|
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
|
|
}
|
|
continue
|
|
}
|
|
ri := regionIntelligibility{
|
|
lang: b.langIndex(d[0]),
|
|
distance: uint8(distance),
|
|
}
|
|
if d[1] != "*" {
|
|
ri.script = uint8(b.scriptIndex(d[1]))
|
|
}
|
|
switch {
|
|
case d[2] == "*":
|
|
ri.group = 0x80 // not contained in anything
|
|
case strings.HasPrefix(d[2], "$!"):
|
|
ri.group = 0x80
|
|
d[2] = "$" + d[2][len("$!"):]
|
|
fallthrough
|
|
case strings.HasPrefix(d[2], "$"):
|
|
ri.group |= idToIndex[d[2]]
|
|
}
|
|
matchRegion = append(matchRegion, ri)
|
|
default:
|
|
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
|
|
}
|
|
}
|
|
sort.SliceStable(matchLang, func(i, j int) bool {
|
|
return matchLang[i].distance < matchLang[j].distance
|
|
})
|
|
b.w.WriteComment(`
|
|
matchLang holds pairs of langIDs of base languages that are typically
|
|
mutually intelligible. Each pair is associated with a confidence and
|
|
whether the intelligibility goes one or both ways.`)
|
|
b.w.WriteVar("matchLang", matchLang)
|
|
|
|
b.w.WriteComment(`
|
|
matchScript holds pairs of scriptIDs where readers of one script
|
|
can typically also read the other. Each is associated with a confidence.`)
|
|
sort.SliceStable(matchScript, func(i, j int) bool {
|
|
return matchScript[i].distance < matchScript[j].distance
|
|
})
|
|
b.w.WriteVar("matchScript", matchScript)
|
|
|
|
sort.SliceStable(matchRegion, func(i, j int) bool {
|
|
return matchRegion[i].distance < matchRegion[j].distance
|
|
})
|
|
b.w.WriteVar("matchRegion", matchRegion)
|
|
}
|