unicode/runenames: support for multiple unicode versions

Also rewrote the representation, which did not work for
Unicode 10

Change-Id: I39d9907908c67d07f8bbc153942ab0d8ab02ab6e
Reviewed-on: https://go-review.googlesource.com/96736
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ross Light <light@google.com>
This commit is contained in:
Marcel van Lohuizen 2018-02-23 18:02:25 +01:00
Родитель de8df8581d
Коммит ffa9e5ed68
10 изменённых файлов: 31540 добавлений и 15825 удалений

1
gen.go
Просмотреть файл

@ -120,6 +120,7 @@ pkg unicode, var <new script or property> *RangeTable
mib = generate("./encoding/internal/identifier", unicode)
number = generate("./internal/number", unicode, cldr, language, internal)
cldrtree = generate("./internal/cldrtree", language, internal)
_ = generate("./unicode/runenames", unicode)
_ = generate("./encoding/htmlindex", unicode, language, mib)
_ = generate("./encoding/ianaindex", unicode, language, mib)
_ = generate("./secure/precis", unicode, norm, rangetable, cases, width, bidi)

Просмотреть файл

@ -1,59 +0,0 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package runenames
// This file contains code common to gen.go and the package code.
// The mapping from rune to string (i.e. offset and length in the data string)
// is encoded as a two level table. The first level maps from contiguous rune
// ranges [runeOffset, runeOffset+runeLength) to entries. Entries are either
// direct (for repeated names such as "<CJK Ideograph>") or indirect (for runs
// of unique names such as "SPACE", "EXCLAMATION MARK", "QUOTATION MARK", ...).
//
// Each first level table element is 64 bits. The runeOffset (21 bits) and
// runeLength (16 bits) take the 37 high bits. The entry takes the 27 low bits,
// with directness encoded in the least significant bit.
//
// A direct entry encodes a dataOffset (18 bits) and dataLength (8 bits) in the
// data string. 18 bits is too short to encode the entire data string's length,
// but the data string's contents are arranged so that all of the few direct
// entries' offsets come before all of the many indirect entries' offsets.
//
// An indirect entry encodes a dataBase (10 bits) and a table1Offset (16 bits).
// The table1Offset is the start of a range in the second level table. The
// length of that range is the same as the runeLength.
//
// Each second level table element is 16 bits, an index into data, relative to
// a bias equal to (dataBase << dataBaseUnit). That (bias + index) is the
// (dataOffset + dataLength) in the data string. The dataOffset is implied by
// the previous table element (with the same implicit bias).
const (
	// Field widths, in bits, of a 64-bit first-level table element.
	bitsDirect       = 1  // set for direct entries, clear for indirect ones
	bitsDataLength   = 8  // direct entries: length of the name in data
	bitsDataOffset   = 18 // direct entries: offset of the name in data
	bitsTable1Offset = 16 // indirect entries: start of the run in the second-level table
	bitsDataBase     = 10 // indirect entries: bias, in units of 1<<dataBaseUnit
	bitsRuneLength   = 16 // number of runes covered by the element
	bitsRuneOffset   = 21 // first rune covered by the element

	// Bit positions of the fields, built up from the least significant bit.
	// Direct and indirect entries share the direct flag and the rune fields;
	// the bits in between are interpreted according to the flag.
	shiftDirect       = 0
	shiftDataLength   = shiftDirect + bitsDirect
	shiftDataOffset   = shiftDataLength + bitsDataLength
	shiftTable1Offset = shiftDirect + bitsDirect
	shiftDataBase     = shiftTable1Offset + bitsTable1Offset
	shiftRuneLength   = shiftDataOffset + bitsDataOffset
	shiftRuneOffset   = shiftRuneLength + bitsRuneLength

	// Masks that isolate a field once it has been shifted down to bit 0.
	maskDirect       = (1 << bitsDirect) - 1
	maskDataLength   = (1 << bitsDataLength) - 1
	maskDataOffset   = (1 << bitsDataOffset) - 1
	maskTable1Offset = (1 << bitsTable1Offset) - 1
	maskDataBase     = (1 << bitsDataBase) - 1
	maskRuneLength   = (1 << bitsRuneLength) - 1

	// dataBaseUnit is the shift applied to a dataBase value to obtain the
	// bias that second-level table elements are relative to.
	dataBaseUnit = 10
)

Просмотреть файл

@ -41,7 +41,6 @@ func Example() {
'\U00004dc0',
'\U00009fd5',
'\U00009fd6',
'\U00009fff',
'\U0000a000',
0xdc00, // '\U0000dc00' (Low Surrogate) is an invalid Go literal.
@ -95,7 +94,6 @@ func Example() {
// 00003402 "<CJK Ideograph Extension A>"
// 00004dc0 "HEXAGRAM FOR THE CREATIVE HEAVEN"
// 00009fd5 "<CJK Ideograph>"
// 00009fd6 ""
// 00009fff ""
// 0000a000 "YI SYLLABLE IT"
// 0000dc00 "<Low Surrogate>"

Просмотреть файл

@ -7,189 +7,156 @@
package main
import (
"bytes"
"log"
"sort"
"strings"
"unicode"
"golang.org/x/text/internal/gen"
"golang.org/x/text/internal/gen/bitfield"
"golang.org/x/text/internal/ucd"
)
// snippet identifies one name inside data, the byte slice formed by
// concatenating every name. The zero value means "no name".
type snippet struct {
	// offset and length delimit the name within data.
	offset, length int
	// s is the name itself.
	s string
}
// makeTable0EntryDirect packs a first-level table element for a run of runes
// that all share a single name stored in the data string.
//
// rOffset is the first rune of the run and rLength the number of runes in it;
// dOffset and dLength locate the shared name within data. The least
// significant bit of the result is set to mark the element as direct.
//
// The generator is aborted with log.Fatalf if any argument does not fit in
// the bit field reserved for it.
func makeTable0EntryDirect(rOffset, rLength, dOffset, dLength int) uint64 {
	if rOffset >= 1<<bitsRuneOffset {
		log.Fatalf("makeTable0EntryDirect: rOffset %d is too large", rOffset)
	}
	if rLength >= 1<<bitsRuneLength {
		log.Fatalf("makeTable0EntryDirect: rLength %d is too large", rLength)
	}
	if dOffset >= 1<<bitsDataOffset {
		log.Fatalf("makeTable0EntryDirect: dOffset %d is too large", dOffset)
	}
	// dLength occupies bitsDataLength bits. Bounding it by the wider
	// rune-length field would let an oversized length overflow into the
	// adjacent dataOffset bits and silently corrupt the element.
	if dLength >= 1<<bitsDataLength {
		log.Fatalf("makeTable0EntryDirect: dLength %d is too large", dLength)
	}
	return uint64(rOffset)<<shiftRuneOffset |
		uint64(rLength)<<shiftRuneLength |
		uint64(dOffset)<<shiftDataOffset |
		uint64(dLength)<<shiftDataLength |
		1 // Direct bit.
}
// makeTable0EntryIndirect packs a first-level table element for a run of
// runes whose names are looked up through the second-level table.
//
// rOffset is the first rune of the run and rLength the number of runes in it.
// dBase is the data bias shared by the run and t1Offset the start of the
// run's range in the second-level table. The direct bit is left clear.
//
// The first argument found not to fit in its bit field aborts the generator
// with log.Fatalf.
func makeTable0EntryIndirect(rOffset, rLength, dBase, t1Offset int) uint64 {
	switch {
	case rOffset >= 1<<bitsRuneOffset:
		log.Fatalf("makeTable0EntryIndirect: rOffset %d is too large", rOffset)
	case rLength >= 1<<bitsRuneLength:
		log.Fatalf("makeTable0EntryIndirect: rLength %d is too large", rLength)
	case dBase >= 1<<bitsDataBase:
		log.Fatalf("makeTable0EntryIndirect: dBase %d is too large", dBase)
	case t1Offset >= 1<<bitsTable1Offset:
		log.Fatalf("makeTable0EntryIndirect: t1Offset %d is too large", t1Offset)
	}

	packed := uint64(rOffset) << shiftRuneOffset
	packed |= uint64(rLength) << shiftRuneLength
	packed |= uint64(dBase) << shiftDataBase
	packed |= uint64(t1Offset) << shiftTable1Offset
	// Bit 0, the direct flag, stays zero for indirect entries.
	return packed
}
// makeTable1Entry converts x to a second-level table element. Values that
// cannot be represented in 16 bits abort the generator with log.Fatalf.
func makeTable1Entry(x int) uint16 {
	const limit = 0xffff
	if fits := 0 <= x && x <= limit; !fits {
		log.Fatalf("makeTable1Entry: entry %d is out of range", x)
	}
	return uint16(x)
}
var (
data []byte
snippets = make([]snippet, 1+unicode.MaxRune)
// computed by computeDirectOffsets
directOffsets = map[string]int{}
directData bytes.Buffer
// computed by computeEntries
entries []entry
singleData bytes.Buffer
index []uint16
)
// entry describes one contiguous run of runes in the generated lookup table.
// main fills in numRunes and packs each entry into a uint64 with
// bitfield.Pack; the package code reads the packed fields back through the
// startRune, numRunes, index, base and direct accessors.
//
// NOTE(review): the bitfield tags presumably give the packed width in bits
// and an optional accessor name, and untagged fields (end, name) are
// presumably generator-only state that is not packed — confirm against the
// internal/gen/bitfield package.
type entry struct {
// start is the first rune of the run.
start rune `bitfield:"21,startRune"`
// numRunes is the number of runes in the run; main derives it as
// end - start + 1 just before packing.
numRunes int `bitfield:"16"`
// end is the last rune of the run, inclusive. It is extended while
// adjacent records are merged into the run.
end rune
// index is, for direct entries, the offset of the name in directData;
// otherwise it is the position in the index slice that holds the offset
// of the run's first name.
index int `bitfield:"16"`
// base is, for direct entries, the length of the name; otherwise it is the
// offset of the run's first name in singleData shifted right by 16.
base int `bitfield:"6"`
// direct is set when every rune in the run shares one name stored in
// directData.
direct bool `bitfield:""`
// name is the name of the record that started the run; it is compared
// against the next record's name to decide whether direct runs can be merged.
name string
}
func main() {
gen.Init()
names, counts := parse()
appendRepeatNames(names, counts)
appendUniqueNames(names, counts)
table0, table1 := makeTables()
gen.Repackage("gen_bits.go", "bits.go", "runenames")
w := gen.NewCodeWriter()
w.WriteVar("table0", table0)
w.WriteVar("table1", table1)
w.WriteConst("data", string(data))
w.WriteGoFile("tables.go", "runenames")
defer w.WriteVersionedGoFile("tables.go", "runenames")
gen.WriteUnicodeVersion(w)
computeDirectOffsets()
computeEntries()
if err := bitfield.Gen(w, entry{}, nil); err != nil {
log.Fatal(err)
}
type entry uint64 // trick the generation code to use the entry type
packed := []entry{}
for _, e := range entries {
e.numRunes = int(e.end - e.start + 1)
v, err := bitfield.Pack(e, nil)
if err != nil {
log.Fatal(err)
}
packed = append(packed, entry(v))
}
index = append(index, uint16(singleData.Len()))
w.WriteVar("entries", packed)
w.WriteVar("index", index)
w.WriteConst("directData", directData.String())
w.WriteConst("singleData", singleData.String())
}
func parse() (names []string, counts map[string]int) {
names = make([]string, 1+unicode.MaxRune)
counts = map[string]int{}
ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
r, s := p.Rune(0), p.String(ucd.Name)
if s == "" {
return
func computeDirectOffsets() {
counts := map[string]int{}
p := ucd.New(gen.OpenUCDFile("UnicodeData.txt"), ucd.KeepRanges)
for p.Next() {
start, end := p.Range(0)
counts[getName(p)] += int(end-start) + 1
}
direct := []string{}
for k, v := range counts {
if v > 1 {
direct = append(direct, k)
}
if s[0] == '<' {
const first = ", First>"
if i := strings.Index(s, first); i >= 0 {
s = s[:i] + ">"
}
sort.Strings(direct)
for _, s := range direct {
directOffsets[s] = directData.Len()
directData.WriteString(s)
}
}
func computeEntries() {
p := ucd.New(gen.OpenUCDFile("UnicodeData.txt"), ucd.KeepRanges)
for p.Next() {
start, end := p.Range(0)
last := entry{}
if len(entries) > 0 {
last = entries[len(entries)-1]
}
name := getName(p)
if index, ok := directOffsets[name]; ok {
if last.name == name && last.end+1 == start {
entries[len(entries)-1].end = end
continue
}
}
names[r] = s
counts[s]++
})
return names, counts
}
func appendRepeatNames(names []string, counts map[string]int) {
alreadySeen := map[string]snippet{}
for r, s := range names {
if s == "" || counts[s] == 1 {
continue
}
if s[0] != '<' {
log.Fatalf("Repeated name %q does not start with a '<'", s)
}
if z, ok := alreadySeen[s]; ok {
snippets[r] = z
entries = append(entries, entry{
start: start,
end: end,
index: index,
base: len(name),
direct: true,
name: name,
})
continue
}
z := snippet{
offset: len(data),
length: len(s),
s: s,
if start != end {
log.Fatalf("Expected start == end, found %x != %x", start, end)
}
data = append(data, s...)
snippets[r] = z
alreadySeen[s] = z
offset := singleData.Len()
base := offset >> 16
index = append(index, uint16(offset))
singleData.WriteString(name)
if last.base == base && last.end+1 == start {
entries[len(entries)-1].end = start
continue
}
entries = append(entries, entry{
start: start,
end: end,
index: len(index) - 1,
base: base,
name: name,
})
}
}
func appendUniqueNames(names []string, counts map[string]int) {
for r, s := range names {
if s == "" || counts[s] != 1 {
continue
}
if s[0] == '<' {
log.Fatalf("Unique name %q starts with a '<'", s)
}
z := snippet{
offset: len(data),
length: len(s),
s: s,
}
data = append(data, s...)
snippets[r] = z
func getName(p *ucd.Parser) string {
s := p.String(ucd.Name)
if s == "" {
return ""
}
}
func makeTables() (table0 []uint64, table1 []uint16) {
for i := 0; i < len(snippets); {
zi := snippets[i]
if zi == (snippet{}) {
i++
continue
if s[0] == '<' {
const first = ", First>"
if i := strings.Index(s, first); i >= 0 {
s = s[:i] + ">"
}
// Look for repeat names. If we have one, we only need a table0 entry.
j := i + 1
for ; j < len(snippets) && zi == snippets[j]; j++ {
}
if j > i+1 {
table0 = append(table0, makeTable0EntryDirect(i, j-i, zi.offset, zi.length))
i = j
continue
}
// Otherwise, we have a run of unique names. We need one table0 entry
// and two or more table1 entries.
base := zi.offset &^ (1<<dataBaseUnit - 1)
t1Offset := len(table1) + 1
table1 = append(table1, makeTable1Entry(zi.offset-base))
table1 = append(table1, makeTable1Entry(zi.offset+zi.length-base))
for ; j < len(snippets) && snippets[j] != (snippet{}); j++ {
zj := snippets[j]
if data[zj.offset] == '<' {
break
}
table1 = append(table1, makeTable1Entry(zj.offset+zj.length-base))
}
table0 = append(table0, makeTable0EntryIndirect(i, j-i, base>>dataBaseUnit, t1Offset))
i = j
}
return table0, table1
return s
}

Просмотреть файл

@ -1,63 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file contains code common to gen.go and the package code.
// The mapping from rune to string (i.e. offset and length in the data string)
// is encoded as a two level table. The first level maps from contiguous rune
// ranges [runeOffset, runeOffset+runeLength) to entries. Entries are either
// direct (for repeated names such as "<CJK Ideograph>") or indirect (for runs
// of unique names such as "SPACE", "EXCLAMATION MARK", "QUOTATION MARK", ...).
//
// Each first level table element is 64 bits. The runeOffset (21 bits) and
// runeLength (16 bits) take the 37 high bits. The entry takes the 27 low bits,
// with directness encoded in the least significant bit.
//
// A direct entry encodes a dataOffset (18 bits) and dataLength (8 bits) in the
// data string. 18 bits is too short to encode the entire data string's length,
// but the data string's contents are arranged so that all of the few direct
// entries' offsets come before all of the many indirect entries' offsets.
//
// An indirect entry encodes a dataBase (10 bits) and a table1Offset (16 bits).
// The table1Offset is the start of a range in the second level table. The
// length of that range is the same as the runeLength.
//
// Each second level table element is 16 bits, an index into data, relative to
// a bias equal to (dataBase << dataBaseUnit). That (bias + index) is the
// (dataOffset + dataLength) in the data string. The dataOffset is implied by
// the previous table element (with the same implicit bias).
const (
	// Field widths, in bits, of a 64-bit first-level table element.
	bitsDirect       = 1  // set for direct entries, clear for indirect ones
	bitsDataLength   = 8  // direct entries: length of the name in data
	bitsDataOffset   = 18 // direct entries: offset of the name in data
	bitsTable1Offset = 16 // indirect entries: start of the run in the second-level table
	bitsDataBase     = 10 // indirect entries: bias, in units of 1<<dataBaseUnit
	bitsRuneLength   = 16 // number of runes covered by the element
	bitsRuneOffset   = 21 // first rune covered by the element

	// Bit positions of the fields, built up from the least significant bit.
	// Direct and indirect entries share the direct flag and the rune fields;
	// the bits in between are interpreted according to the flag.
	shiftDirect       = 0
	shiftDataLength   = shiftDirect + bitsDirect
	shiftDataOffset   = shiftDataLength + bitsDataLength
	shiftTable1Offset = shiftDirect + bitsDirect
	shiftDataBase     = shiftTable1Offset + bitsTable1Offset
	shiftRuneLength   = shiftDataOffset + bitsDataOffset
	shiftRuneOffset   = shiftRuneLength + bitsRuneLength

	// Masks that isolate a field once it has been shifted down to bit 0.
	maskDirect       = (1 << bitsDirect) - 1
	maskDataLength   = (1 << bitsDataLength) - 1
	maskDataOffset   = (1 << bitsDataOffset) - 1
	maskTable1Offset = (1 << bitsTable1Offset) - 1
	maskDataBase     = (1 << bitsDataBase) - 1
	maskRuneLength   = (1 << bitsRuneLength) - 1

	// dataBaseUnit is the shift applied to a dataBase value to obtain the
	// bias that second-level table elements are relative to.
	dataBaseUnit = 10
)

Просмотреть файл

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_bits.go
//go:generate go run gen.go
// Package runenames provides rune names from the Unicode Character Database.
// For example, the name for '\u0100' is "LATIN CAPITAL LETTER A WITH MACRON".
@ -16,33 +16,33 @@ import (
// Name returns the name for r.
func Name(r rune) string {
i := sort.Search(len(table0), func(j int) bool {
e := table0[j]
rOffset := rune(e >> shiftRuneOffset)
return r < rOffset
i := sort.Search(len(entries), func(j int) bool {
return entries[j].startRune() > r
})
if i == 0 {
return ""
}
e := entries[i-1]
e := table0[i-1]
rOffset := rune(e >> shiftRuneOffset)
rLength := rune(e>>shiftRuneLength) & maskRuneLength
if r >= rOffset+rLength {
offset := int(r - e.startRune())
if offset >= e.numRunes() {
return ""
}
if (e>>shiftDirect)&maskDirect != 0 {
o := int(e>>shiftDataOffset) & maskDataOffset
n := int(e>>shiftDataLength) & maskDataLength
return data[o : o+n]
if e.direct() {
o := e.index()
n := e.len()
return directData[o : o+n]
}
base := uint32(e>>shiftDataBase) & maskDataBase
base <<= dataBaseUnit
j := rune(e>>shiftTable1Offset) & maskTable1Offset
j += r - rOffset
d0 := base + uint32(table1[j-1]) // dataOffset
d1 := base + uint32(table1[j-0]) // dataOffset + dataLength
return data[d0:d1]
start := int(index[e.index()+offset])
end := int(index[e.index()+offset+1])
base1 := e.base() << 16
base2 := base1
if start > end {
base2 += 1 << 16
}
return singleData[start+base1 : end+base2]
}
func (e entry) len() int { return e.base() }

Просмотреть файл

@ -19,17 +19,7 @@ func TestName(t *testing.T) {
wants := make([]string, 1+unicode.MaxRune)
ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
r, s := p.Rune(0), p.String(ucd.Name)
if s == "" {
return
}
if s[0] == '<' {
const first = ", First>"
if i := strings.Index(s, first); i >= 0 {
s = s[:i] + ">"
}
}
wants[r] = s
wants[p.Rune(0)] = getName(p)
})
nErrors := 0
@ -44,3 +34,19 @@ func TestName(t *testing.T) {
}
}
}
// getName returns the Name field of the parser's current record. For a
// bracketed name that contains the ", First>" marker, everything from the
// marker onwards is replaced by ">", so that "<X, First>" becomes "<X>".
// All other names, including the empty one, are returned unchanged.
//
// Copied from gen.go.
func getName(p *ucd.Parser) string {
	name := p.String(ucd.Name)
	if !strings.HasPrefix(name, "<") {
		// Empty names and ordinary, unbracketed names need no rewriting.
		return name
	}
	marker := strings.Index(name, ", First>")
	if marker < 0 {
		return name
	}
	return name[:marker] + ">"
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу