unicode/runenames: support for multiple unicode versions

Also rewrote the representation, which did not work for Unicode 10.

Change-Id: I39d9907908c67d07f8bbc153942ab0d8ab02ab6e
Reviewed-on: https://go-review.googlesource.com/96736
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ross Light <light@google.com>
2018-02-23 18:02:25 +01:00 · 2018-02-23 18:02:25 +01:00 · ffa9e5ed68
--- a/gen.go
+++ b/gen.go
@ -120,6 +120,7 @@ pkg unicode, var <new script or property> *RangeTable
 		mib        = generate("./encoding/internal/identifier", unicode)
 		number     = generate("./internal/number", unicode, cldr, language, internal)
 		cldrtree   = generate("./internal/cldrtree", language, internal)
+		_          = generate("./unicode/runenames", unicode)
 		_          = generate("./encoding/htmlindex", unicode, language, mib)
 		_          = generate("./encoding/ianaindex", unicode, language, mib)
 		_          = generate("./secure/precis", unicode, norm, rangetable, cases, width, bidi)
--- a/unicode/runenames/bits.go
+++ b/unicode/runenames/bits.go
@ -1,59 +0,0 @@
-// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
-
-package runenames
-
-// This file contains code common to gen.go and the package code.
-
-// The mapping from rune to string (i.e. offset and length in the data string)
-// is encoded as a two level table. The first level maps from contiguous rune
-// ranges [runeOffset, runeOffset+runeLength) to entries. Entries are either
-// direct (for repeated names such as "<CJK Ideograph>") or indirect (for runs
-// of unique names such as "SPACE", "EXCLAMATION MARK", "QUOTATION MARK", ...).
-//
-// Each first level table element is 64 bits. The runeOffset (21 bits) and
-// runeLength (16 bits) take the 37 high bits. The entry takes the 27 low bits,
-// with directness encoded in the least significant bit.
-//
-// A direct entry encodes a dataOffset (18 bits) and dataLength (8 bits) in the
-// data string. 18 bits is too short to encode the entire data string's length,
-// but the data string's contents are arranged so that all of the few direct
-// entries' offsets come before all of the many indirect entries' offsets.
-//
-// An indirect entry encodes a dataBase (10 bits) and a table1Offset (16 bits).
-// The table1Offset is the start of a range in the second level table. The
-// length of that range is the same as the runeLength.
-//
-// Each second level table element is 16 bits, an index into data, relative to
-// a bias equal to (dataBase << dataBaseUnit). That (bias + index) is the
-// (dataOffset + dataLength) in the data string. The dataOffset is implied by
-// the previous table element (with the same implicit bias).
-
-const (
-	bitsRuneOffset = 21
-	bitsRuneLength = 16
-	bitsDataOffset = 18
-	bitsDataLength = 8
-	bitsDirect     = 1
-
-	bitsDataBase     = 10
-	bitsTable1Offset = 16
-
-	shiftRuneOffset = 0 + bitsDirect + bitsDataLength + bitsDataOffset + bitsRuneLength
-	shiftRuneLength = 0 + bitsDirect + bitsDataLength + bitsDataOffset
-	shiftDataOffset = 0 + bitsDirect + bitsDataLength
-	shiftDataLength = 0 + bitsDirect
-	shiftDirect     = 0
-
-	shiftDataBase     = 0 + bitsDirect + bitsTable1Offset
-	shiftTable1Offset = 0 + bitsDirect
-
-	maskRuneLength = 1<<bitsRuneLength - 1
-	maskDataOffset = 1<<bitsDataOffset - 1
-	maskDataLength = 1<<bitsDataLength - 1
-	maskDirect     = 1<<bitsDirect - 1
-
-	maskDataBase     = 1<<bitsDataBase - 1
-	maskTable1Offset = 1<<bitsTable1Offset - 1
-
-	dataBaseUnit = 10
-)
--- a/unicode/runenames/example_test.go
+++ b/unicode/runenames/example_test.go
@ -41,7 +41,6 @@ func Example() {
 		'\U00004dc0',

 		'\U00009fd5',
-		'\U00009fd6',
 		'\U00009fff',
 		'\U0000a000',
 		0xdc00, // '\U0000dc00' (Low Surrogate) is an invalid Go literal.
@ -95,7 +94,6 @@ func Example() {
 	// 00003402 "<CJK Ideograph Extension A>"
 	// 00004dc0 "HEXAGRAM FOR THE CREATIVE HEAVEN"
 	// 00009fd5 "<CJK Ideograph>"
-	// 00009fd6 ""
 	// 00009fff ""
 	// 0000a000 "YI SYLLABLE IT"
 	// 0000dc00 "<Low Surrogate>"
--- a/unicode/runenames/gen.go
+++ b/unicode/runenames/gen.go
@ -7,189 +7,156 @@
 package main

 import (
+	"bytes"
 	"log"
+	"sort"
 	"strings"
-	"unicode"

 	"golang.org/x/text/internal/gen"
+	"golang.org/x/text/internal/gen/bitfield"
 	"golang.org/x/text/internal/ucd"
 )

-// snippet is a slice of data; data is the concatenation of all of the names.
-type snippet struct {
-	offset int
-	length int
-	s      string
-}
-
-func makeTable0EntryDirect(rOffset, rLength, dOffset, dLength int) uint64 {
-	if rOffset >= 1<<bitsRuneOffset {
-		log.Fatalf("makeTable0EntryDirect: rOffset %d is too large", rOffset)
-	}
-	if rLength >= 1<<bitsRuneLength {
-		log.Fatalf("makeTable0EntryDirect: rLength %d is too large", rLength)
-	}
-	if dOffset >= 1<<bitsDataOffset {
-		log.Fatalf("makeTable0EntryDirect: dOffset %d is too large", dOffset)
-	}
-	if dLength >= 1<<bitsRuneLength {
-		log.Fatalf("makeTable0EntryDirect: dLength %d is too large", dLength)
-	}
-	return uint64(rOffset)<<shiftRuneOffset |
-		uint64(rLength)<<shiftRuneLength |
-		uint64(dOffset)<<shiftDataOffset |
-		uint64(dLength)<<shiftDataLength |
-		1 // Direct bit.
-}
-
-func makeTable0EntryIndirect(rOffset, rLength, dBase, t1Offset int) uint64 {
-	if rOffset >= 1<<bitsRuneOffset {
-		log.Fatalf("makeTable0EntryIndirect: rOffset %d is too large", rOffset)
-	}
-	if rLength >= 1<<bitsRuneLength {
-		log.Fatalf("makeTable0EntryIndirect: rLength %d is too large", rLength)
-	}
-	if dBase >= 1<<bitsDataBase {
-		log.Fatalf("makeTable0EntryIndirect: dBase %d is too large", dBase)
-	}
-	if t1Offset >= 1<<bitsTable1Offset {
-		log.Fatalf("makeTable0EntryIndirect: t1Offset %d is too large", t1Offset)
-	}
-	return uint64(rOffset)<<shiftRuneOffset |
-		uint64(rLength)<<shiftRuneLength |
-		uint64(dBase)<<shiftDataBase |
-		uint64(t1Offset)<<shiftTable1Offset |
-		0 // Direct bit.
-}
-
-func makeTable1Entry(x int) uint16 {
-	if x < 0 || 0xffff < x {
-		log.Fatalf("makeTable1Entry: entry %d is out of range", x)
-	}
-	return uint16(x)
-}
-
 var (
-	data     []byte
-	snippets = make([]snippet, 1+unicode.MaxRune)
+	// computed by computeDirectOffsets
+	directOffsets = map[string]int{}
+	directData    bytes.Buffer
+
+	// computed by computeEntries
+	entries    []entry
+	singleData bytes.Buffer
+	index      []uint16
 )

+type entry struct {
+	start    rune `bitfield:"21,startRune"`
+	numRunes int  `bitfield:"16"`
+	end      rune
+	index    int  `bitfield:"16"`
+	base     int  `bitfield:"6"`
+	direct   bool `bitfield:""`
+	name     string
+}
+
 func main() {
 	gen.Init()

-	names, counts := parse()
-	appendRepeatNames(names, counts)
-	appendUniqueNames(names, counts)
-
-	table0, table1 := makeTables()
-
-	gen.Repackage("gen_bits.go", "bits.go", "runenames")
-
 	w := gen.NewCodeWriter()
-	w.WriteVar("table0", table0)
-	w.WriteVar("table1", table1)
-	w.WriteConst("data", string(data))
-	w.WriteGoFile("tables.go", "runenames")
+	defer w.WriteVersionedGoFile("tables.go", "runenames")
+
+	gen.WriteUnicodeVersion(w)
+
+	computeDirectOffsets()
+	computeEntries()
+
+	if err := bitfield.Gen(w, entry{}, nil); err != nil {
+		log.Fatal(err)
+	}
+
+	type entry uint64 // trick the generation code to use the entry type
+	packed := []entry{}
+	for _, e := range entries {
+		e.numRunes = int(e.end - e.start + 1)
+		v, err := bitfield.Pack(e, nil)
+		if err != nil {
+			log.Fatal(err)
+		}
+		packed = append(packed, entry(v))
+	}
+
+	index = append(index, uint16(singleData.Len()))
+
+	w.WriteVar("entries", packed)
+	w.WriteVar("index", index)
+	w.WriteConst("directData", directData.String())
+	w.WriteConst("singleData", singleData.String())
 }

-func parse() (names []string, counts map[string]int) {
-	names = make([]string, 1+unicode.MaxRune)
-	counts = map[string]int{}
-	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
-		r, s := p.Rune(0), p.String(ucd.Name)
-		if s == "" {
-			return
+func computeDirectOffsets() {
+	counts := map[string]int{}
+
+	p := ucd.New(gen.OpenUCDFile("UnicodeData.txt"), ucd.KeepRanges)
+	for p.Next() {
+		start, end := p.Range(0)
+		counts[getName(p)] += int(end-start) + 1
+	}
+
+	direct := []string{}
+	for k, v := range counts {
+		if v > 1 {
+			direct = append(direct, k)
 		}
-		if s[0] == '<' {
-			const first = ", First>"
-			if i := strings.Index(s, first); i >= 0 {
-				s = s[:i] + ">"
+	}
+	sort.Strings(direct)
+
+	for _, s := range direct {
+		directOffsets[s] = directData.Len()
+		directData.WriteString(s)
+	}
+}
+
+func computeEntries() {
+	p := ucd.New(gen.OpenUCDFile("UnicodeData.txt"), ucd.KeepRanges)
+	for p.Next() {
+		start, end := p.Range(0)
+
+		last := entry{}
+		if len(entries) > 0 {
+			last = entries[len(entries)-1]
+		}
+
+		name := getName(p)
+		if index, ok := directOffsets[name]; ok {
+			if last.name == name && last.end+1 == start {
+				entries[len(entries)-1].end = end
+				continue
 			}
-		}
-		names[r] = s
-		counts[s]++
-	})
-	return names, counts
-}
-
-func appendRepeatNames(names []string, counts map[string]int) {
-	alreadySeen := map[string]snippet{}
-	for r, s := range names {
-		if s == "" || counts[s] == 1 {
-			continue
-		}
-		if s[0] != '<' {
-			log.Fatalf("Repeated name %q does not start with a '<'", s)
-		}
-
-		if z, ok := alreadySeen[s]; ok {
-			snippets[r] = z
+			entries = append(entries, entry{
+				start:  start,
+				end:    end,
+				index:  index,
+				base:   len(name),
+				direct: true,
+				name:   name,
+			})
 			continue
 		}

-		z := snippet{
-			offset: len(data),
-			length: len(s),
-			s:      s,
+		if start != end {
+			log.Fatalf("Expected start == end, found %x != %x", start, end)
 		}
-		data = append(data, s...)
-		snippets[r] = z
-		alreadySeen[s] = z
+
+		offset := singleData.Len()
+		base := offset >> 16
+		index = append(index, uint16(offset))
+		singleData.WriteString(name)
+
+		if last.base == base && last.end+1 == start {
+			entries[len(entries)-1].end = start
+			continue
+		}
+
+		entries = append(entries, entry{
+			start: start,
+			end:   end,
+			index: len(index) - 1,
+			base:  base,
+			name:  name,
+		})
 	}
 }

-func appendUniqueNames(names []string, counts map[string]int) {
-	for r, s := range names {
-		if s == "" || counts[s] != 1 {
-			continue
-		}
-		if s[0] == '<' {
-			log.Fatalf("Unique name %q starts with a '<'", s)
-		}
-
-		z := snippet{
-			offset: len(data),
-			length: len(s),
-			s:      s,
-		}
-		data = append(data, s...)
-		snippets[r] = z
+func getName(p *ucd.Parser) string {
+	s := p.String(ucd.Name)
+	if s == "" {
+		return ""
 	}
-}
-
-func makeTables() (table0 []uint64, table1 []uint16) {
-	for i := 0; i < len(snippets); {
-		zi := snippets[i]
-		if zi == (snippet{}) {
-			i++
-			continue
+	if s[0] == '<' {
+		const first = ", First>"
+		if i := strings.Index(s, first); i >= 0 {
+			s = s[:i] + ">"
 		}

-		// Look for repeat names. If we have one, we only need a table0 entry.
-		j := i + 1
-		for ; j < len(snippets) && zi == snippets[j]; j++ {
-		}
-		if j > i+1 {
-			table0 = append(table0, makeTable0EntryDirect(i, j-i, zi.offset, zi.length))
-			i = j
-			continue
-		}
-
-		// Otherwise, we have a run of unique names. We need one table0 entry
-		// and two or more table1 entries.
-		base := zi.offset &^ (1<<dataBaseUnit - 1)
-		t1Offset := len(table1) + 1
-		table1 = append(table1, makeTable1Entry(zi.offset-base))
-		table1 = append(table1, makeTable1Entry(zi.offset+zi.length-base))
-		for ; j < len(snippets) && snippets[j] != (snippet{}); j++ {
-			zj := snippets[j]
-			if data[zj.offset] == '<' {
-				break
-			}
-			table1 = append(table1, makeTable1Entry(zj.offset+zj.length-base))
-		}
-		table0 = append(table0, makeTable0EntryIndirect(i, j-i, base>>dataBaseUnit, t1Offset))
-		i = j
 	}
-	return table0, table1
+	return s
 }
--- a/unicode/runenames/gen_bits.go
+++ b/unicode/runenames/gen_bits.go
@ -1,63 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ignore
-
-package main
-
-// This file contains code common to gen.go and the package code.
-
-// The mapping from rune to string (i.e. offset and length in the data string)
-// is encoded as a two level table. The first level maps from contiguous rune
-// ranges [runeOffset, runeOffset+runeLength) to entries. Entries are either
-// direct (for repeated names such as "<CJK Ideograph>") or indirect (for runs
-// of unique names such as "SPACE", "EXCLAMATION MARK", "QUOTATION MARK", ...).
-//
-// Each first level table element is 64 bits. The runeOffset (21 bits) and
-// runeLength (16 bits) take the 37 high bits. The entry takes the 27 low bits,
-// with directness encoded in the least significant bit.
-//
-// A direct entry encodes a dataOffset (18 bits) and dataLength (8 bits) in the
-// data string. 18 bits is too short to encode the entire data string's length,
-// but the data string's contents are arranged so that all of the few direct
-// entries' offsets come before all of the many indirect entries' offsets.
-//
-// An indirect entry encodes a dataBase (10 bits) and a table1Offset (16 bits).
-// The table1Offset is the start of a range in the second level table. The
-// length of that range is the same as the runeLength.
-//
-// Each second level table element is 16 bits, an index into data, relative to
-// a bias equal to (dataBase << dataBaseUnit). That (bias + index) is the
-// (dataOffset + dataLength) in the data string. The dataOffset is implied by
-// the previous table element (with the same implicit bias).
-
-const (
-	bitsRuneOffset = 21
-	bitsRuneLength = 16
-	bitsDataOffset = 18
-	bitsDataLength = 8
-	bitsDirect     = 1
-
-	bitsDataBase     = 10
-	bitsTable1Offset = 16
-
-	shiftRuneOffset = 0 + bitsDirect + bitsDataLength + bitsDataOffset + bitsRuneLength
-	shiftRuneLength = 0 + bitsDirect + bitsDataLength + bitsDataOffset
-	shiftDataOffset = 0 + bitsDirect + bitsDataLength
-	shiftDataLength = 0 + bitsDirect
-	shiftDirect     = 0
-
-	shiftDataBase     = 0 + bitsDirect + bitsTable1Offset
-	shiftTable1Offset = 0 + bitsDirect
-
-	maskRuneLength = 1<<bitsRuneLength - 1
-	maskDataOffset = 1<<bitsDataOffset - 1
-	maskDataLength = 1<<bitsDataLength - 1
-	maskDirect     = 1<<bitsDirect - 1
-
-	maskDataBase     = 1<<bitsDataBase - 1
-	maskTable1Offset = 1<<bitsTable1Offset - 1
-
-	dataBaseUnit = 10
-)
--- a/unicode/runenames/runenames.go
+++ b/unicode/runenames/runenames.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:generate go run gen.go gen_bits.go
+//go:generate go run gen.go

 // Package runenames provides rune names from the Unicode Character Database.
 // For example, the name for '\u0100' is "LATIN CAPITAL LETTER A WITH MACRON".
@ -16,33 +16,33 @@ import (

 // Name returns the name for r.
 func Name(r rune) string {
-	i := sort.Search(len(table0), func(j int) bool {
-		e := table0[j]
-		rOffset := rune(e >> shiftRuneOffset)
-		return r < rOffset
+	i := sort.Search(len(entries), func(j int) bool {
+		return entries[j].startRune() > r
 	})
 	if i == 0 {
 		return ""
 	}
+	e := entries[i-1]

-	e := table0[i-1]
-	rOffset := rune(e >> shiftRuneOffset)
-	rLength := rune(e>>shiftRuneLength) & maskRuneLength
-	if r >= rOffset+rLength {
+	offset := int(r - e.startRune())
+	if offset >= e.numRunes() {
 		return ""
 	}

-	if (e>>shiftDirect)&maskDirect != 0 {
-		o := int(e>>shiftDataOffset) & maskDataOffset
-		n := int(e>>shiftDataLength) & maskDataLength
-		return data[o : o+n]
+	if e.direct() {
+		o := e.index()
+		n := e.len()
+		return directData[o : o+n]
 	}

-	base := uint32(e>>shiftDataBase) & maskDataBase
-	base <<= dataBaseUnit
-	j := rune(e>>shiftTable1Offset) & maskTable1Offset
-	j += r - rOffset
-	d0 := base + uint32(table1[j-1]) // dataOffset
-	d1 := base + uint32(table1[j-0]) // dataOffset + dataLength
-	return data[d0:d1]
+	start := int(index[e.index()+offset])
+	end := int(index[e.index()+offset+1])
+	base1 := e.base() << 16
+	base2 := base1
+	if start > end {
+		base2 += 1 << 16
+	}
+	return singleData[start+base1 : end+base2]
 }
+
+func (e entry) len() int { return e.base() }
--- a/unicode/runenames/runenames_test.go
+++ b/unicode/runenames/runenames_test.go
@ -19,17 +19,7 @@ func TestName(t *testing.T) {

 	wants := make([]string, 1+unicode.MaxRune)
 	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
-		r, s := p.Rune(0), p.String(ucd.Name)
-		if s == "" {
-			return
-		}
-		if s[0] == '<' {
-			const first = ", First>"
-			if i := strings.Index(s, first); i >= 0 {
-				s = s[:i] + ">"
-			}
-		}
-		wants[r] = s
+		wants[p.Rune(0)] = getName(p)
 	})

 	nErrors := 0
@ -44,3 +34,19 @@ func TestName(t *testing.T) {
 		}
 	}
 }
+
+// Copied from gen.go.
+func getName(p *ucd.Parser) string {
+	s := p.String(ucd.Name)
+	if s == "" {
+		return ""
+	}
+	if s[0] == '<' {
+		const first = ", First>"
+		if i := strings.Index(s, first); i >= 0 {
+			s = s[:i] + ">"
+		}
+
+	}
+	return s
+}
--- a/unicode/runenames/tables.go
+++ b/unicode/runenames/tables.go
--- a/unicode/runenames/tables10.0.0.go
+++ b/unicode/runenames/tables10.0.0.go
--- a/unicode/runenames/tables9.0.0.go
+++ b/unicode/runenames/tables9.0.0.go