go.text/encoding: shrink the Japanese and Korean encoding data tables.

The encoding.test binary size generated by "go test -c" drops by 132320
bytes.

Some benchmarks get better, others get worse (but that might just be
noise, as Big5 and GBK only have identifiers renamed, with no other code
or data changes).

benchmark                    old MB/s     new MB/s  speedup
BenchmarkBig5Encoder           170.12       171.82    1.01x
BenchmarkEUCJPEncoder          160.94       156.07    0.97x
BenchmarkEUCKREncoder          166.75       171.66    1.03x
BenchmarkGBKEncoder            180.07       173.59    0.96x
BenchmarkShiftJISEncoder       137.95       143.70    1.04x

R=r
CC=golang-dev
https://golang.org/cl/13321047
This commit is contained in:
Nigel Tao 2013-09-18 13:42:53 +10:00
Родитель d94036e178
Коммит a60de809e6
13 изменённых файлов: 30610 добавлений и 30386 удалений

Просмотреть файл

@ -12,7 +12,7 @@ import (
"code.google.com/p/go.text/transform"
)
// EUCJP is the EUC-JP (Extended Unix Code Japanese) encoding.
// EUCJP is the EUC-JP encoding.
var EUCJP encoding.Encoding = eucJP{}
type eucJP struct{}
@ -115,7 +115,6 @@ type eucJPEncoder struct{}
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
@ -132,63 +131,78 @@ loop:
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break loop
break
}
}
}
switch {
case r < utf8.RuneSelf:
// No-op.
case 0xff61 <= r && r <= 0xff9f:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2or3
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2or3
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2or3
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2or3
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2or3
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
goto write2
}
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2or3
}
}
dst[nDst+0] = 0x8e
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
nDst += 2
continue loop
case 0xffff < r:
r = encoding.ASCIISub
default:
e := jisEncode[uint16(r)]
if e == 0 {
r = encoding.ASCIISub
break
}
switch e >> tableShift {
case jis0208:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
dst[nDst+0] = 0xa1 + uint8(e>>codeShift)&codeMask
dst[nDst+1] = 0xa1 + uint8(e)&codeMask
nDst += 2
case jis0212:
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break loop
}
dst[nDst+0] = 0x8f
dst[nDst+1] = 0xa1 + uint8(e>>codeShift)&codeMask
dst[nDst+2] = 0xa1 + uint8(e)&codeMask
nDst += 3
}
continue loop
}
// r is encoded as a single byte.
if nDst >= len(dst) {
err = transform.ErrShortDst
break loop
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2or3:
if r>>tableShift == jis0208 {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
} else {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = 0x8f
nDst++
}
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
nDst += 2
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x8e
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
nDst += 2
continue
}
return nDst, nSrc, err
}

Просмотреть файл

@ -18,6 +18,7 @@ import (
"fmt"
"log"
"net/http"
"sort"
"strings"
)
@ -84,6 +85,31 @@ func main() {
fmt.Printf("}\n\n")
}
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v.table == -1 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const (\n")
fmt.Printf("\tjis0208 = 1\n")
fmt.Printf("\tjis0212 = 2\n")
@ -92,18 +118,43 @@ func main() {
fmt.Printf("\ttableShift = 14\n")
fmt.Printf(")\n\n")
fmt.Printf("// jisEncode is the encoding table from Unicode to JIS code.\n")
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("//\n")
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
fmt.Printf("var jisEncode = [65536]uint16{\n")
for i, v := range reverse {
if v.table == -1 {
continue
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x.table == -1 {
continue
}
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
}
fmt.Printf("\t0x%04X: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
i, tables[v.table].name, v.jisCode/94, v.jisCode%94)
fmt.Printf("}\n\n")
}
fmt.Printf("}\n\n")
}
// interval describes the half-open range [low, high): low is included,
// high is excluded.
type interval struct {
	low, high int
}

// len reports how many integers the interval spans.
func (iv interval) len() int {
	span := iv.high - iv.low
	return span
}

// byDecreasingLength implements sort.Interface, ordering intervals so that
// the longest interval comes first.
type byDecreasingLength []interval

// Len returns the number of intervals.
func (s byDecreasingLength) Len() int {
	return len(s)
}

// Less reports whether the interval at index x is strictly longer than the
// interval at index y.
func (s byDecreasingLength) Less(x, y int) bool {
	return s[y].high-s[y].low < s[x].high-s[x].low
}

// Swap exchanges the intervals at indexes x and y.
func (s byDecreasingLength) Swap(x, y int) {
	tmp := s[x]
	s[x] = s[y]
	s[y] = tmp
}

Просмотреть файл

@ -12,8 +12,8 @@ import (
"code.google.com/p/go.text/transform"
)
// ShiftJIS is the Shift JIS (Japanese Industrial Standards) encoding, also
// known as Code Page 932 and Windows-31J.
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
// Windows-31J.
var ShiftJIS encoding.Encoding = shiftJIS{}
type shiftJIS struct{}
@ -126,55 +126,68 @@ loop:
break loop
}
}
}
switch {
case r < utf8.RuneSelf:
// r is an ASCII rune.
case 0xff61 <= r && r <= 0xff9f:
r -= 0xff61 - 0xa1
case 0xffff < r:
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto write2
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
r -= 0xff61 - 0xa1
goto write1
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto write2
}
}
r = encoding.ASCIISub
default:
e := jisEncode[uint16(r)]
if e == 0 {
r = encoding.ASCIISub
break
}
if e>>tableShift != jis0208 {
r = encoding.ASCIISub
break
}
j1 := uint8(e>>codeShift) & codeMask
j2 := uint8(e) & codeMask
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
if j1 <= 61 {
dst[nDst+0] = 129 + j1/2
} else {
dst[nDst+0] = 193 + j1/2
}
if j1&1 == 0 {
dst[nDst+1] = j2 + j2/63 + 64
} else {
dst[nDst+1] = j2 + 159
}
nDst += 2
continue loop
}
// r is encoded as a single byte.
write1:
if nDst >= len(dst) {
err = transform.ErrShortDst
break loop
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
j1 := uint8(r>>codeShift) & codeMask
j2 := uint8(r) & codeMask
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
if j1 <= 61 {
dst[nDst+0] = 129 + j1/2
} else {
dst[nDst+0] = 193 + j1/2
}
if j1&1 == 0 {
dst[nDst+1] = j2 + j2/63 + 64
} else {
dst[nDst+1] = j2 + 159
}
nDst += 2
continue
}
return nDst, nSrc, err
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -66,8 +66,8 @@ loop:
err = errInvalidEUCKR
break loop
}
if int(r) < len(eucKRDecode) {
r = rune(eucKRDecode[r])
if int(r) < len(decode) {
r = rune(decode[r])
if r == 0 {
r = encoding.ASCIISub
}
@ -97,7 +97,6 @@ type eucKREncoder struct{}
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
@ -114,41 +113,60 @@ loop:
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break loop
break
}
}
}
switch {
case r < utf8.RuneSelf:
// No-op.
case 0xffff < r:
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
}
r = encoding.ASCIISub
default:
e := eucKREncode[uint16(r)]
if e == 0 {
r = encoding.ASCIISub
break
}
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
dst[nDst+0] = uint8(e >> 8)
dst[nDst+1] = uint8(e)
nDst += 2
continue loop
}
// r is encoded as a single byte.
if nDst >= len(dst) {
err = transform.ErrShortDst
break loop
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}

Просмотреть файл

@ -14,6 +14,7 @@ import (
"fmt"
"log"
"net/http"
"sort"
"strings"
)
@ -70,9 +71,9 @@ func main() {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// eucKRDecode is the decoding table from EUC-KR code to Unicode.\n")
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
fmt.Printf("var eucKRDecode = [...]uint16{\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
@ -80,12 +81,62 @@ func main() {
}
fmt.Printf("}\n\n")
fmt.Printf("// eucKREncode is the encoding table from Unicode to EUC-KR code.\n")
fmt.Printf("var eucKREncode = [65536]uint16{\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
fmt.Printf("}\n\n")
}
// interval describes the half-open range [low, high): low is included,
// high is excluded.
type interval struct {
	low, high int
}

// len reports how many integers the interval spans.
func (iv interval) len() int {
	span := iv.high - iv.low
	return span
}

// byDecreasingLength implements sort.Interface, ordering intervals so that
// the longest interval comes first.
type byDecreasingLength []interval

// Len returns the number of intervals.
func (s byDecreasingLength) Len() int {
	return len(s)
}

// Less reports whether the interval at index x is strictly longer than the
// interval at index y.
func (s byDecreasingLength) Less(x, y int) bool {
	return s[y].high-s[y].low < s[x].high-s[x].low
}

// Swap exchanges the intervals at indexes x and y.
func (s byDecreasingLength) Swap(x, y int) {
	tmp := s[x]
	s[x] = s[y]
	s[y] = tmp
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -64,8 +64,8 @@ loop:
break loop
}
r, size = encoding.ASCIISub, 2
if i := int(c0-0x81)*190 + int(c1); i < len(gbkDecode) {
r = rune(gbkDecode[i])
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
r = rune(decode[i])
if r == 0 {
r = encoding.ASCIISub
}
@ -113,11 +113,11 @@ func (gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err er
}
switch {
case gbkEncode0Low <= r && r < gbkEncode0High:
if r = rune(gbkEncode0[r-gbkEncode0Low]); r != 0 {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case gbkEncode1Low <= r && r < gbkEncode1High:
case encode1Low <= r && r < encode1High:
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
@ -125,19 +125,19 @@ func (gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err er
r = 0x80
goto write1
}
if r = rune(gbkEncode1[r-gbkEncode1Low]); r != 0 {
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case gbkEncode2Low <= r && r < gbkEncode2High:
if r = rune(gbkEncode2[r-gbkEncode2Low]); r != 0 {
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case gbkEncode3Low <= r && r < gbkEncode3High:
if r = rune(gbkEncode3[r-gbkEncode3Low]); r != 0 {
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case gbkEncode4Low <= r && r < gbkEncode4High:
if r = rune(gbkEncode4[r-gbkEncode4Low]); r != 0 {
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
}

Просмотреть файл

@ -58,9 +58,9 @@ func main() {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// gbkDecode is the decoding table from GBK code to Unicode.\n")
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
fmt.Printf("var gbkDecode = [...]uint16{\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
@ -69,7 +69,7 @@ func main() {
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate gbkEncode table.
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
@ -93,16 +93,16 @@ func main() {
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("// gbkEncodeX are the encoding tables from Unicode to GBK code,\n")
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// gbkEncode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const gbkEncode%dLow, gbkEncode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var gbkEncode%d = [...]uint16{\n", i)
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {

Просмотреть файл

@ -3,9 +3,9 @@
// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.
package simplifiedchinese
// gbkDecode is the decoding table from GBK code to Unicode.
// decode is the decoding table from GBK code to Unicode.
// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt
var gbkDecode = [...]uint16{
var decode = [...]uint16{
0: 0x4E02,
1: 0x4E04,
2: 0x4E05,
@ -21881,17 +21881,17 @@ var gbkDecode = [...]uint16{
23844: 0x4DAE,
}
// gbkEncodeX are the encoding tables from Unicode to GBK code,
// encodeX are the encoding tables from Unicode to GBK code,
// sorted by decreasing length.
// gbkEncode0: 28965 entries for runes in [11905, 40870).
// gbkEncode1: 1587 entries for runes in [ 8208, 9795).
// gbkEncode2: 942 entries for runes in [ 164, 1106).
// gbkEncode3: 438 entries for runes in [65072, 65510).
// gbkEncode4: 254 entries for runes in [63788, 64042).
// encode0: 28965 entries for runes in [11905, 40870).
// encode1: 1587 entries for runes in [ 8208, 9795).
// encode2: 942 entries for runes in [ 164, 1106).
// encode3: 438 entries for runes in [65072, 65510).
// encode4: 254 entries for runes in [63788, 64042).
const gbkEncode0Low, gbkEncode0High = 11905, 40870
const encode0Low, encode0High = 11905, 40870
var gbkEncode0 = [...]uint16{
var encode0 = [...]uint16{
11905 - 11905: 0xFE50,
11908 - 11905: 0xFE54,
11912 - 11905: 0xFE57,
@ -43145,9 +43145,9 @@ var gbkEncode0 = [...]uint16{
40869 - 11905: 0xFD9B,
}
const gbkEncode1Low, gbkEncode1High = 8208, 9795
const encode1Low, encode1High = 8208, 9795
var gbkEncode1 = [...]uint16{
var encode1 = [...]uint16{
8208 - 8208: 0xA95C,
8211 - 8208: 0xA843,
8212 - 8208: 0xA1AA,
@ -43440,9 +43440,9 @@ var gbkEncode1 = [...]uint16{
9794 - 8208: 0xA1E1,
}
const gbkEncode2Low, gbkEncode2High = 164, 1106
const encode2Low, encode2High = 164, 1106
var gbkEncode2 = [...]uint16{
var encode2 = [...]uint16{
164 - 164: 0xA1E8,
167 - 164: 0xA1EC,
168 - 164: 0xA1A7,
@ -43603,9 +43603,9 @@ var gbkEncode2 = [...]uint16{
1105 - 164: 0xA7D7,
}
const gbkEncode3Low, gbkEncode3High = 65072, 65510
const encode3Low, encode3High = 65072, 65510
var gbkEncode3 = [...]uint16{
var encode3 = [...]uint16{
65072 - 65072: 0xA955,
65073 - 65072: 0xA6F2,
65075 - 65072: 0xA6F4,
@ -43760,9 +43760,9 @@ var gbkEncode3 = [...]uint16{
65509 - 65072: 0xA3A4,
}
const gbkEncode4Low, gbkEncode4High = 63788, 64042
const encode4Low, encode4High = 63788, 64042
var gbkEncode4 = [...]uint16{
var encode4 = [...]uint16{
63788 - 63788: 0xFD9C,
63865 - 63788: 0xFD9D,
63893 - 63788: 0xFD9E,

Просмотреть файл

@ -57,7 +57,7 @@ loop:
break loop
}
r, size = encoding.ASCIISub, 2
if i := int(c0-0x81)*157 + int(c1); i < len(big5Decode) {
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
if 1133 <= i && i < 1167 {
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
@ -76,7 +76,7 @@ loop:
goto writeStr
}
}
r = rune(big5Decode[i])
r = rune(decode[i])
if r == 0 {
r = encoding.ASCIISub
}
@ -135,36 +135,36 @@ func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err e
if r >= utf8.RuneSelf {
switch {
case big5Encode0Low <= r && r < big5Encode0High:
if r = rune(big5Encode0[r-big5Encode0Low]); r != 0 {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case big5Encode1Low <= r && r < big5Encode1High:
if r = rune(big5Encode1[r-big5Encode1Low]); r != 0 {
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case big5Encode2Low <= r && r < big5Encode2High:
if r = rune(big5Encode2[r-big5Encode2Low]); r != 0 {
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case big5Encode3Low <= r && r < big5Encode3High:
if r = rune(big5Encode3[r-big5Encode3Low]); r != 0 {
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case big5Encode4Low <= r && r < big5Encode4High:
if r = rune(big5Encode4[r-big5Encode4Low]); r != 0 {
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case big5Encode5Low <= r && r < big5Encode5High:
if r = rune(big5Encode5[r-big5Encode5Low]); r != 0 {
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case big5Encode6Low <= r && r < big5Encode6High:
if r = rune(big5Encode6[r-big5Encode6Low]); r != 0 {
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
case big5Encode7Low <= r && r < big5Encode7High:
if r = rune(big5Encode7[r-big5Encode7Low]); r != 0 {
case encode7Low <= r && r < encode7High:
if r = rune(encode7[r-encode7Low]); r != 0 {
goto write2
}
}

Просмотреть файл

@ -68,9 +68,9 @@ func main() {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// big5Decode is the decoding table from Big5 code to Unicode.\n")
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
fmt.Printf("var big5Decode = [...]uint32{\n")
fmt.Printf("var decode = [...]uint32{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%08X,\n", i, v)
@ -79,7 +79,7 @@ func main() {
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate big5Encode table.
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
@ -103,16 +103,16 @@ func main() {
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("// big5EncodeX are the encoding tables from Unicode to Big5 code,\n")
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// big5Encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const big5Encode%dLow, big5Encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var big5Encode%d = [...]uint16{\n", i)
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {

Просмотреть файл

@ -3,9 +3,9 @@
// Package traditionalchinese provides Traditional Chinese encodings such as Big5.
package traditionalchinese
// big5Decode is the decoding table from Big5 code to Unicode.
// decode is the decoding table from Big5 code to Unicode.
// It is defined at http://encoding.spec.whatwg.org/index-big5.txt
var big5Decode = [...]uint32{
var decode = [...]uint32{
942: 0x000043F0,
943: 0x00004C32,
944: 0x00004603,
@ -18598,20 +18598,20 @@ var big5Decode = [...]uint32{
19781: 0x000079D4,
}
// big5EncodeX are the encoding tables from Unicode to Big5 code,
// encodeX are the encoding tables from Unicode to Big5 code,
// sorted by decreasing length.
// big5Encode0: 42633 entries for runes in [131105, 173738).
// big5Encode1: 29004 entries for runes in [ 11904, 40908).
// big5Encode2: 2176 entries for runes in [ 7870, 10046).
// big5Encode3: 939 entries for runes in [ 167, 1106).
// big5Encode4: 446 entries for runes in [ 65072, 65518).
// big5Encode5: 432 entries for runes in [194597, 195029).
// big5Encode6: 263 entries for runes in [ 63751, 64014).
// big5Encode7: 1 entries for runes in [175615, 175616).
// encode0: 42633 entries for runes in [131105, 173738).
// encode1: 29004 entries for runes in [ 11904, 40908).
// encode2: 2176 entries for runes in [ 7870, 10046).
// encode3: 939 entries for runes in [ 167, 1106).
// encode4: 446 entries for runes in [ 65072, 65518).
// encode5: 432 entries for runes in [194597, 195029).
// encode6: 263 entries for runes in [ 63751, 64014).
// encode7: 1 entries for runes in [175615, 175616).
const big5Encode0Low, big5Encode0High = 131105, 173738
const encode0Low, encode0High = 131105, 173738
var big5Encode0 = [...]uint16{
var encode0 = [...]uint16{
131105 - 131105: 0x9C71,
131134 - 131105: 0x9375,
131142 - 131105: 0x9376,
@ -20315,9 +20315,9 @@ var big5Encode0 = [...]uint16{
173737 - 131105: 0x9E75,
}
const big5Encode1Low, big5Encode1High = 11904, 40908
const encode1Low, encode1High = 11904, 40908
var big5Encode1 = [...]uint16{
var encode1 = [...]uint16{
11904 - 11904: 0xC8D6,
11908 - 11904: 0xC8D7,
11910 - 11904: 0xC8D8,
@ -36548,9 +36548,9 @@ var big5Encode1 = [...]uint16{
40907 - 11904: 0x87DF,
}
const big5Encode2Low, big5Encode2High = 7870, 10046
const encode2Low, encode2High = 7870, 10046
var big5Encode2 = [...]uint16{
var encode2 = [...]uint16{
7870 - 7870: 0x8863,
7871 - 7870: 0x88A4,
7872 - 7870: 0x8865,
@ -36768,9 +36768,9 @@ var big5Encode2 = [...]uint16{
10045 - 7870: 0xC6E6,
}
const big5Encode3Low, big5Encode3High = 167, 1106
const encode3Low, encode3High = 167, 1106
var big5Encode3 = [...]uint16{
var encode3 = [...]uint16{
167 - 167: 0xA1B1,
168 - 167: 0xC6D8,
175 - 167: 0xA1C2,
@ -36953,9 +36953,9 @@ var big5Encode3 = [...]uint16{
1105 - 167: 0xC85B,
}
const big5Encode4Low, big5Encode4High = 65072, 65518
const encode4Low, encode4High = 65072, 65518
var big5Encode4 = [...]uint16{
var encode4 = [...]uint16{
65072 - 65072: 0xA14A,
65073 - 65072: 0xA157,
65075 - 65072: 0xA159,
@ -37109,9 +37109,9 @@ var big5Encode4 = [...]uint16{
65517 - 65072: 0xF9FE,
}
const big5Encode5Low, big5Encode5High = 194597, 195029
const encode5Low, encode5High = 194597, 195029
var big5Encode5 = [...]uint16{
var encode5 = [...]uint16{
194597 - 194597: 0x9874,
194619 - 194597: 0x9AC8,
194624 - 194597: 0xA047,
@ -37125,16 +37125,16 @@ var big5Encode5 = [...]uint16{
195028 - 194597: 0x8FF0,
}
const big5Encode6Low, big5Encode6High = 63751, 64014
const encode6Low, encode6High = 63751, 64014
var big5Encode6 = [...]uint16{
var encode6 = [...]uint16{
63751 - 63751: 0x8BF8,
64012 - 63751: 0xC94A,
64013 - 63751: 0xDDFC,
}
const big5Encode7Low, big5Encode7High = 175615, 175616
const encode7Low, encode7High = 175615, 175616
var big5Encode7 = [...]uint16{
var encode7 = [...]uint16{
175615 - 175615: 0x87DC,
}