зеркало из https://github.com/golang/text.git
go.text/encoding: shrink the japanese and korean encoding data tables.
The encoding.test binary size generated by "go test -c" drops by 132320 bytes.

Some benchmarks get better, others get worse (but that might just be noise, as there are no code or data changes for Big5 or GBK).

benchmark                    old MB/s    new MB/s    speedup
BenchmarkBig5Encoder           170.12      171.82      1.01x
BenchmarkEUCJPEncoder          160.94      156.07      0.97x
BenchmarkEUCKREncoder          166.75      171.66      1.03x
BenchmarkGBKEncoder            180.07      173.59      0.96x
BenchmarkShiftJISEncoder       137.95      143.70      1.04x

R=r
CC=golang-dev
https://golang.org/cl/13321047
This commit is contained in:
Родитель
d94036e178
Коммит
a60de809e6
|
@ -12,7 +12,7 @@ import (
|
|||
"code.google.com/p/go.text/transform"
|
||||
)
|
||||
|
||||
// EUCJP is the EUC-JP (Extended Unix Code Japanese) encoding.
|
||||
// EUCJP is the EUC-JP encoding.
|
||||
var EUCJP encoding.Encoding = eucJP{}
|
||||
|
||||
type eucJP struct{}
|
||||
|
@ -115,7 +115,6 @@ type eucJPEncoder struct{}
|
|||
|
||||
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
|
@ -132,63 +131,78 @@ loop:
|
|||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case r < utf8.RuneSelf:
|
||||
// No-op.
|
||||
|
||||
case 0xff61 <= r && r <= 0xff9f:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
goto write2
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
}
|
||||
dst[nDst+0] = 0x8e
|
||||
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
|
||||
nDst += 2
|
||||
continue loop
|
||||
|
||||
case 0xffff < r:
|
||||
r = encoding.ASCIISub
|
||||
|
||||
default:
|
||||
e := jisEncode[uint16(r)]
|
||||
if e == 0 {
|
||||
r = encoding.ASCIISub
|
||||
break
|
||||
}
|
||||
switch e >> tableShift {
|
||||
case jis0208:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
dst[nDst+0] = 0xa1 + uint8(e>>codeShift)&codeMask
|
||||
dst[nDst+1] = 0xa1 + uint8(e)&codeMask
|
||||
nDst += 2
|
||||
case jis0212:
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
dst[nDst+0] = 0x8f
|
||||
dst[nDst+1] = 0xa1 + uint8(e>>codeShift)&codeMask
|
||||
dst[nDst+2] = 0xa1 + uint8(e)&codeMask
|
||||
nDst += 3
|
||||
}
|
||||
continue loop
|
||||
}
|
||||
|
||||
// r is encoded as a single byte.
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2or3:
|
||||
if r>>tableShift == jis0208 {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = 0x8f
|
||||
nDst++
|
||||
}
|
||||
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
|
||||
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
|
||||
nDst += 2
|
||||
continue
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = 0x8e
|
||||
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ import (
|
|||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
|
@ -84,6 +85,31 @@ func main() {
|
|||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v.table == -1 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const (\n")
|
||||
fmt.Printf("\tjis0208 = 1\n")
|
||||
fmt.Printf("\tjis0212 = 2\n")
|
||||
|
@ -92,18 +118,43 @@ func main() {
|
|||
fmt.Printf("\ttableShift = 14\n")
|
||||
fmt.Printf(")\n\n")
|
||||
|
||||
fmt.Printf("// jisEncode is the encoding table from Unicode to JIS code.\n")
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("//\n")
|
||||
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
|
||||
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
|
||||
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
|
||||
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
|
||||
fmt.Printf("var jisEncode = [65536]uint16{\n")
|
||||
for i, v := range reverse {
|
||||
if v.table == -1 {
|
||||
continue
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x.table == -1 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
|
||||
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
|
||||
}
|
||||
fmt.Printf("\t0x%04X: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
|
||||
i, tables[v.table].name, v.jisCode/94, v.jisCode%94)
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
|
|
|
@ -12,8 +12,8 @@ import (
|
|||
"code.google.com/p/go.text/transform"
|
||||
)
|
||||
|
||||
// ShiftJIS is the Shift JIS (Japanese Industrial Standards) encoding, also
|
||||
// known as Code Page 932 and Windows-31J.
|
||||
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
|
||||
// Windows-31J.
|
||||
var ShiftJIS encoding.Encoding = shiftJIS{}
|
||||
|
||||
type shiftJIS struct{}
|
||||
|
@ -126,55 +126,68 @@ loop:
|
|||
break loop
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case r < utf8.RuneSelf:
|
||||
// r is an ASCII rune.
|
||||
|
||||
case 0xff61 <= r && r <= 0xff9f:
|
||||
r -= 0xff61 - 0xa1
|
||||
|
||||
case 0xffff < r:
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
r -= 0xff61 - 0xa1
|
||||
goto write1
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
r = encoding.ASCIISub
|
||||
|
||||
default:
|
||||
e := jisEncode[uint16(r)]
|
||||
if e == 0 {
|
||||
r = encoding.ASCIISub
|
||||
break
|
||||
}
|
||||
if e>>tableShift != jis0208 {
|
||||
r = encoding.ASCIISub
|
||||
break
|
||||
}
|
||||
j1 := uint8(e>>codeShift) & codeMask
|
||||
j2 := uint8(e) & codeMask
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
if j1 <= 61 {
|
||||
dst[nDst+0] = 129 + j1/2
|
||||
} else {
|
||||
dst[nDst+0] = 193 + j1/2
|
||||
}
|
||||
if j1&1 == 0 {
|
||||
dst[nDst+1] = j2 + j2/63 + 64
|
||||
} else {
|
||||
dst[nDst+1] = j2 + 159
|
||||
}
|
||||
nDst += 2
|
||||
continue loop
|
||||
}
|
||||
|
||||
// r is encoded as a single byte.
|
||||
write1:
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2:
|
||||
j1 := uint8(r>>codeShift) & codeMask
|
||||
j2 := uint8(r) & codeMask
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
if j1 <= 61 {
|
||||
dst[nDst+0] = 129 + j1/2
|
||||
} else {
|
||||
dst[nDst+0] = 193 + j1/2
|
||||
}
|
||||
if j1&1 == 0 {
|
||||
dst[nDst+1] = j2 + j2/63 + 64
|
||||
} else {
|
||||
dst[nDst+1] = j2 + 159
|
||||
}
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -66,8 +66,8 @@ loop:
|
|||
err = errInvalidEUCKR
|
||||
break loop
|
||||
}
|
||||
if int(r) < len(eucKRDecode) {
|
||||
r = rune(eucKRDecode[r])
|
||||
if int(r) < len(decode) {
|
||||
r = rune(decode[r])
|
||||
if r == 0 {
|
||||
r = encoding.ASCIISub
|
||||
}
|
||||
|
@ -97,7 +97,6 @@ type eucKREncoder struct{}
|
|||
|
||||
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
|
@ -114,41 +113,60 @@ loop:
|
|||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case r < utf8.RuneSelf:
|
||||
// No-op.
|
||||
|
||||
case 0xffff < r:
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode6Low <= r && r < encode6High:
|
||||
if r = rune(encode6[r-encode6Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
r = encoding.ASCIISub
|
||||
|
||||
default:
|
||||
e := eucKREncode[uint16(r)]
|
||||
if e == 0 {
|
||||
r = encoding.ASCIISub
|
||||
break
|
||||
}
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
dst[nDst+0] = uint8(e >> 8)
|
||||
dst[nDst+1] = uint8(e)
|
||||
nDst += 2
|
||||
continue loop
|
||||
}
|
||||
|
||||
// r is encoded as a single byte.
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r >> 8)
|
||||
dst[nDst+1] = uint8(r)
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@ import (
|
|||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
|
@ -70,9 +71,9 @@ func main() {
|
|||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// eucKRDecode is the decoding table from EUC-KR code to Unicode.\n")
|
||||
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
|
||||
fmt.Printf("var eucKRDecode = [...]uint16{\n")
|
||||
fmt.Printf("var decode = [...]uint16{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, v)
|
||||
|
@ -80,12 +81,62 @@ func main() {
|
|||
}
|
||||
fmt.Printf("}\n\n")
|
||||
|
||||
fmt.Printf("// eucKREncode is the encoding table from Unicode to EUC-KR code.\n")
|
||||
fmt.Printf("var eucKREncode = [65536]uint16{\n")
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, v)
|
||||
if v == 0 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -64,8 +64,8 @@ loop:
|
|||
break loop
|
||||
}
|
||||
r, size = encoding.ASCIISub, 2
|
||||
if i := int(c0-0x81)*190 + int(c1); i < len(gbkDecode) {
|
||||
r = rune(gbkDecode[i])
|
||||
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
|
||||
r = rune(decode[i])
|
||||
if r == 0 {
|
||||
r = encoding.ASCIISub
|
||||
}
|
||||
|
@ -113,11 +113,11 @@ func (gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err er
|
|||
}
|
||||
|
||||
switch {
|
||||
case gbkEncode0Low <= r && r < gbkEncode0High:
|
||||
if r = rune(gbkEncode0[r-gbkEncode0Low]); r != 0 {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case gbkEncode1Low <= r && r < gbkEncode1High:
|
||||
case encode1Low <= r && r < encode1High:
|
||||
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||
// says to treat "gbk" as Code Page 936.
|
||||
|
@ -125,19 +125,19 @@ func (gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err er
|
|||
r = 0x80
|
||||
goto write1
|
||||
}
|
||||
if r = rune(gbkEncode1[r-gbkEncode1Low]); r != 0 {
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case gbkEncode2Low <= r && r < gbkEncode2High:
|
||||
if r = rune(gbkEncode2[r-gbkEncode2Low]); r != 0 {
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case gbkEncode3Low <= r && r < gbkEncode3High:
|
||||
if r = rune(gbkEncode3[r-gbkEncode3Low]); r != 0 {
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case gbkEncode4Low <= r && r < gbkEncode4High:
|
||||
if r = rune(gbkEncode4[r-gbkEncode4Low]); r != 0 {
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,9 +58,9 @@ func main() {
|
|||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// gbkDecode is the decoding table from GBK code to Unicode.\n")
|
||||
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
|
||||
fmt.Printf("var gbkDecode = [...]uint16{\n")
|
||||
fmt.Printf("var decode = [...]uint16{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, v)
|
||||
|
@ -69,7 +69,7 @@ func main() {
|
|||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate gbkEncode table.
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
|
@ -93,16 +93,16 @@ func main() {
|
|||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("// gbkEncodeX are the encoding tables from Unicode to GBK code,\n")
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// gbkEncode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const gbkEncode%dLow, gbkEncode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var gbkEncode%d = [...]uint16{\n", i)
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.
|
||||
package simplifiedchinese
|
||||
|
||||
// gbkDecode is the decoding table from GBK code to Unicode.
|
||||
// decode is the decoding table from GBK code to Unicode.
|
||||
// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt
|
||||
var gbkDecode = [...]uint16{
|
||||
var decode = [...]uint16{
|
||||
0: 0x4E02,
|
||||
1: 0x4E04,
|
||||
2: 0x4E05,
|
||||
|
@ -21881,17 +21881,17 @@ var gbkDecode = [...]uint16{
|
|||
23844: 0x4DAE,
|
||||
}
|
||||
|
||||
// gbkEncodeX are the encoding tables from Unicode to GBK code,
|
||||
// encodeX are the encoding tables from Unicode to GBK code,
|
||||
// sorted by decreasing length.
|
||||
// gbkEncode0: 28965 entries for runes in [11905, 40870).
|
||||
// gbkEncode1: 1587 entries for runes in [ 8208, 9795).
|
||||
// gbkEncode2: 942 entries for runes in [ 164, 1106).
|
||||
// gbkEncode3: 438 entries for runes in [65072, 65510).
|
||||
// gbkEncode4: 254 entries for runes in [63788, 64042).
|
||||
// encode0: 28965 entries for runes in [11905, 40870).
|
||||
// encode1: 1587 entries for runes in [ 8208, 9795).
|
||||
// encode2: 942 entries for runes in [ 164, 1106).
|
||||
// encode3: 438 entries for runes in [65072, 65510).
|
||||
// encode4: 254 entries for runes in [63788, 64042).
|
||||
|
||||
const gbkEncode0Low, gbkEncode0High = 11905, 40870
|
||||
const encode0Low, encode0High = 11905, 40870
|
||||
|
||||
var gbkEncode0 = [...]uint16{
|
||||
var encode0 = [...]uint16{
|
||||
11905 - 11905: 0xFE50,
|
||||
11908 - 11905: 0xFE54,
|
||||
11912 - 11905: 0xFE57,
|
||||
|
@ -43145,9 +43145,9 @@ var gbkEncode0 = [...]uint16{
|
|||
40869 - 11905: 0xFD9B,
|
||||
}
|
||||
|
||||
const gbkEncode1Low, gbkEncode1High = 8208, 9795
|
||||
const encode1Low, encode1High = 8208, 9795
|
||||
|
||||
var gbkEncode1 = [...]uint16{
|
||||
var encode1 = [...]uint16{
|
||||
8208 - 8208: 0xA95C,
|
||||
8211 - 8208: 0xA843,
|
||||
8212 - 8208: 0xA1AA,
|
||||
|
@ -43440,9 +43440,9 @@ var gbkEncode1 = [...]uint16{
|
|||
9794 - 8208: 0xA1E1,
|
||||
}
|
||||
|
||||
const gbkEncode2Low, gbkEncode2High = 164, 1106
|
||||
const encode2Low, encode2High = 164, 1106
|
||||
|
||||
var gbkEncode2 = [...]uint16{
|
||||
var encode2 = [...]uint16{
|
||||
164 - 164: 0xA1E8,
|
||||
167 - 164: 0xA1EC,
|
||||
168 - 164: 0xA1A7,
|
||||
|
@ -43603,9 +43603,9 @@ var gbkEncode2 = [...]uint16{
|
|||
1105 - 164: 0xA7D7,
|
||||
}
|
||||
|
||||
const gbkEncode3Low, gbkEncode3High = 65072, 65510
|
||||
const encode3Low, encode3High = 65072, 65510
|
||||
|
||||
var gbkEncode3 = [...]uint16{
|
||||
var encode3 = [...]uint16{
|
||||
65072 - 65072: 0xA955,
|
||||
65073 - 65072: 0xA6F2,
|
||||
65075 - 65072: 0xA6F4,
|
||||
|
@ -43760,9 +43760,9 @@ var gbkEncode3 = [...]uint16{
|
|||
65509 - 65072: 0xA3A4,
|
||||
}
|
||||
|
||||
const gbkEncode4Low, gbkEncode4High = 63788, 64042
|
||||
const encode4Low, encode4High = 63788, 64042
|
||||
|
||||
var gbkEncode4 = [...]uint16{
|
||||
var encode4 = [...]uint16{
|
||||
63788 - 63788: 0xFD9C,
|
||||
63865 - 63788: 0xFD9D,
|
||||
63893 - 63788: 0xFD9E,
|
||||
|
|
|
@ -57,7 +57,7 @@ loop:
|
|||
break loop
|
||||
}
|
||||
r, size = encoding.ASCIISub, 2
|
||||
if i := int(c0-0x81)*157 + int(c1); i < len(big5Decode) {
|
||||
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
|
||||
if 1133 <= i && i < 1167 {
|
||||
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
|
||||
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
|
||||
|
@ -76,7 +76,7 @@ loop:
|
|||
goto writeStr
|
||||
}
|
||||
}
|
||||
r = rune(big5Decode[i])
|
||||
r = rune(decode[i])
|
||||
if r == 0 {
|
||||
r = encoding.ASCIISub
|
||||
}
|
||||
|
@ -135,36 +135,36 @@ func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err e
|
|||
|
||||
if r >= utf8.RuneSelf {
|
||||
switch {
|
||||
case big5Encode0Low <= r && r < big5Encode0High:
|
||||
if r = rune(big5Encode0[r-big5Encode0Low]); r != 0 {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case big5Encode1Low <= r && r < big5Encode1High:
|
||||
if r = rune(big5Encode1[r-big5Encode1Low]); r != 0 {
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case big5Encode2Low <= r && r < big5Encode2High:
|
||||
if r = rune(big5Encode2[r-big5Encode2Low]); r != 0 {
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case big5Encode3Low <= r && r < big5Encode3High:
|
||||
if r = rune(big5Encode3[r-big5Encode3Low]); r != 0 {
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case big5Encode4Low <= r && r < big5Encode4High:
|
||||
if r = rune(big5Encode4[r-big5Encode4Low]); r != 0 {
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case big5Encode5Low <= r && r < big5Encode5High:
|
||||
if r = rune(big5Encode5[r-big5Encode5Low]); r != 0 {
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case big5Encode6Low <= r && r < big5Encode6High:
|
||||
if r = rune(big5Encode6[r-big5Encode6Low]); r != 0 {
|
||||
case encode6Low <= r && r < encode6High:
|
||||
if r = rune(encode6[r-encode6Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case big5Encode7Low <= r && r < big5Encode7High:
|
||||
if r = rune(big5Encode7[r-big5Encode7Low]); r != 0 {
|
||||
case encode7Low <= r && r < encode7High:
|
||||
if r = rune(encode7[r-encode7Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
|
|
|
@ -68,9 +68,9 @@ func main() {
|
|||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// big5Decode is the decoding table from Big5 code to Unicode.\n")
|
||||
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
|
||||
fmt.Printf("var big5Decode = [...]uint32{\n")
|
||||
fmt.Printf("var decode = [...]uint32{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%08X,\n", i, v)
|
||||
|
@ -79,7 +79,7 @@ func main() {
|
|||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate big5Encode table.
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
|
@ -103,16 +103,16 @@ func main() {
|
|||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("// big5EncodeX are the encoding tables from Unicode to Big5 code,\n")
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// big5Encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const big5Encode%dLow, big5Encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var big5Encode%d = [...]uint16{\n", i)
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
// Package traditionalchinese provides Traditional Chinese encodings such as Big5.
|
||||
package traditionalchinese
|
||||
|
||||
// big5Decode is the decoding table from Big5 code to Unicode.
|
||||
// decode is the decoding table from Big5 code to Unicode.
|
||||
// It is defined at http://encoding.spec.whatwg.org/index-big5.txt
|
||||
var big5Decode = [...]uint32{
|
||||
var decode = [...]uint32{
|
||||
942: 0x000043F0,
|
||||
943: 0x00004C32,
|
||||
944: 0x00004603,
|
||||
|
@ -18598,20 +18598,20 @@ var big5Decode = [...]uint32{
|
|||
19781: 0x000079D4,
|
||||
}
|
||||
|
||||
// big5EncodeX are the encoding tables from Unicode to Big5 code,
|
||||
// encodeX are the encoding tables from Unicode to Big5 code,
|
||||
// sorted by decreasing length.
|
||||
// big5Encode0: 42633 entries for runes in [131105, 173738).
|
||||
// big5Encode1: 29004 entries for runes in [ 11904, 40908).
|
||||
// big5Encode2: 2176 entries for runes in [ 7870, 10046).
|
||||
// big5Encode3: 939 entries for runes in [ 167, 1106).
|
||||
// big5Encode4: 446 entries for runes in [ 65072, 65518).
|
||||
// big5Encode5: 432 entries for runes in [194597, 195029).
|
||||
// big5Encode6: 263 entries for runes in [ 63751, 64014).
|
||||
// big5Encode7: 1 entries for runes in [175615, 175616).
|
||||
// encode0: 42633 entries for runes in [131105, 173738).
|
||||
// encode1: 29004 entries for runes in [ 11904, 40908).
|
||||
// encode2: 2176 entries for runes in [ 7870, 10046).
|
||||
// encode3: 939 entries for runes in [ 167, 1106).
|
||||
// encode4: 446 entries for runes in [ 65072, 65518).
|
||||
// encode5: 432 entries for runes in [194597, 195029).
|
||||
// encode6: 263 entries for runes in [ 63751, 64014).
|
||||
// encode7: 1 entries for runes in [175615, 175616).
|
||||
|
||||
const big5Encode0Low, big5Encode0High = 131105, 173738
|
||||
const encode0Low, encode0High = 131105, 173738
|
||||
|
||||
var big5Encode0 = [...]uint16{
|
||||
var encode0 = [...]uint16{
|
||||
131105 - 131105: 0x9C71,
|
||||
131134 - 131105: 0x9375,
|
||||
131142 - 131105: 0x9376,
|
||||
|
@ -20315,9 +20315,9 @@ var big5Encode0 = [...]uint16{
|
|||
173737 - 131105: 0x9E75,
|
||||
}
|
||||
|
||||
const big5Encode1Low, big5Encode1High = 11904, 40908
|
||||
const encode1Low, encode1High = 11904, 40908
|
||||
|
||||
var big5Encode1 = [...]uint16{
|
||||
var encode1 = [...]uint16{
|
||||
11904 - 11904: 0xC8D6,
|
||||
11908 - 11904: 0xC8D7,
|
||||
11910 - 11904: 0xC8D8,
|
||||
|
@ -36548,9 +36548,9 @@ var big5Encode1 = [...]uint16{
|
|||
40907 - 11904: 0x87DF,
|
||||
}
|
||||
|
||||
const big5Encode2Low, big5Encode2High = 7870, 10046
|
||||
const encode2Low, encode2High = 7870, 10046
|
||||
|
||||
var big5Encode2 = [...]uint16{
|
||||
var encode2 = [...]uint16{
|
||||
7870 - 7870: 0x8863,
|
||||
7871 - 7870: 0x88A4,
|
||||
7872 - 7870: 0x8865,
|
||||
|
@ -36768,9 +36768,9 @@ var big5Encode2 = [...]uint16{
|
|||
10045 - 7870: 0xC6E6,
|
||||
}
|
||||
|
||||
const big5Encode3Low, big5Encode3High = 167, 1106
|
||||
const encode3Low, encode3High = 167, 1106
|
||||
|
||||
var big5Encode3 = [...]uint16{
|
||||
var encode3 = [...]uint16{
|
||||
167 - 167: 0xA1B1,
|
||||
168 - 167: 0xC6D8,
|
||||
175 - 167: 0xA1C2,
|
||||
|
@ -36953,9 +36953,9 @@ var big5Encode3 = [...]uint16{
|
|||
1105 - 167: 0xC85B,
|
||||
}
|
||||
|
||||
const big5Encode4Low, big5Encode4High = 65072, 65518
|
||||
const encode4Low, encode4High = 65072, 65518
|
||||
|
||||
var big5Encode4 = [...]uint16{
|
||||
var encode4 = [...]uint16{
|
||||
65072 - 65072: 0xA14A,
|
||||
65073 - 65072: 0xA157,
|
||||
65075 - 65072: 0xA159,
|
||||
|
@ -37109,9 +37109,9 @@ var big5Encode4 = [...]uint16{
|
|||
65517 - 65072: 0xF9FE,
|
||||
}
|
||||
|
||||
const big5Encode5Low, big5Encode5High = 194597, 195029
|
||||
const encode5Low, encode5High = 194597, 195029
|
||||
|
||||
var big5Encode5 = [...]uint16{
|
||||
var encode5 = [...]uint16{
|
||||
194597 - 194597: 0x9874,
|
||||
194619 - 194597: 0x9AC8,
|
||||
194624 - 194597: 0xA047,
|
||||
|
@ -37125,16 +37125,16 @@ var big5Encode5 = [...]uint16{
|
|||
195028 - 194597: 0x8FF0,
|
||||
}
|
||||
|
||||
const big5Encode6Low, big5Encode6High = 63751, 64014
|
||||
const encode6Low, encode6High = 63751, 64014
|
||||
|
||||
var big5Encode6 = [...]uint16{
|
||||
var encode6 = [...]uint16{
|
||||
63751 - 63751: 0x8BF8,
|
||||
64012 - 63751: 0xC94A,
|
||||
64013 - 63751: 0xDDFC,
|
||||
}
|
||||
|
||||
const big5Encode7Low, big5Encode7High = 175615, 175616
|
||||
const encode7Low, encode7High = 175615, 175616
|
||||
|
||||
var big5Encode7 = [...]uint16{
|
||||
var encode7 = [...]uint16{
|
||||
175615 - 175615: 0x87DC,
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче