зеркало из https://github.com/golang/text.git
runes: added ReplaceIllFormed
This also doubles as a UTF-8 "encoder" and will likely be used as the decoder in the respective encoding packages.

Change-Id: I9ce0d5fc4c8b458fc1764ed914eab43bfe297081
Reviewed-on: https://go-review.googlesource.com/17707
Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
This commit is contained in:
Родитель
dafab4adca
Коммит
a749bae74b
|
@ -221,3 +221,58 @@ func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
|
|||
}
|
||||
return
|
||||
}
|
||||
|
||||
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
|
||||
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
|
||||
func ReplaceIllFormed() Transformer {
|
||||
return Transformer{&replaceIllFormed{}}
|
||||
}
|
||||
|
||||
// replaceIllFormed is the transformer type wrapped by ReplaceIllFormed. It
// embeds transform.NopResetter and declares no fields of its own.
type replaceIllFormed struct {
	transform.NopResetter
}
|
||||
|
||||
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for nSrc < len(src) {
|
||||
r, size := utf8.DecodeRune(src[nSrc:])
|
||||
|
||||
// Look for an ASCII rune.
|
||||
if r < utf8.RuneSelf {
|
||||
if nDst == len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = byte(r)
|
||||
nDst++
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for a valid non-ASCII rune.
|
||||
if r != utf8.RuneError || size != 1 {
|
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += size
|
||||
nSrc += size
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for short source data.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
|
||||
// We have an invalid rune.
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = runeErrorString[0]
|
||||
dst[nDst+1] = runeErrorString[1]
|
||||
dst[nDst+2] = runeErrorString[2]
|
||||
nDst += 3
|
||||
nSrc++
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
|
|
@ -392,6 +392,117 @@ func TestRemove(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestReplaceIllFormed(t *testing.T) {
|
||||
replace := ReplaceIllFormed()
|
||||
|
||||
for i, tt := range []transformTest{
|
||||
0: {
|
||||
szDst: large,
|
||||
atEOF: true,
|
||||
in: "",
|
||||
out: "",
|
||||
outFull: "",
|
||||
t: replace,
|
||||
},
|
||||
1: {
|
||||
szDst: 1,
|
||||
atEOF: true,
|
||||
in: "aa",
|
||||
out: "a",
|
||||
outFull: "aa",
|
||||
err: transform.ErrShortDst,
|
||||
t: replace,
|
||||
},
|
||||
2: {
|
||||
szDst: 1,
|
||||
atEOF: true,
|
||||
in: "a\x80",
|
||||
out: "a",
|
||||
outFull: "a\ufffd",
|
||||
err: transform.ErrShortDst,
|
||||
t: replace,
|
||||
},
|
||||
3: {
|
||||
szDst: 1,
|
||||
atEOF: true,
|
||||
in: "a\xc0",
|
||||
out: "a",
|
||||
outFull: "a\ufffd",
|
||||
err: transform.ErrShortDst,
|
||||
t: replace,
|
||||
},
|
||||
4: {
|
||||
szDst: large,
|
||||
atEOF: true,
|
||||
in: "\x80",
|
||||
out: "\ufffd",
|
||||
outFull: "\ufffd",
|
||||
t: replace,
|
||||
},
|
||||
5: {
|
||||
szDst: large,
|
||||
atEOF: false,
|
||||
in: "\x80",
|
||||
out: "\ufffd",
|
||||
outFull: "\ufffd",
|
||||
t: replace,
|
||||
},
|
||||
6: {
|
||||
szDst: large,
|
||||
atEOF: true,
|
||||
in: "\xc2",
|
||||
out: "\ufffd",
|
||||
outFull: "\ufffd",
|
||||
t: replace,
|
||||
},
|
||||
7: {
|
||||
szDst: large,
|
||||
atEOF: false,
|
||||
in: "\xc2",
|
||||
out: "",
|
||||
outFull: "\ufffd",
|
||||
err: transform.ErrShortSrc,
|
||||
t: replace,
|
||||
},
|
||||
8: {
|
||||
szDst: large,
|
||||
atEOF: true,
|
||||
in: "Hello world!",
|
||||
out: "Hello world!",
|
||||
outFull: "Hello world!",
|
||||
t: replace,
|
||||
},
|
||||
9: {
|
||||
szDst: large,
|
||||
atEOF: true,
|
||||
in: "Hello\x80 w\x80orl\xc2d!\xc2",
|
||||
out: "Hello\ufffd w\ufffdorl\ufffdd!\ufffd",
|
||||
outFull: "Hello\ufffd w\ufffdorl\ufffdd!\ufffd",
|
||||
t: replace,
|
||||
},
|
||||
10: {
|
||||
szDst: large,
|
||||
atEOF: false,
|
||||
in: "Hello\x80 w\x80orl\xc2d!\xc2",
|
||||
out: "Hello\ufffd w\ufffdorl\ufffdd!",
|
||||
outFull: "Hello\ufffd w\ufffdorl\ufffdd!\ufffd",
|
||||
err: transform.ErrShortSrc,
|
||||
t: replace,
|
||||
},
|
||||
16: {
|
||||
szDst: 10,
|
||||
atEOF: false,
|
||||
in: "\x80Hello\x80",
|
||||
out: "\ufffdHello",
|
||||
outFull: "\ufffdHello\ufffd",
|
||||
err: transform.ErrShortDst,
|
||||
t: replace,
|
||||
},
|
||||
} {
|
||||
tt.check(t, i)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMapAlloc(t *testing.T) {
|
||||
if n := testing.AllocsPerRun(3, func() {
|
||||
Map(idem).Transform(nil, nil, false)
|
||||
|
@ -410,6 +521,14 @@ func TestRemoveAlloc(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestReplaceIllFormedAlloc(t *testing.T) {
|
||||
if n := testing.AllocsPerRun(3, func() {
|
||||
ReplaceIllFormed().Transform(nil, nil, false)
|
||||
}); n > 0 {
|
||||
t.Errorf("got %f; want 0", n)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkRemove(b *testing.B) {
|
||||
dst := make([]byte, len(input))
|
||||
src := []byte(input)
|
||||
|
@ -446,6 +565,18 @@ func BenchmarkMapNone(b *testing.B) {
|
|||
}
|
||||
}
|
||||
|
||||
func BenchmarkReplaceIllFormed(b *testing.B) {
|
||||
dst := make([]byte, 2*len(input))
|
||||
src := []byte(input)
|
||||
|
||||
t := ReplaceIllFormed()
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
t.Transform(dst, src, true)
|
||||
}
|
||||
}
|
||||
|
||||
// input is the text fed to the benchmarks: one sentence mixing ASCII and
// non-ASCII letters, repeated 100 times.
var input = strings.Repeat("Thé qüick brøwn føx jumps øver the lazy døg. ", 100)
|
||||
|
|
Загрузка…
Ссылка в новой задаче