This also doubles as a UTF-8 "encoder" and will likely be used as
the decoder in the respective encoding packages.

Change-Id: I9ce0d5fc4c8b458fc1764ed914eab43bfe297081
Reviewed-on: https://go-review.googlesource.com/17707
Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
This commit is contained in:
Marcel van Lohuizen 2015-12-11 13:30:52 +01:00
Родитель dafab4adca
Коммит a749bae74b
2 изменённых файлов: 186 добавлений и 0 удалений

Просмотреть файл

@ -221,3 +221,58 @@ func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
}
return
}
// ReplaceIllFormed returns a Transformer that substitutes utf8.RuneError for
// every input byte that does not belong to a well-formed UTF-8 code sequence.
// Well-formed input is passed through unchanged.
func ReplaceIllFormed() Transformer {
	return Transformer{new(replaceIllFormed)}
}
// replaceIllFormed is the transformer handed out by ReplaceIllFormed. It
// embeds transform.NopResetter and declares no fields of its own.
type replaceIllFormed struct{ transform.NopResetter }
// Transform copies src to dst, writing the three bytes of runeErrorString in
// place of each source byte that is not part of a well-formed UTF-8 sequence.
//
// It returns the number of bytes written to dst and consumed from src.
// The error is transform.ErrShortDst when dst has no room for the next
// output, and transform.ErrShortSrc when atEOF is false and the remaining
// bytes of src do not yet form a full rune. In both cases nDst and nSrc
// describe the progress made before the offending rune.
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) {
		rest := src[nSrc:]
		r, width := utf8.DecodeRune(rest)

		switch {
		case r < utf8.RuneSelf:
			// ASCII: a single byte is copied as is.
			if nDst == len(dst) {
				return nDst, nSrc, transform.ErrShortDst
			}
			dst[nDst] = byte(r)
			nDst++
			nSrc++

		case r != utf8.RuneError || width != 1:
			// Well-formed multi-byte rune (including an explicitly encoded
			// utf8.RuneError, which has width 3). A short copy means dst is
			// full; the bytes written past nDst are not reported.
			if copy(dst[nDst:], rest[:width]) != width {
				return nDst, nSrc, transform.ErrShortDst
			}
			nDst += width
			nSrc += width

		case !atEOF && !utf8.FullRune(rest):
			// The trailing bytes may still become a valid rune once more
			// input arrives, so ask for more source data.
			return nDst, nSrc, transform.ErrShortSrc

		default:
			// Ill-formed byte: emit the replacement and skip one byte.
			if len(dst)-nDst < 3 {
				return nDst, nSrc, transform.ErrShortDst
			}
			dst[nDst+0] = runeErrorString[0]
			dst[nDst+1] = runeErrorString[1]
			dst[nDst+2] = runeErrorString[2]
			nDst += 3
			nSrc++
		}
	}
	return nDst, nSrc, err
}

Просмотреть файл

@ -392,6 +392,117 @@ func TestRemove(t *testing.T) {
}
}
// TestReplaceIllFormed runs the ReplaceIllFormed transformer through a table
// of transformTest cases: empty input, plain ASCII, ill-formed bytes, and
// short destination and short source conditions.
//
// The table keys are kept contiguous. In an indexed slice literal a skipped
// key range produces zero-valued entries, which would still be handed to
// check.
func TestReplaceIllFormed(t *testing.T) {
	replace := ReplaceIllFormed()
	for i, tt := range []transformTest{
		0: {
			szDst:   large,
			atEOF:   true,
			in:      "",
			out:     "",
			outFull: "",
			t:       replace,
		},
		1: {
			szDst:   1,
			atEOF:   true,
			in:      "aa",
			out:     "a",
			outFull: "aa",
			err:     transform.ErrShortDst,
			t:       replace,
		},
		2: {
			szDst:   1,
			atEOF:   true,
			in:      "a\x80",
			out:     "a",
			outFull: "a\ufffd",
			err:     transform.ErrShortDst,
			t:       replace,
		},
		3: {
			szDst:   1,
			atEOF:   true,
			in:      "a\xc0",
			out:     "a",
			outFull: "a\ufffd",
			err:     transform.ErrShortDst,
			t:       replace,
		},
		4: {
			szDst:   large,
			atEOF:   true,
			in:      "\x80",
			out:     "\ufffd",
			outFull: "\ufffd",
			t:       replace,
		},
		5: {
			szDst:   large,
			atEOF:   false,
			in:      "\x80",
			out:     "\ufffd",
			outFull: "\ufffd",
			t:       replace,
		},
		6: {
			szDst:   large,
			atEOF:   true,
			in:      "\xc2",
			out:     "\ufffd",
			outFull: "\ufffd",
			t:       replace,
		},
		7: {
			szDst:   large,
			atEOF:   false,
			in:      "\xc2",
			out:     "",
			outFull: "\ufffd",
			err:     transform.ErrShortSrc,
			t:       replace,
		},
		8: {
			szDst:   large,
			atEOF:   true,
			in:      "Hello world!",
			out:     "Hello world!",
			outFull: "Hello world!",
			t:       replace,
		},
		9: {
			szDst:   large,
			atEOF:   true,
			in:      "Hello\x80 w\x80orl\xc2d!\xc2",
			out:     "Hello\ufffd w\ufffdorl\ufffdd!\ufffd",
			outFull: "Hello\ufffd w\ufffdorl\ufffdd!\ufffd",
			t:       replace,
		},
		10: {
			szDst:   large,
			atEOF:   false,
			in:      "Hello\x80 w\x80orl\xc2d!\xc2",
			out:     "Hello\ufffd w\ufffdorl\ufffdd!",
			outFull: "Hello\ufffd w\ufffdorl\ufffdd!\ufffd",
			err:     transform.ErrShortSrc,
			t:       replace,
		},
		11: {
			szDst:   10,
			atEOF:   false,
			in:      "\x80Hello\x80",
			out:     "\ufffdHello",
			outFull: "\ufffdHello\ufffd",
			err:     transform.ErrShortDst,
			t:       replace,
		},
	} {
		tt.check(t, i)
	}
}
func TestMapAlloc(t *testing.T) {
if n := testing.AllocsPerRun(3, func() {
Map(idem).Transform(nil, nil, false)
@ -410,6 +521,14 @@ func TestRemoveAlloc(t *testing.T) {
}
}
// TestReplaceIllFormedAlloc verifies that constructing the ReplaceIllFormed
// transformer and calling Transform on it performs no heap allocations.
func TestReplaceIllFormedAlloc(t *testing.T) {
	allocs := testing.AllocsPerRun(3, func() {
		ReplaceIllFormed().Transform(nil, nil, false)
	})
	if allocs > 0 {
		t.Errorf("got %f; want 0", allocs)
	}
}
func BenchmarkRemove(b *testing.B) {
dst := make([]byte, len(input))
src := []byte(input)
@ -446,6 +565,18 @@ func BenchmarkMapNone(b *testing.B) {
}
}
// BenchmarkReplaceIllFormed measures a single Transform call over the shared
// benchmark input, using a destination buffer twice the size of the source.
func BenchmarkReplaceIllFormed(b *testing.B) {
	src := []byte(input)
	dst := make([]byte, 2*len(input))
	tr := ReplaceIllFormed()

	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		tr.Transform(dst, src, true)
	}
}
// input is the text shared by the benchmarks: one sentence mixing ASCII and
// non-ASCII letters, repeated 100 times.
var input = strings.Repeat("Thé qüick brøwn føx jumps øver the lazy døg. ", 100)