зеркало из https://github.com/golang/text.git
go.text/transform: added RemoveFunc transform for removing individual
runes from the input. This corresponds to ICU's Remove transform. For example, to remove accents from characters one could use RemoveFunc as follows: nonspacingMark := func(r rune) bool { return unicode.Is(unicode.Mn, r) } transform.Chain(norm.NFD, transform.RemoveFunc(nonspacingMark), norm.NFC) (Once norm.Form implements Transformer; guess what will be my next CL.) R=r CC=golang-dev, nigeltao https://golang.org/cl/23220043
This commit is contained in:
Родитель
089e4d2d44
Коммит
191b11aac8
|
@ -0,0 +1,37 @@
|
||||||
|
// Copyright 2013 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package transform_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"unicode"
|
||||||
|
|
||||||
|
"code.google.com/p/go.text/transform"
|
||||||
|
"code.google.com/p/go.text/unicode/norm"
|
||||||
|
)
|
||||||
|
|
||||||
|
func ExampleRemoveFunc() {
|
||||||
|
input := []byte(`tschüß; до свидания`)
|
||||||
|
|
||||||
|
b := make([]byte, len(input))
|
||||||
|
|
||||||
|
t := transform.RemoveFunc(unicode.IsSpace)
|
||||||
|
n, _, _ := t.Transform(b, input, true)
|
||||||
|
fmt.Println(string(b[:n]))
|
||||||
|
|
||||||
|
t = transform.RemoveFunc(func(r rune) bool {
|
||||||
|
return !unicode.Is(unicode.Latin, r)
|
||||||
|
})
|
||||||
|
n, _, _ = t.Transform(b, input, true)
|
||||||
|
fmt.Println(string(b[:n]))
|
||||||
|
|
||||||
|
n, _, _ = t.Transform(b, norm.NFD.Bytes(input), true)
|
||||||
|
fmt.Println(string(b[:n]))
|
||||||
|
|
||||||
|
// Output:
|
||||||
|
// tschüß;досвидания
|
||||||
|
// tschüß
|
||||||
|
// tschuß
|
||||||
|
}
|
|
@ -3,13 +3,15 @@
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
// Package transform provides reader and writer wrappers that transform the
|
// Package transform provides reader and writer wrappers that transform the
|
||||||
// bytes passing through. Example transformations, provided by other packages,
|
// bytes passing through as well as various transformations. Example
|
||||||
// include text collation, normalization and conversion between character sets.
|
// transformations provided by other packages include normalization and
|
||||||
|
// conversion between character sets.
|
||||||
package transform
|
package transform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"io"
|
"io"
|
||||||
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -413,3 +415,54 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
|
||||||
}
|
}
|
||||||
return dstL.n, srcL.p, err
|
return dstL.n, srcL.p, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RemoveFunc returns a Transformer that removes from the input all runes r for
|
||||||
|
// which f(r) is true. Illegal bytes in the input are replaced by RuneError.
|
||||||
|
func RemoveFunc(f func(r rune) bool) Transformer {
|
||||||
|
return removeF(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
type removeF func(r rune) bool
|
||||||
|
|
||||||
|
// Transform implements the Transformer interface.
|
||||||
|
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||||
|
for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
|
||||||
|
|
||||||
|
if r = rune(src[0]); r < utf8.RuneSelf {
|
||||||
|
sz = 1
|
||||||
|
} else {
|
||||||
|
r, sz = utf8.DecodeRune(src)
|
||||||
|
|
||||||
|
if sz == 1 {
|
||||||
|
// Invalid rune.
|
||||||
|
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||||
|
err = ErrShortSrc
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// We replace illegal bytes with RuneError. Not doing so might
|
||||||
|
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||||
|
// The resulting byte sequence may subsequently contain runes
|
||||||
|
// for which t(r) is true that were passed unnoticed.
|
||||||
|
if !t(r) {
|
||||||
|
if nDst+3 > len(dst) {
|
||||||
|
err = ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
nDst += copy(dst[nDst:], "\uFFFD")
|
||||||
|
}
|
||||||
|
nSrc++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !t(r) {
|
||||||
|
if nDst+sz > len(dst) {
|
||||||
|
err = ErrShortDst
|
||||||
|
break
|
||||||
|
}
|
||||||
|
nDst += copy(dst[nDst:], src[:sz])
|
||||||
|
}
|
||||||
|
nSrc += sz
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
|
@ -12,6 +12,7 @@ import (
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
type lowerCaseASCII struct{}
|
type lowerCaseASCII struct{}
|
||||||
|
@ -768,3 +769,108 @@ func TestChain(t *testing.T) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRemoveFunc(t *testing.T) {
|
||||||
|
filter := RemoveFunc(func(r rune) bool {
|
||||||
|
return strings.IndexRune("ab\u0300\u1234,", r) != -1
|
||||||
|
})
|
||||||
|
tests := []testCase{
|
||||||
|
{
|
||||||
|
src: ",",
|
||||||
|
wantStr: "",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "c",
|
||||||
|
wantStr: "c",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "\u2345",
|
||||||
|
wantStr: "\u2345",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "tschüß",
|
||||||
|
wantStr: "tschüß",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: ",до,свидания,",
|
||||||
|
wantStr: "досвидания",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "a\xbd\xb2=\xbc ⌘",
|
||||||
|
wantStr: "\uFFFD\uFFFD=\uFFFD ⌘",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
// If we didn't replace illegal bytes with RuneError, the result
|
||||||
|
// would be \u0300 or the code would need to be more complex.
|
||||||
|
src: "\xcc\u0300\x80",
|
||||||
|
wantStr: "\uFFFD\uFFFD",
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "\xcc\u0300\x80",
|
||||||
|
dstSize: 3,
|
||||||
|
wantStr: "\uFFFD\uFFFD",
|
||||||
|
wantIter: 2,
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "\u2345",
|
||||||
|
dstSize: 2,
|
||||||
|
wantStr: "",
|
||||||
|
wantErr: ErrShortDst,
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "\xcc",
|
||||||
|
dstSize: 2,
|
||||||
|
wantStr: "",
|
||||||
|
wantErr: ErrShortDst,
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
src: "\u0300",
|
||||||
|
dstSize: 2,
|
||||||
|
srcSize: 1,
|
||||||
|
wantStr: "",
|
||||||
|
wantErr: ErrShortSrc,
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
t: RemoveFunc(func(r rune) bool {
|
||||||
|
return r == utf8.RuneError
|
||||||
|
}),
|
||||||
|
src: "\xcc\u0300\x80",
|
||||||
|
wantStr: "\u0300",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
tc.desc = tc.src
|
||||||
|
if tc.t == nil {
|
||||||
|
tc.t = filter
|
||||||
|
}
|
||||||
|
if tc.dstSize == 0 {
|
||||||
|
tc.dstSize = 100
|
||||||
|
}
|
||||||
|
if tc.srcSize == 0 {
|
||||||
|
tc.srcSize = 100
|
||||||
|
}
|
||||||
|
str, iter, err := doTransform(tc)
|
||||||
|
mi := tc.wantIter != 0 && tc.wantIter != iter
|
||||||
|
if str != tc.wantStr || err != tc.wantErr || mi {
|
||||||
|
t.Errorf("%+q:\ngot iter:%d, %+q, %v\nwant iter:%d, %+q, %v", tc.src, iter, str, err, tc.wantIter, tc.wantStr, tc.wantErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
tc.src = str
|
||||||
|
idem, _, _ := doTransform(tc)
|
||||||
|
if str != idem {
|
||||||
|
t.Errorf("%+q: found %+q; want %+q", tc.src, idem, str)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Загрузка…
Ссылка в новой задаче