зеркало из https://github.com/golang/text.git
go.text/transform: added RemoveFunc transform for removing individual
runes from the input. This corresponds to ICU's Remove transform. For example, to remove accents from characters one could use RemoveFunc as follows: nonspacingMark := func(r rune) bool { return unicode.Is(unicode.Mn, r) } transform.Chain(norm.NFD, transform.RemoveFunc(nonspacingMark), norm.NFC) (Once norm.Form implements Transformer; guess what will be my next CL.) R=r CC=golang-dev, nigeltao https://golang.org/cl/23220043
This commit is contained in:
Родитель
089e4d2d44
Коммит
191b11aac8
|
@ -0,0 +1,37 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package transform_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode"
|
||||
|
||||
"code.google.com/p/go.text/transform"
|
||||
"code.google.com/p/go.text/unicode/norm"
|
||||
)
|
||||
|
||||
func ExampleRemoveFunc() {
|
||||
input := []byte(`tschüß; до свидания`)
|
||||
|
||||
b := make([]byte, len(input))
|
||||
|
||||
t := transform.RemoveFunc(unicode.IsSpace)
|
||||
n, _, _ := t.Transform(b, input, true)
|
||||
fmt.Println(string(b[:n]))
|
||||
|
||||
t = transform.RemoveFunc(func(r rune) bool {
|
||||
return !unicode.Is(unicode.Latin, r)
|
||||
})
|
||||
n, _, _ = t.Transform(b, input, true)
|
||||
fmt.Println(string(b[:n]))
|
||||
|
||||
n, _, _ = t.Transform(b, norm.NFD.Bytes(input), true)
|
||||
fmt.Println(string(b[:n]))
|
||||
|
||||
// Output:
|
||||
// tschüß;досвидания
|
||||
// tschüß
|
||||
// tschuß
|
||||
}
|
|
@ -3,13 +3,15 @@
|
|||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package transform provides reader and writer wrappers that transform the
|
||||
// bytes passing through. Example transformations, provided by other packages,
|
||||
// include text collation, normalization and conversion between character sets.
|
||||
// bytes passing through as well as various transformations. Example
|
||||
// transformations provided by other packages include normalization and
|
||||
// conversion between character sets.
|
||||
package transform
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -413,3 +415,54 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
|
|||
}
|
||||
return dstL.n, srcL.p, err
|
||||
}
|
||||
|
||||
// RemoveFunc returns a Transformer that removes from the input all runes r for
|
||||
// which f(r) is true. Illegal bytes in the input are replaced by RuneError.
|
||||
func RemoveFunc(f func(r rune) bool) Transformer {
|
||||
return removeF(f)
|
||||
}
|
||||
|
||||
type removeF func(r rune) bool
|
||||
|
||||
// Transform implements the Transformer interface.
|
||||
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
|
||||
|
||||
if r = rune(src[0]); r < utf8.RuneSelf {
|
||||
sz = 1
|
||||
} else {
|
||||
r, sz = utf8.DecodeRune(src)
|
||||
|
||||
if sz == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = ErrShortSrc
|
||||
break
|
||||
}
|
||||
// We replace illegal bytes with RuneError. Not doing so might
|
||||
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||
// The resulting byte sequence may subsequently contain runes
|
||||
// for which t(r) is true that were passed unnoticed.
|
||||
if !t(r) {
|
||||
if nDst+3 > len(dst) {
|
||||
err = ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += copy(dst[nDst:], "\uFFFD")
|
||||
}
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if !t(r) {
|
||||
if nDst+sz > len(dst) {
|
||||
err = ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += copy(dst[nDst:], src[:sz])
|
||||
}
|
||||
nSrc += sz
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
type lowerCaseASCII struct{}
|
||||
|
@ -768,3 +769,108 @@ func TestChain(t *testing.T) {
|
|||
break
|
||||
}
|
||||
}
|
||||
|
||||
func TestRemoveFunc(t *testing.T) {
|
||||
filter := RemoveFunc(func(r rune) bool {
|
||||
return strings.IndexRune("ab\u0300\u1234,", r) != -1
|
||||
})
|
||||
tests := []testCase{
|
||||
{
|
||||
src: ",",
|
||||
wantStr: "",
|
||||
},
|
||||
|
||||
{
|
||||
src: "c",
|
||||
wantStr: "c",
|
||||
},
|
||||
|
||||
{
|
||||
src: "\u2345",
|
||||
wantStr: "\u2345",
|
||||
},
|
||||
|
||||
{
|
||||
src: "tschüß",
|
||||
wantStr: "tschüß",
|
||||
},
|
||||
|
||||
{
|
||||
src: ",до,свидания,",
|
||||
wantStr: "досвидания",
|
||||
},
|
||||
|
||||
{
|
||||
src: "a\xbd\xb2=\xbc ⌘",
|
||||
wantStr: "\uFFFD\uFFFD=\uFFFD ⌘",
|
||||
},
|
||||
|
||||
{
|
||||
// If we didn't replace illegal bytes with RuneError, the result
|
||||
// would be \u0300 or the code would need to be more complex.
|
||||
src: "\xcc\u0300\x80",
|
||||
wantStr: "\uFFFD\uFFFD",
|
||||
},
|
||||
|
||||
{
|
||||
src: "\xcc\u0300\x80",
|
||||
dstSize: 3,
|
||||
wantStr: "\uFFFD\uFFFD",
|
||||
wantIter: 2,
|
||||
},
|
||||
|
||||
{
|
||||
src: "\u2345",
|
||||
dstSize: 2,
|
||||
wantStr: "",
|
||||
wantErr: ErrShortDst,
|
||||
},
|
||||
|
||||
{
|
||||
src: "\xcc",
|
||||
dstSize: 2,
|
||||
wantStr: "",
|
||||
wantErr: ErrShortDst,
|
||||
},
|
||||
|
||||
{
|
||||
src: "\u0300",
|
||||
dstSize: 2,
|
||||
srcSize: 1,
|
||||
wantStr: "",
|
||||
wantErr: ErrShortSrc,
|
||||
},
|
||||
|
||||
{
|
||||
t: RemoveFunc(func(r rune) bool {
|
||||
return r == utf8.RuneError
|
||||
}),
|
||||
src: "\xcc\u0300\x80",
|
||||
wantStr: "\u0300",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
tc.desc = tc.src
|
||||
if tc.t == nil {
|
||||
tc.t = filter
|
||||
}
|
||||
if tc.dstSize == 0 {
|
||||
tc.dstSize = 100
|
||||
}
|
||||
if tc.srcSize == 0 {
|
||||
tc.srcSize = 100
|
||||
}
|
||||
str, iter, err := doTransform(tc)
|
||||
mi := tc.wantIter != 0 && tc.wantIter != iter
|
||||
if str != tc.wantStr || err != tc.wantErr || mi {
|
||||
t.Errorf("%+q:\ngot iter:%d, %+q, %v\nwant iter:%d, %+q, %v", tc.src, iter, str, err, tc.wantIter, tc.wantStr, tc.wantErr)
|
||||
}
|
||||
|
||||
tc.src = str
|
||||
idem, _, _ := doTransform(tc)
|
||||
if str != idem {
|
||||
t.Errorf("%+q: found %+q; want %+q", tc.src, idem, str)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче