From 191b11aac8d8364f3d6f2a141b9d86d6bed91d0b Mon Sep 17 00:00:00 2001 From: Marcel van Lohuizen Date: Tue, 26 Nov 2013 08:29:24 +0100 Subject: [PATCH] go.text/transform: added RemoveFunc transform for removing individual runes from the input. This corresponds to ICU's Remove transform. For example, to remove accents from characters one could use RemoveFunc as follows: nonspacingMark := func(r rune) bool { return unicode.Is(unicode.Mn, r) } transform.Chain(norm.NFD, transform.RemoveFunc(nonspacingMark), norm.NFC) (Once norm.Form implements Transformer; guess what will be my next CL.) R=r CC=golang-dev, nigeltao https://golang.org/cl/23220043 --- transform/examples_test.go | 37 +++++++++++++ transform/transform.go | 57 ++++++++++++++++++- transform/transform_test.go | 106 ++++++++++++++++++++++++++++++++++++ 3 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 transform/examples_test.go diff --git a/transform/examples_test.go b/transform/examples_test.go new file mode 100644 index 0000000..3aff8e1 --- /dev/null +++ b/transform/examples_test.go @@ -0,0 +1,37 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package transform_test + +import ( + "fmt" + "unicode" + + "code.google.com/p/go.text/transform" + "code.google.com/p/go.text/unicode/norm" +) + +func ExampleRemoveFunc() { + input := []byte(`tschüß; до свидания`) + + b := make([]byte, len(input)) + + t := transform.RemoveFunc(unicode.IsSpace) + n, _, _ := t.Transform(b, input, true) + fmt.Println(string(b[:n])) + + t = transform.RemoveFunc(func(r rune) bool { + return !unicode.Is(unicode.Latin, r) + }) + n, _, _ = t.Transform(b, input, true) + fmt.Println(string(b[:n])) + + n, _, _ = t.Transform(b, norm.NFD.Bytes(input), true) + fmt.Println(string(b[:n])) + + // Output: + // tschüß;досвидания + // tschüß + // tschuß +} diff --git a/transform/transform.go b/transform/transform.go index 9333d30..1ab1af4 100644 --- a/transform/transform.go +++ b/transform/transform.go @@ -3,13 +3,15 @@ // license that can be found in the LICENSE file. // Package transform provides reader and writer wrappers that transform the -// bytes passing through. Example transformations, provided by other packages, -// include text collation, normalization and conversion between character sets. +// bytes passing through as well as various transformations. Example +// transformations provided by other packages include normalization and +// conversion between character sets. package transform import ( "errors" "io" + "unicode/utf8" ) var ( @@ -413,3 +415,54 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro } return dstL.n, srcL.p, err } + +// RemoveFunc returns a Transformer that removes from the input all runes r for +// which f(r) is true. Illegal bytes in the input are replaced by RuneError. +func RemoveFunc(f func(r rune) bool) Transformer { + return removeF(f) +} + +type removeF func(r rune) bool + +// Transform implements the Transformer interface. +func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] { + + if r = rune(src[0]); r < utf8.RuneSelf { + sz = 1 + } else { + r, sz = utf8.DecodeRune(src) + + if sz == 1 { + // Invalid rune. + if !atEOF && !utf8.FullRune(src[nSrc:]) { + err = ErrShortSrc + break + } + // We replace illegal bytes with RuneError. Not doing so might + // otherwise turn a sequence of invalid UTF-8 into valid UTF-8. + // The resulting byte sequence may subsequently contain runes + // for which t(r) is true that were passed unnoticed. + if !t(r) { + if nDst+3 > len(dst) { + err = ErrShortDst + break + } + nDst += copy(dst[nDst:], "\uFFFD") + } + nSrc++ + continue + } + } + + if !t(r) { + if nDst+sz > len(dst) { + err = ErrShortDst + break + } + nDst += copy(dst[nDst:], src[:sz]) + } + nSrc += sz + } + return +} diff --git a/transform/transform_test.go b/transform/transform_test.go index a0bd72f..507d132 100644 --- a/transform/transform_test.go +++ b/transform/transform_test.go @@ -12,6 +12,7 @@ import ( "strconv" "strings" "testing" + "unicode/utf8" ) type lowerCaseASCII struct{} @@ -768,3 +769,108 @@ func TestChain(t *testing.T) { break } } + +func TestRemoveFunc(t *testing.T) { + filter := RemoveFunc(func(r rune) bool { + return strings.IndexRune("ab\u0300\u1234,", r) != -1 + }) + tests := []testCase{ + { + src: ",", + wantStr: "", + }, + + { + src: "c", + wantStr: "c", + }, + + { + src: "\u2345", + wantStr: "\u2345", + }, + + { + src: "tschüß", + wantStr: "tschüß", + }, + + { + src: ",до,свидания,", + wantStr: "досвидания", + }, + + { + src: "a\xbd\xb2=\xbc ⌘", + wantStr: "\uFFFD\uFFFD=\uFFFD ⌘", + }, + + { + // If we didn't replace illegal bytes with RuneError, the result + // would be \u0300 or the code would need to be more complex. + src: "\xcc\u0300\x80", + wantStr: "\uFFFD\uFFFD", + }, + + { + src: "\xcc\u0300\x80", + dstSize: 3, + wantStr: "\uFFFD\uFFFD", + wantIter: 2, + }, + + { + src: "\u2345", + dstSize: 2, + wantStr: "", + wantErr: ErrShortDst, + }, + + { + src: "\xcc", + dstSize: 2, + wantStr: "", + wantErr: ErrShortDst, + }, + + { + src: "\u0300", + dstSize: 2, + srcSize: 1, + wantStr: "", + wantErr: ErrShortSrc, + }, + + { + t: RemoveFunc(func(r rune) bool { + return r == utf8.RuneError + }), + src: "\xcc\u0300\x80", + wantStr: "\u0300", + }, + } + + for _, tc := range tests { + tc.desc = tc.src + if tc.t == nil { + tc.t = filter + } + if tc.dstSize == 0 { + tc.dstSize = 100 + } + if tc.srcSize == 0 { + tc.srcSize = 100 + } + str, iter, err := doTransform(tc) + mi := tc.wantIter != 0 && tc.wantIter != iter + if str != tc.wantStr || err != tc.wantErr || mi { + t.Errorf("%+q:\ngot iter:%d, %+q, %v\nwant iter:%d, %+q, %v", tc.src, iter, str, err, tc.wantIter, tc.wantStr, tc.wantErr) + } + + tc.src = str + idem, _, _ := doTransform(tc) + if str != idem { + t.Errorf("%+q: found %+q; want %+q", tc.src, idem, str) + } + } +}