go.text/transform: added RemoveFunc transform for removing individual

runes from the input. This corresponds to ICU's Remove transform. For example, to remove accents from characters one could use RemoveFunc as follows: nonspacingMark := func(r rune) bool { return unicode.Is(unicode.Mn, r) } transform.Chain(norm.NFD, transform.RemoveFunc(nonspacingMark), norm.NFC) (Once norm.Form implements Transformer; guess what will be my next CL.) R=r CC=golang-dev, nigeltao https://golang.org/cl/23220043
2013-11-26 08:29:24 +01:00 · 2013-11-26 08:29:24 +01:00 · 191b11aac8
--- a/transform/examples_test.go
+++ b/transform/examples_test.go
@ -0,0 +1,37 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package transform_test
+
+import (
+	"fmt"
+	"unicode"
+
+	"code.google.com/p/go.text/transform"
+	"code.google.com/p/go.text/unicode/norm"
+)
+
+func ExampleRemoveFunc() {
+	input := []byte(`tschüß; до свидания`)
+
+	b := make([]byte, len(input))
+
+	t := transform.RemoveFunc(unicode.IsSpace)
+	n, _, _ := t.Transform(b, input, true)
+	fmt.Println(string(b[:n]))
+
+	t = transform.RemoveFunc(func(r rune) bool {
+		return !unicode.Is(unicode.Latin, r)
+	})
+	n, _, _ = t.Transform(b, input, true)
+	fmt.Println(string(b[:n]))
+
+	n, _, _ = t.Transform(b, norm.NFD.Bytes(input), true)
+	fmt.Println(string(b[:n]))
+
+	// Output:
+	// tschüß;досвидания
+	// tschüß
+	// tschuß
+}
--- a/transform/transform.go
+++ b/transform/transform.go
@ -3,13 +3,15 @@
 // license that can be found in the LICENSE file.

 // Package transform provides reader and writer wrappers that transform the
-// bytes passing through. Example transformations, provided by other packages,
-// include text collation, normalization and conversion between character sets.
+// bytes passing through as well as various transformations. Example
+// transformations provided by other packages include normalization and
+// conversion between character sets.
 package transform

 import (
 	"errors"
 	"io"
+	"unicode/utf8"
 )

 var (
@ -413,3 +415,54 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
 	}
 	return dstL.n, srcL.p, err
 }
+
+// RemoveFunc returns a Transformer that removes from the input all runes r for
+// which f(r) is true. Illegal bytes in the input are replaced by RuneError.
+func RemoveFunc(f func(r rune) bool) Transformer {
+	return removeF(f)
+}
+
+type removeF func(r rune) bool
+
+// Transform implements the Transformer interface.
+func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
+	for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
+
+		if r = rune(src[0]); r < utf8.RuneSelf {
+			sz = 1
+		} else {
+			r, sz = utf8.DecodeRune(src)
+
+			if sz == 1 {
+				// Invalid rune.
+				if !atEOF && !utf8.FullRune(src[nSrc:]) {
+					err = ErrShortSrc
+					break
+				}
+				// We replace illegal bytes with RuneError. Not doing so might
+				// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
+				// The resulting byte sequence may subsequently contain runes
+				// for which t(r) is true that were passed unnoticed.
+				if !t(r) {
+					if nDst+3 > len(dst) {
+						err = ErrShortDst
+						break
+					}
+					nDst += copy(dst[nDst:], "\uFFFD")
+				}
+				nSrc++
+				continue
+			}
+		}
+
+		if !t(r) {
+			if nDst+sz > len(dst) {
+				err = ErrShortDst
+				break
+			}
+			nDst += copy(dst[nDst:], src[:sz])
+		}
+		nSrc += sz
+	}
+	return
+}
--- a/transform/transform_test.go
+++ b/transform/transform_test.go
@ -12,6 +12,7 @@ import (
 	"strconv"
 	"strings"
 	"testing"
+	"unicode/utf8"
 )

 type lowerCaseASCII struct{}
@ -768,3 +769,108 @@ func TestChain(t *testing.T) {
 		break
 	}
 }
+
+func TestRemoveFunc(t *testing.T) {
+	filter := RemoveFunc(func(r rune) bool {
+		return strings.IndexRune("ab\u0300\u1234,", r) != -1
+	})
+	tests := []testCase{
+		{
+			src:     ",",
+			wantStr: "",
+		},
+
+		{
+			src:     "c",
+			wantStr: "c",
+		},
+
+		{
+			src:     "\u2345",
+			wantStr: "\u2345",
+		},
+
+		{
+			src:     "tschüß",
+			wantStr: "tschüß",
+		},
+
+		{
+			src:     ",до,свидания,",
+			wantStr: "досвидания",
+		},
+
+		{
+			src:     "a\xbd\xb2=\xbc ⌘",
+			wantStr: "\uFFFD\uFFFD=\uFFFD ⌘",
+		},
+
+		{
+			// If we didn't replace illegal bytes with RuneError, the result
+			// would be \u0300 or the code would need to be more complex.
+			src:     "\xcc\u0300\x80",
+			wantStr: "\uFFFD\uFFFD",
+		},
+
+		{
+			src:      "\xcc\u0300\x80",
+			dstSize:  3,
+			wantStr:  "\uFFFD\uFFFD",
+			wantIter: 2,
+		},
+
+		{
+			src:     "\u2345",
+			dstSize: 2,
+			wantStr: "",
+			wantErr: ErrShortDst,
+		},
+
+		{
+			src:     "\xcc",
+			dstSize: 2,
+			wantStr: "",
+			wantErr: ErrShortDst,
+		},
+
+		{
+			src:     "\u0300",
+			dstSize: 2,
+			srcSize: 1,
+			wantStr: "",
+			wantErr: ErrShortSrc,
+		},
+
+		{
+			t: RemoveFunc(func(r rune) bool {
+				return r == utf8.RuneError
+			}),
+			src:     "\xcc\u0300\x80",
+			wantStr: "\u0300",
+		},
+	}
+
+	for _, tc := range tests {
+		tc.desc = tc.src
+		if tc.t == nil {
+			tc.t = filter
+		}
+		if tc.dstSize == 0 {
+			tc.dstSize = 100
+		}
+		if tc.srcSize == 0 {
+			tc.srcSize = 100
+		}
+		str, iter, err := doTransform(tc)
+		mi := tc.wantIter != 0 && tc.wantIter != iter
+		if str != tc.wantStr || err != tc.wantErr || mi {
+			t.Errorf("%+q:\ngot  iter:%d, %+q, %v\nwant iter:%d, %+q, %v", tc.src, iter, str, err, tc.wantIter, tc.wantStr, tc.wantErr)
+		}
+
+		tc.src = str
+		idem, _, _ := doTransform(tc)
+		if str != idem {
+			t.Errorf("%+q: found %+q; want %+q", tc.src, idem, str)
+		}
+	}
+}