go.text/transform: added RemoveFunc transform for removing individual

runes from the input. This corresponds to ICU's Remove transform. For example, to remove accents from characters one could use RemoveFunc as follows: nonspacingMark := func(r rune) bool { return unicode.Is(unicode.Mn, r) } transform.Chain(norm.NFD, transform.RemoveFunc(nonspacingMark), norm.NFC) (Once norm.Form implements Transformer; guess what will be my next CL.) R=r CC=golang-dev, nigeltao https://golang.org/cl/23220043
2013-11-26 08:29:24 +01:00 · 2013-11-26 08:29:24 +01:00 · 191b11aac8
--- a/transform/examples_test.go
+++ b/transform/examples_test.go
@ -0,0 +1,37 @@
 // Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package transform_test
 import (
 	"fmt"
 	"unicode"
 	"code.google.com/p/go.text/transform"
 	"code.google.com/p/go.text/unicode/norm"
 )
 // ExampleRemoveFunc shows RemoveFunc used with three different inputs: dropping
 // white space, keeping only Latin runes, and keeping only Latin runes after
 // the text has been decomposed with NFD.
 func ExampleRemoveFunc() {
 	text := []byte(`tschüß; до свидания`)
 	// One scratch buffer is reused for every call; it is large enough for
 	// each of the results printed below.
 	out := make([]byte, len(text))
 	// Drop every rune that unicode.IsSpace reports as white space.
 	noSpace := transform.RemoveFunc(unicode.IsSpace)
 	nOut, _, _ := noSpace.Transform(out, text, true)
 	fmt.Println(string(out[:nOut]))
 	// Drop every rune that is not in the Latin script.
 	notLatin := func(r rune) bool {
 		return !unicode.Is(unicode.Latin, r)
 	}
 	latinOnly := transform.RemoveFunc(notLatin)
 	nOut, _, _ = latinOnly.Transform(out, text, true)
 	fmt.Println(string(out[:nOut]))
 	// Decomposing first separates the accent from its base letter, so the
 	// base letter is kept while the combining mark is removed.
 	decomposed := norm.NFD.Bytes(text)
 	nOut, _, _ = latinOnly.Transform(out, decomposed, true)
 	fmt.Println(string(out[:nOut]))
 	// Output:
 	// tschüß;досвидания
 	// tschüß
 	// tschuß
 }
--- a/transform/transform.go
+++ b/transform/transform.go
@ -3,13 +3,15 @@
 // license that can be found in the LICENSE file.
 // Package transform provides reader and writer wrappers that transform the
-// bytes passing through. Example transformations, provided by other packages,
+// bytes passing through as well as various transformations. Example
-// include text collation, normalization and conversion between character sets.
+// transformations provided by other packages include normalization and
 // conversion between character sets.
 package transform
 import (
 	"errors"
 	"io"
 	"unicode/utf8"
 )
 var (
@ -413,3 +415,54 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
 	}
 	return dstL.n, srcL.p, err
 }
 // RemoveFunc returns a Transformer that removes from the input all runes r for
 // which f(r) is true. Illegal bytes in the input are replaced by RuneError,
 // unless f(utf8.RuneError) is true, in which case they are removed as well.
 func RemoveFunc(f func(r rune) bool) Transformer {
 	return removeF(f)
 }
 // removeF is a rune predicate used as a Transformer: runes for which the
 // predicate reports true are dropped from the output.
 type removeF func(r rune) bool
 // Transform implements the Transformer interface.
 //
 // It copies src to dst, skipping every rune r for which t(r) is true. It
 // returns ErrShortDst when dst cannot hold the next rune to be written and
 // ErrShortSrc when src ends in what may be a truncated encoding and atEOF is
 // false. In both cases nDst and nSrc describe the progress made so far.
 func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 	// Written in place of each illegal input byte.
 	const replacement = "\uFFFD"
 	// src is advanced as input is consumed, so it always starts at the next
 	// unread byte; nSrc only counts how many bytes were consumed.
 	for len(src) > 0 {
 		// Fast path for ASCII; everything else is decoded.
 		r, sz := rune(src[0]), 1
 		if r >= utf8.RuneSelf {
 			r, sz = utf8.DecodeRune(src)
 		}
 		// DecodeRune reports an illegal or truncated encoding as
 		// (RuneError, 1).
 		invalid := r == utf8.RuneError && sz == 1
 		if invalid && !atEOF && !utf8.FullRune(src) {
 			// The remaining bytes may be the start of a rune that is completed
 			// by input we have not seen yet. The check must look at src
 			// itself: src has already been advanced past the nSrc consumed
 			// bytes, so offsetting it by nSrc again would inspect the wrong
 			// bytes or slice out of range.
 			return nDst, nSrc, ErrShortSrc
 		}
 		if !t(r) {
 			if invalid {
 				// We replace illegal bytes with RuneError. Not doing so might
 				// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
 				// The resulting byte sequence may subsequently contain runes
 				// for which t(r) is true that were passed unnoticed.
 				if nDst+len(replacement) > len(dst) {
 					return nDst, nSrc, ErrShortDst
 				}
 				nDst += copy(dst[nDst:], replacement)
 			} else {
 				if nDst+sz > len(dst) {
 					return nDst, nSrc, ErrShortDst
 				}
 				nDst += copy(dst[nDst:], src[:sz])
 			}
 		}
 		nSrc += sz
 		src = src[sz:]
 	}
 	return nDst, nSrc, nil
 }
--- a/transform/transform_test.go
+++ b/transform/transform_test.go
@ -12,6 +12,7 @@ import (
 	"strconv"
 	"strings"
 	"testing"
 	"unicode/utf8"
 )
 type lowerCaseASCII struct{}
@ -768,3 +769,108 @@ func TestChain(t *testing.T) {
 		break
 	}
 }
 // TestRemoveFunc runs RemoveFunc through a table of inputs covering plain
 // removal, illegal bytes, and short destination and source buffers. Each
 // result is then fed through the same transformer again to check that a second
 // pass leaves it unchanged.
 func TestRemoveFunc(t *testing.T) {
 	// Default transformer for cases that do not supply their own.
 	const removed = "ab\u0300\u1234,"
 	filter := RemoveFunc(func(r rune) bool {
 		return strings.ContainsRune(removed, r)
 	})
 	// Buffer size used when a case leaves dstSize or srcSize unset.
 	const defaultSize = 100
 	cases := []testCase{
 		// Plain removal and pass-through.
 		{src: ",", wantStr: ""},
 		{src: "c", wantStr: "c"},
 		{src: "\u2345", wantStr: "\u2345"},
 		{src: "tschüß", wantStr: "tschüß"},
 		{src: ",до,свидания,", wantStr: "досвидания"},
 		// Illegal bytes become RuneError.
 		{src: "a\xbd\xb2=\xbc ⌘", wantStr: "\uFFFD\uFFFD=\uFFFD ⌘"},
 		// If we didn't replace illegal bytes with RuneError, the result
 		// would be \u0300 or the code would need to be more complex.
 		{src: "\xcc\u0300\x80", wantStr: "\uFFFD\uFFFD"},
 		{src: "\xcc\u0300\x80", dstSize: 3, wantStr: "\uFFFD\uFFFD", wantIter: 2},
 		// Destination too small for the rune or its replacement.
 		{src: "\u2345", dstSize: 2, wantStr: "", wantErr: ErrShortDst},
 		{src: "\xcc", dstSize: 2, wantStr: "", wantErr: ErrShortDst},
 		// Source delivered one byte at a time.
 		{src: "\u0300", dstSize: 2, srcSize: 1, wantStr: "", wantErr: ErrShortSrc},
 		// A transformer that removes RuneError itself.
 		{
 			t: RemoveFunc(func(r rune) bool {
 				return r == utf8.RuneError
 			}),
 			src:     "\xcc\u0300\x80",
 			wantStr: "\u0300",
 		},
 	}
 	for i := range cases {
 		// Work on a copy so the defaults filled in below stay local.
 		c := cases[i]
 		c.desc = c.src
 		if c.t == nil {
 			c.t = filter
 		}
 		if c.dstSize == 0 {
 			c.dstSize = defaultSize
 		}
 		if c.srcSize == 0 {
 			c.srcSize = defaultSize
 		}
 		got, iters, err := doTransform(c)
 		// The iteration count is only checked when a case asks for it.
 		iterMismatch := c.wantIter != 0 && iters != c.wantIter
 		if iterMismatch || got != c.wantStr || err != c.wantErr {
 			t.Errorf("%+q:\ngot  iter:%d, %+q, %v\nwant iter:%d, %+q, %v", c.src, iters, got, err, c.wantIter, c.wantStr, c.wantErr)
 		}
 		// Second pass: transforming the output must not change it.
 		c.src = got
 		again, _, _ := doTransform(c)
 		if again != got {
 			t.Errorf("%+q: found %+q; want %+q", c.src, again, got)
 		}
 	}
 }