зеркало из https://github.com/golang/text.git
631 строка
18 KiB
Go
631 строка
18 KiB
Go
// Copyright 2013 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package transform provides reader and writer wrappers that transform the
|
|
// bytes passing through as well as various transformations. Example
|
|
// transformations provided by other packages include normalization and
|
|
// conversion between character sets.
|
|
package transform // import "golang.org/x/text/transform"
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"io"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
var (
|
|
// ErrShortDst means that the destination buffer was too short to
|
|
// receive all of the transformed bytes.
|
|
ErrShortDst = errors.New("transform: short destination buffer")
|
|
|
|
// ErrShortSrc means that the source buffer has insufficient data to
|
|
// complete the transformation.
|
|
ErrShortSrc = errors.New("transform: short source buffer")
|
|
|
|
// errInconsistentByteCount means that Transform returned success (nil
|
|
// error) but also returned nSrc inconsistent with the src argument.
|
|
errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
|
|
|
|
// errShortInternal means that an internal buffer is not large enough
|
|
// to make progress and the Transform operation must be aborted.
|
|
errShortInternal = errors.New("transform: short internal buffer")
|
|
)
|
|
|
|
// Transformer transforms bytes.
|
|
type Transformer interface {
|
|
// Transform writes to dst the transformed bytes read from src, and
|
|
// returns the number of dst bytes written and src bytes read. The
|
|
// atEOF argument tells whether src represents the last bytes of the
|
|
// input.
|
|
//
|
|
// Callers should always process the nDst bytes produced and account
|
|
// for the nSrc bytes consumed before considering the error err.
|
|
//
|
|
// A nil error means that all of the transformed bytes (whether freshly
|
|
// transformed from src or left over from previous Transform calls)
|
|
// were written to dst. A nil error can be returned regardless of
|
|
// whether atEOF is true. If err is nil then nSrc must equal len(src);
|
|
// the converse is not necessarily true.
|
|
//
|
|
// ErrShortDst means that dst was too short to receive all of the
|
|
// transformed bytes. ErrShortSrc means that src had insufficient data
|
|
// to complete the transformation. If both conditions apply, then
|
|
// either error may be returned. Other than the error conditions listed
|
|
// here, implementations are free to report other errors that arise.
|
|
Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
|
|
|
|
// Reset resets the state and allows a Transformer to be reused.
|
|
Reset()
|
|
}
|
|
|
|
// NopResetter can be embedded by implementations of Transformer to add a nop
|
|
// Reset method.
|
|
type NopResetter struct{}
|
|
|
|
// Reset implements the Reset method of the Transformer interface.
|
|
func (NopResetter) Reset() {}
|
|
|
|
// Reader wraps another io.Reader by transforming the bytes read.
|
|
type Reader struct {
|
|
r io.Reader
|
|
t Transformer
|
|
err error
|
|
|
|
// dst[dst0:dst1] contains bytes that have been transformed by t but
|
|
// not yet copied out via Read.
|
|
dst []byte
|
|
dst0, dst1 int
|
|
|
|
// src[src0:src1] contains bytes that have been read from r but not
|
|
// yet transformed through t.
|
|
src []byte
|
|
src0, src1 int
|
|
|
|
// transformComplete is whether the transformation is complete,
|
|
// regardless of whether or not it was successful.
|
|
transformComplete bool
|
|
}
|
|
|
|
const defaultBufSize = 4096
|
|
|
|
// NewReader returns a new Reader that wraps r by transforming the bytes read
|
|
// via t. It calls Reset on t.
|
|
func NewReader(r io.Reader, t Transformer) *Reader {
|
|
t.Reset()
|
|
return &Reader{
|
|
r: r,
|
|
t: t,
|
|
dst: make([]byte, defaultBufSize),
|
|
src: make([]byte, defaultBufSize),
|
|
}
|
|
}
|
|
|
|
// Read implements the io.Reader interface.
|
|
func (r *Reader) Read(p []byte) (int, error) {
|
|
n, err := 0, error(nil)
|
|
for {
|
|
// Copy out any transformed bytes and return the final error if we are done.
|
|
if r.dst0 != r.dst1 {
|
|
n = copy(p, r.dst[r.dst0:r.dst1])
|
|
r.dst0 += n
|
|
if r.dst0 == r.dst1 && r.transformComplete {
|
|
return n, r.err
|
|
}
|
|
return n, nil
|
|
} else if r.transformComplete {
|
|
return 0, r.err
|
|
}
|
|
|
|
// Try to transform some source bytes, or to flush the transformer if we
|
|
// are out of source bytes. We do this even if r.r.Read returned an error.
|
|
// As the io.Reader documentation says, "process the n > 0 bytes returned
|
|
// before considering the error".
|
|
if r.src0 != r.src1 || r.err != nil {
|
|
r.dst0 = 0
|
|
r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
|
|
r.src0 += n
|
|
|
|
switch {
|
|
case err == nil:
|
|
if r.src0 != r.src1 {
|
|
r.err = errInconsistentByteCount
|
|
}
|
|
// The Transform call was successful; we are complete if we
|
|
// cannot read more bytes into src.
|
|
r.transformComplete = r.err != nil
|
|
continue
|
|
case err == ErrShortDst && (r.dst1 != 0 || n != 0):
|
|
// Make room in dst by copying out, and try again.
|
|
continue
|
|
case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
|
|
// Read more bytes into src via the code below, and try again.
|
|
default:
|
|
r.transformComplete = true
|
|
// The reader error (r.err) takes precedence over the
|
|
// transformer error (err) unless r.err is nil or io.EOF.
|
|
if r.err == nil || r.err == io.EOF {
|
|
r.err = err
|
|
}
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Move any untransformed source bytes to the start of the buffer
|
|
// and read more bytes.
|
|
if r.src0 != 0 {
|
|
r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
|
|
}
|
|
n, r.err = r.r.Read(r.src[r.src1:])
|
|
r.src1 += n
|
|
}
|
|
}
|
|
|
|
// TODO: implement ReadByte (and ReadRune??).
|
|
|
|
// Writer wraps another io.Writer by transforming the bytes read.
|
|
// The user needs to call Close to flush unwritten bytes that may
|
|
// be buffered.
|
|
type Writer struct {
|
|
w io.Writer
|
|
t Transformer
|
|
dst []byte
|
|
|
|
// src[:n] contains bytes that have not yet passed through t.
|
|
src []byte
|
|
n int
|
|
}
|
|
|
|
// NewWriter returns a new Writer that wraps w by transforming the bytes written
|
|
// via t. It calls Reset on t.
|
|
func NewWriter(w io.Writer, t Transformer) *Writer {
|
|
t.Reset()
|
|
return &Writer{
|
|
w: w,
|
|
t: t,
|
|
dst: make([]byte, defaultBufSize),
|
|
src: make([]byte, defaultBufSize),
|
|
}
|
|
}
|
|
|
|
// Write implements the io.Writer interface. If there are not enough
|
|
// bytes available to complete a Transform, the bytes will be buffered
|
|
// for the next write. Call Close to convert the remaining bytes.
|
|
func (w *Writer) Write(data []byte) (n int, err error) {
|
|
src := data
|
|
if w.n > 0 {
|
|
// Append bytes from data to the last remainder.
|
|
// TODO: limit the amount copied on first try.
|
|
n = copy(w.src[w.n:], data)
|
|
w.n += n
|
|
src = w.src[:w.n]
|
|
}
|
|
for {
|
|
nDst, nSrc, err := w.t.Transform(w.dst, src, false)
|
|
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
|
return n, werr
|
|
}
|
|
src = src[nSrc:]
|
|
if w.n > 0 && len(src) <= n {
|
|
// Enough bytes from w.src have been consumed. We make src point
|
|
// to data instead to reduce the copying.
|
|
w.n = 0
|
|
n -= len(src)
|
|
src = data[n:]
|
|
if n < len(data) && (err == nil || err == ErrShortSrc) {
|
|
continue
|
|
}
|
|
} else {
|
|
n += nSrc
|
|
}
|
|
switch {
|
|
case err == ErrShortDst && (nDst > 0 || nSrc > 0):
|
|
case err == ErrShortSrc && len(src) < len(w.src):
|
|
m := copy(w.src, src)
|
|
// If w.n > 0, bytes from data were already copied to w.src and n
|
|
// was already set to the number of bytes consumed.
|
|
if w.n == 0 {
|
|
n += m
|
|
}
|
|
w.n = m
|
|
return n, nil
|
|
case err == nil && w.n > 0:
|
|
return n, errInconsistentByteCount
|
|
default:
|
|
return n, err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Close implements the io.Closer interface.
|
|
func (w *Writer) Close() error {
|
|
for src := w.src[:w.n]; len(src) > 0; {
|
|
nDst, nSrc, err := w.t.Transform(w.dst, src, true)
|
|
if nDst == 0 {
|
|
return err
|
|
}
|
|
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
|
return werr
|
|
}
|
|
if err != ErrShortDst {
|
|
return err
|
|
}
|
|
src = src[nSrc:]
|
|
}
|
|
return nil
|
|
}
|
|
|
|
type nop struct{ NopResetter }
|
|
|
|
func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
n := copy(dst, src)
|
|
if n < len(src) {
|
|
err = ErrShortDst
|
|
}
|
|
return n, n, err
|
|
}
|
|
|
|
type discard struct{ NopResetter }
|
|
|
|
func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
return 0, len(src), nil
|
|
}
|
|
|
|
var (
|
|
// Discard is a Transformer for which all Transform calls succeed
|
|
// by consuming all bytes and writing nothing.
|
|
Discard Transformer = discard{}
|
|
|
|
// Nop is a Transformer that copies src to dst.
|
|
Nop Transformer = nop{}
|
|
)
|
|
|
|
// chain is a sequence of links. A chain with N Transformers has N+1 links and
|
|
// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
|
|
// buffers given to chain.Transform and the middle N-1 buffers are intermediate
|
|
// buffers owned by the chain. The i'th link transforms bytes from the i'th
|
|
// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
|
|
// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
|
|
type chain struct {
|
|
link []link
|
|
err error
|
|
// errStart is the index at which the error occurred plus 1. Processing
|
|
// errStart at this level at the next call to Transform. As long as
|
|
// errStart > 0, chain will not consume any more source bytes.
|
|
errStart int
|
|
}
|
|
|
|
func (c *chain) fatalError(errIndex int, err error) {
|
|
if i := errIndex + 1; i > c.errStart {
|
|
c.errStart = i
|
|
c.err = err
|
|
}
|
|
}
|
|
|
|
type link struct {
|
|
t Transformer
|
|
// b[p:n] holds the bytes to be transformed by t.
|
|
b []byte
|
|
p int
|
|
n int
|
|
}
|
|
|
|
func (l *link) src() []byte {
|
|
return l.b[l.p:l.n]
|
|
}
|
|
|
|
func (l *link) dst() []byte {
|
|
return l.b[l.n:]
|
|
}
|
|
|
|
// Chain returns a Transformer that applies t in sequence.
|
|
func Chain(t ...Transformer) Transformer {
|
|
if len(t) == 0 {
|
|
return nop{}
|
|
}
|
|
c := &chain{link: make([]link, len(t)+1)}
|
|
for i, tt := range t {
|
|
c.link[i].t = tt
|
|
}
|
|
// Allocate intermediate buffers.
|
|
b := make([][defaultBufSize]byte, len(t)-1)
|
|
for i := range b {
|
|
c.link[i+1].b = b[i][:]
|
|
}
|
|
return c
|
|
}
|
|
|
|
// Reset resets the state of Chain. It calls Reset on all the Transformers.
|
|
func (c *chain) Reset() {
|
|
for i, l := range c.link {
|
|
if l.t != nil {
|
|
l.t.Reset()
|
|
}
|
|
c.link[i].p, c.link[i].n = 0, 0
|
|
}
|
|
}
|
|
|
|
// Transform applies the transformers of c in sequence.
|
|
func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
// Set up src and dst in the chain.
|
|
srcL := &c.link[0]
|
|
dstL := &c.link[len(c.link)-1]
|
|
srcL.b, srcL.p, srcL.n = src, 0, len(src)
|
|
dstL.b, dstL.n = dst, 0
|
|
var lastFull, needProgress bool // for detecting progress
|
|
|
|
// i is the index of the next Transformer to apply, for i in [low, high].
|
|
// low is the lowest index for which c.link[low] may still produce bytes.
|
|
// high is the highest index for which c.link[high] has a Transformer.
|
|
// The error returned by Transform determines whether to increase or
|
|
// decrease i. We try to completely fill a buffer before converting it.
|
|
for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
|
|
in, out := &c.link[i], &c.link[i+1]
|
|
nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
|
|
out.n += nDst
|
|
in.p += nSrc
|
|
if i > 0 && in.p == in.n {
|
|
in.p, in.n = 0, 0
|
|
}
|
|
needProgress, lastFull = lastFull, false
|
|
switch err0 {
|
|
case ErrShortDst:
|
|
// Process the destination buffer next. Return if we are already
|
|
// at the high index.
|
|
if i == high {
|
|
return dstL.n, srcL.p, ErrShortDst
|
|
}
|
|
if out.n != 0 {
|
|
i++
|
|
// If the Transformer at the next index is not able to process any
|
|
// source bytes there is nothing that can be done to make progress
|
|
// and the bytes will remain unprocessed. lastFull is used to
|
|
// detect this and break out of the loop with a fatal error.
|
|
lastFull = true
|
|
continue
|
|
}
|
|
// The destination buffer was too small, but is completely empty.
|
|
// Return a fatal error as this transformation can never complete.
|
|
c.fatalError(i, errShortInternal)
|
|
case ErrShortSrc:
|
|
if i == 0 {
|
|
// Save ErrShortSrc in err. All other errors take precedence.
|
|
err = ErrShortSrc
|
|
break
|
|
}
|
|
// Source bytes were depleted before filling up the destination buffer.
|
|
// Verify we made some progress, move the remaining bytes to the errStart
|
|
// and try to get more source bytes.
|
|
if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
|
|
// There were not enough source bytes to proceed while the source
|
|
// buffer cannot hold any more bytes. Return a fatal error as this
|
|
// transformation can never complete.
|
|
c.fatalError(i, errShortInternal)
|
|
break
|
|
}
|
|
// in.b is an internal buffer and we can make progress.
|
|
in.p, in.n = 0, copy(in.b, in.src())
|
|
fallthrough
|
|
case nil:
|
|
// if i == low, we have depleted the bytes at index i or any lower levels.
|
|
// In that case we increase low and i. In all other cases we decrease i to
|
|
// fetch more bytes before proceeding to the next index.
|
|
if i > low {
|
|
i--
|
|
continue
|
|
}
|
|
default:
|
|
c.fatalError(i, err0)
|
|
}
|
|
// Exhausted level low or fatal error: increase low and continue
|
|
// to process the bytes accepted so far.
|
|
i++
|
|
low = i
|
|
}
|
|
|
|
// If c.errStart > 0, this means we found a fatal error. We will clear
|
|
// all upstream buffers. At this point, no more progress can be made
|
|
// downstream, as Transform would have bailed while handling ErrShortDst.
|
|
if c.errStart > 0 {
|
|
for i := 1; i < c.errStart; i++ {
|
|
c.link[i].p, c.link[i].n = 0, 0
|
|
}
|
|
err, c.errStart, c.err = c.err, 0, nil
|
|
}
|
|
return dstL.n, srcL.p, err
|
|
}
|
|
|
|
// RemoveFunc returns a Transformer that removes from the input all runes r for
|
|
// which f(r) is true. Illegal bytes in the input are replaced by RuneError.
|
|
func RemoveFunc(f func(r rune) bool) Transformer {
|
|
return removeF(f)
|
|
}
|
|
|
|
type removeF func(r rune) bool
|
|
|
|
func (removeF) Reset() {}
|
|
|
|
// Transform implements the Transformer interface.
|
|
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
|
for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
|
|
|
|
if r = rune(src[0]); r < utf8.RuneSelf {
|
|
sz = 1
|
|
} else {
|
|
r, sz = utf8.DecodeRune(src)
|
|
|
|
if sz == 1 {
|
|
// Invalid rune.
|
|
if !atEOF && !utf8.FullRune(src) {
|
|
err = ErrShortSrc
|
|
break
|
|
}
|
|
// We replace illegal bytes with RuneError. Not doing so might
|
|
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
|
// The resulting byte sequence may subsequently contain runes
|
|
// for which t(r) is true that were passed unnoticed.
|
|
if !t(r) {
|
|
if nDst+3 > len(dst) {
|
|
err = ErrShortDst
|
|
break
|
|
}
|
|
nDst += copy(dst[nDst:], "\uFFFD")
|
|
}
|
|
nSrc++
|
|
continue
|
|
}
|
|
}
|
|
|
|
if !t(r) {
|
|
if nDst+sz > len(dst) {
|
|
err = ErrShortDst
|
|
break
|
|
}
|
|
nDst += copy(dst[nDst:], src[:sz])
|
|
}
|
|
nSrc += sz
|
|
}
|
|
return
|
|
}
|
|
|
|
// grow returns a new []byte that is longer than b, and copies the first n bytes
|
|
// of b to the start of the new slice.
|
|
func grow(b []byte, n int) []byte {
|
|
m := len(b)
|
|
if m <= 256 {
|
|
m *= 2
|
|
} else {
|
|
m += m >> 1
|
|
}
|
|
buf := make([]byte, m)
|
|
copy(buf, b[:n])
|
|
return buf
|
|
}
|
|
|
|
const initialBufSize = 128
|
|
|
|
// String returns a string with the result of converting s[:n] using t, where
|
|
// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
|
|
func String(t Transformer, s string) (result string, n int, err error) {
|
|
if s == "" {
|
|
return "", 0, nil
|
|
}
|
|
|
|
t.Reset()
|
|
|
|
// Allocate only once. Note that both dst and src escape when passed to
|
|
// Transform.
|
|
buf := [2 * initialBufSize]byte{}
|
|
dst := buf[:initialBufSize:initialBufSize]
|
|
src := buf[initialBufSize : 2*initialBufSize]
|
|
|
|
// Avoid allocation if the transformed string is identical to the original.
|
|
// After this loop, pDst will point to the furthest point in s for which it
|
|
// could be detected that t gives equal results, src[:nSrc] will
|
|
// indicated the last processed chunk of s for which the output is not equal
|
|
// and dst[:nDst] will be the transform of this chunk.
|
|
var nDst, nSrc int
|
|
pDst := 0 // Used as index in both src and dst in this loop.
|
|
for {
|
|
n := copy(src, s[pDst:])
|
|
nDst, nSrc, err = t.Transform(dst, src[:n], pDst+n == len(s))
|
|
|
|
// Note 1: we will not enter the loop with pDst == len(s) and we will
|
|
// not end the loop with it either. So if nSrc is 0, this means there is
|
|
// some kind of error from which we cannot recover given the current
|
|
// buffer sizes. We will give up in this case.
|
|
// Note 2: it is not entirely correct to simply do a bytes.Equal as
|
|
// a Transformer may buffer internally. It will work in most cases,
|
|
// though, and no harm is done if it doesn't work.
|
|
// TODO: let transformers implement an optional Spanner interface, akin
|
|
// to norm's QuickSpan. This would even allow us to avoid any allocation.
|
|
if nSrc == 0 || !bytes.Equal(dst[:nDst], src[:nSrc]) {
|
|
break
|
|
}
|
|
|
|
if pDst += nDst; pDst == len(s) {
|
|
return s, pDst, nil
|
|
}
|
|
}
|
|
|
|
// Move the bytes seen so far to dst.
|
|
pSrc := pDst + nSrc
|
|
if pDst+nDst <= initialBufSize {
|
|
copy(dst[pDst:], dst[:nDst])
|
|
} else {
|
|
b := make([]byte, len(s)+nDst-nSrc)
|
|
copy(b[pDst:], dst[:nDst])
|
|
dst = b
|
|
}
|
|
copy(dst, s[:pDst])
|
|
pDst += nDst
|
|
|
|
if err != nil && err != ErrShortDst && err != ErrShortSrc {
|
|
return string(dst[:pDst]), pSrc, err
|
|
}
|
|
|
|
// Complete the string with the remainder.
|
|
for {
|
|
n := copy(src, s[pSrc:])
|
|
nDst, nSrc, err = t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
|
|
pDst += nDst
|
|
pSrc += nSrc
|
|
|
|
switch err {
|
|
case nil:
|
|
if pSrc == len(s) {
|
|
return string(dst[:pDst]), pSrc, nil
|
|
}
|
|
case ErrShortDst:
|
|
// Do not grow as long as we can make progress. This may avoid
|
|
// excessive allocations.
|
|
if nDst == 0 {
|
|
dst = grow(dst, pDst)
|
|
}
|
|
case ErrShortSrc:
|
|
if nSrc == 0 {
|
|
src = grow(src, 0)
|
|
}
|
|
default:
|
|
return string(dst[:pDst]), pSrc, err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Bytes returns a new byte slice with the result of converting b[:n] using t,
|
|
// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
|
|
func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
|
|
return doAppend(t, 0, make([]byte, len(b)), b)
|
|
}
|
|
|
|
// Append appends the result of converting src[:n] using t to dst, where
|
|
// n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
|
|
func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
|
|
if len(dst) == cap(dst) {
|
|
n := len(src) + len(dst) // It is okay for this to be 0.
|
|
b := make([]byte, n)
|
|
dst = b[:copy(b, dst)]
|
|
}
|
|
return doAppend(t, len(dst), dst[:cap(dst)], src)
|
|
}
|
|
|
|
func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
|
|
t.Reset()
|
|
pSrc := 0
|
|
for {
|
|
nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
|
|
pDst += nDst
|
|
pSrc += nSrc
|
|
if err != ErrShortDst {
|
|
return dst[:pDst], pSrc, err
|
|
}
|
|
|
|
// Grow the destination buffer, but do not grow as long as we can make
|
|
// progress. This may avoid excessive allocations.
|
|
if nDst == 0 {
|
|
dst = grow(dst, pDst)
|
|
}
|
|
}
|
|
}
|