123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705 |
- // Copyright 2013 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // Package transform provides reader and writer wrappers that transform the
- // bytes passing through as well as various transformations. Example
- // transformations provided by other packages include normalization and
- // conversion between character sets.
- package transform // import "golang.org/x/text/transform"
- import (
- "bytes"
- "errors"
- "io"
- "unicode/utf8"
- )
- var (
- // ErrShortDst means that the destination buffer was too short to
- // receive all of the transformed bytes.
- ErrShortDst = errors.New("transform: short destination buffer")
- // ErrShortSrc means that the source buffer has insufficient data to
- // complete the transformation.
- ErrShortSrc = errors.New("transform: short source buffer")
- // ErrEndOfSpan means that the input and output (the transformed input)
- // are not identical.
- ErrEndOfSpan = errors.New("transform: input and output are not identical")
- // errInconsistentByteCount means that Transform returned success (nil
- // error) but also returned nSrc inconsistent with the src argument.
- errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
- // errShortInternal means that an internal buffer is not large enough
- // to make progress and the Transform operation must be aborted.
- errShortInternal = errors.New("transform: short internal buffer")
- )
- // Transformer transforms bytes.
- type Transformer interface {
- // Transform writes to dst the transformed bytes read from src, and
- // returns the number of dst bytes written and src bytes read. The
- // atEOF argument tells whether src represents the last bytes of the
- // input.
- //
- // Callers should always process the nDst bytes produced and account
- // for the nSrc bytes consumed before considering the error err.
- //
- // A nil error means that all of the transformed bytes (whether freshly
- // transformed from src or left over from previous Transform calls)
- // were written to dst. A nil error can be returned regardless of
- // whether atEOF is true. If err is nil then nSrc must equal len(src);
- // the converse is not necessarily true.
- //
- // ErrShortDst means that dst was too short to receive all of the
- // transformed bytes. ErrShortSrc means that src had insufficient data
- // to complete the transformation. If both conditions apply, then
- // either error may be returned. Other than the error conditions listed
- // here, implementations are free to report other errors that arise.
- Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
- // Reset resets the state and allows a Transformer to be reused.
- Reset()
- }
- // SpanningTransformer extends the Transformer interface with a Span method
- // that determines how much of the input already conforms to the Transformer.
- type SpanningTransformer interface {
- Transformer
- // Span returns a position in src such that transforming src[:n] results in
- // identical output src[:n] for these bytes. It does not necessarily return
- // the largest such n. The atEOF argument tells whether src represents the
- // last bytes of the input.
- //
- // Callers should always account for the n bytes consumed before
- // considering the error err.
- //
- // A nil error means that all input bytes are known to be identical to the
- // output produced by the Transformer. A nil error can be returned
- // regardless of whether atEOF is true. If err is nil, then n must
- // equal len(src); the converse is not necessarily true.
- //
- // ErrEndOfSpan means that the Transformer output may differ from the
- // input after n bytes. Note that n may be len(src), meaning that the output
- // would contain additional bytes after otherwise identical output.
- // ErrShortSrc means that src had insufficient data to determine whether the
- // remaining bytes would change. Other than the error conditions listed
- // here, implementations are free to report other errors that arise.
- //
- // Calling Span can modify the Transformer state as a side effect. In
- // effect, it does the transformation just as calling Transform would, only
- // without copying to a destination buffer and only up to a point it can
- // determine the input and output bytes are the same. This is obviously more
- // limited than calling Transform, but can be more efficient in terms of
- // copying and allocating buffers. Calls to Span and Transform may be
- // interleaved.
- Span(src []byte, atEOF bool) (n int, err error)
- }
- // NopResetter can be embedded by implementations of Transformer to add a nop
- // Reset method.
- type NopResetter struct{}
- // Reset implements the Reset method of the Transformer interface.
- func (NopResetter) Reset() {}
- // Reader wraps another io.Reader by transforming the bytes read.
- type Reader struct {
- r io.Reader
- t Transformer
- err error
- // dst[dst0:dst1] contains bytes that have been transformed by t but
- // not yet copied out via Read.
- dst []byte
- dst0, dst1 int
- // src[src0:src1] contains bytes that have been read from r but not
- // yet transformed through t.
- src []byte
- src0, src1 int
- // transformComplete is whether the transformation is complete,
- // regardless of whether or not it was successful.
- transformComplete bool
- }
- const defaultBufSize = 4096
- // NewReader returns a new Reader that wraps r by transforming the bytes read
- // via t. It calls Reset on t.
- func NewReader(r io.Reader, t Transformer) *Reader {
- t.Reset()
- return &Reader{
- r: r,
- t: t,
- dst: make([]byte, defaultBufSize),
- src: make([]byte, defaultBufSize),
- }
- }
- // Read implements the io.Reader interface.
- func (r *Reader) Read(p []byte) (int, error) {
- n, err := 0, error(nil)
- for {
- // Copy out any transformed bytes and return the final error if we are done.
- if r.dst0 != r.dst1 {
- n = copy(p, r.dst[r.dst0:r.dst1])
- r.dst0 += n
- if r.dst0 == r.dst1 && r.transformComplete {
- return n, r.err
- }
- return n, nil
- } else if r.transformComplete {
- return 0, r.err
- }
- // Try to transform some source bytes, or to flush the transformer if we
- // are out of source bytes. We do this even if r.r.Read returned an error.
- // As the io.Reader documentation says, "process the n > 0 bytes returned
- // before considering the error".
- if r.src0 != r.src1 || r.err != nil {
- r.dst0 = 0
- r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
- r.src0 += n
- switch {
- case err == nil:
- if r.src0 != r.src1 {
- r.err = errInconsistentByteCount
- }
- // The Transform call was successful; we are complete if we
- // cannot read more bytes into src.
- r.transformComplete = r.err != nil
- continue
- case err == ErrShortDst && (r.dst1 != 0 || n != 0):
- // Make room in dst by copying out, and try again.
- continue
- case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
- // Read more bytes into src via the code below, and try again.
- default:
- r.transformComplete = true
- // The reader error (r.err) takes precedence over the
- // transformer error (err) unless r.err is nil or io.EOF.
- if r.err == nil || r.err == io.EOF {
- r.err = err
- }
- continue
- }
- }
- // Move any untransformed source bytes to the start of the buffer
- // and read more bytes.
- if r.src0 != 0 {
- r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
- }
- n, r.err = r.r.Read(r.src[r.src1:])
- r.src1 += n
- }
- }
- // TODO: implement ReadByte (and ReadRune??).
- // Writer wraps another io.Writer by transforming the bytes read.
- // The user needs to call Close to flush unwritten bytes that may
- // be buffered.
- type Writer struct {
- w io.Writer
- t Transformer
- dst []byte
- // src[:n] contains bytes that have not yet passed through t.
- src []byte
- n int
- }
- // NewWriter returns a new Writer that wraps w by transforming the bytes written
- // via t. It calls Reset on t.
- func NewWriter(w io.Writer, t Transformer) *Writer {
- t.Reset()
- return &Writer{
- w: w,
- t: t,
- dst: make([]byte, defaultBufSize),
- src: make([]byte, defaultBufSize),
- }
- }
- // Write implements the io.Writer interface. If there are not enough
- // bytes available to complete a Transform, the bytes will be buffered
- // for the next write. Call Close to convert the remaining bytes.
- func (w *Writer) Write(data []byte) (n int, err error) {
- src := data
- if w.n > 0 {
- // Append bytes from data to the last remainder.
- // TODO: limit the amount copied on first try.
- n = copy(w.src[w.n:], data)
- w.n += n
- src = w.src[:w.n]
- }
- for {
- nDst, nSrc, err := w.t.Transform(w.dst, src, false)
- if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
- return n, werr
- }
- src = src[nSrc:]
- if w.n == 0 {
- n += nSrc
- } else if len(src) <= n {
- // Enough bytes from w.src have been consumed. We make src point
- // to data instead to reduce the copying.
- w.n = 0
- n -= len(src)
- src = data[n:]
- if n < len(data) && (err == nil || err == ErrShortSrc) {
- continue
- }
- }
- switch err {
- case ErrShortDst:
- // This error is okay as long as we are making progress.
- if nDst > 0 || nSrc > 0 {
- continue
- }
- case ErrShortSrc:
- if len(src) < len(w.src) {
- m := copy(w.src, src)
- // If w.n > 0, bytes from data were already copied to w.src and n
- // was already set to the number of bytes consumed.
- if w.n == 0 {
- n += m
- }
- w.n = m
- err = nil
- } else if nDst > 0 || nSrc > 0 {
- // Not enough buffer to store the remainder. Keep processing as
- // long as there is progress. Without this case, transforms that
- // require a lookahead larger than the buffer may result in an
- // error. This is not something one may expect to be common in
- // practice, but it may occur when buffers are set to small
- // sizes during testing.
- continue
- }
- case nil:
- if w.n > 0 {
- err = errInconsistentByteCount
- }
- }
- return n, err
- }
- }
- // Close implements the io.Closer interface.
- func (w *Writer) Close() error {
- src := w.src[:w.n]
- for {
- nDst, nSrc, err := w.t.Transform(w.dst, src, true)
- if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
- return werr
- }
- if err != ErrShortDst {
- return err
- }
- src = src[nSrc:]
- }
- }
- type nop struct{ NopResetter }
- func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- n := copy(dst, src)
- if n < len(src) {
- err = ErrShortDst
- }
- return n, n, err
- }
- func (nop) Span(src []byte, atEOF bool) (n int, err error) {
- return len(src), nil
- }
- type discard struct{ NopResetter }
- func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- return 0, len(src), nil
- }
- var (
- // Discard is a Transformer for which all Transform calls succeed
- // by consuming all bytes and writing nothing.
- Discard Transformer = discard{}
- // Nop is a SpanningTransformer that copies src to dst.
- Nop SpanningTransformer = nop{}
- )
- // chain is a sequence of links. A chain with N Transformers has N+1 links and
- // N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
- // buffers given to chain.Transform and the middle N-1 buffers are intermediate
- // buffers owned by the chain. The i'th link transforms bytes from the i'th
- // buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
- // chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
- type chain struct {
- link []link
- err error
- // errStart is the index at which the error occurred plus 1. Processing
- // errStart at this level at the next call to Transform. As long as
- // errStart > 0, chain will not consume any more source bytes.
- errStart int
- }
- func (c *chain) fatalError(errIndex int, err error) {
- if i := errIndex + 1; i > c.errStart {
- c.errStart = i
- c.err = err
- }
- }
- type link struct {
- t Transformer
- // b[p:n] holds the bytes to be transformed by t.
- b []byte
- p int
- n int
- }
- func (l *link) src() []byte {
- return l.b[l.p:l.n]
- }
- func (l *link) dst() []byte {
- return l.b[l.n:]
- }
- // Chain returns a Transformer that applies t in sequence.
- func Chain(t ...Transformer) Transformer {
- if len(t) == 0 {
- return nop{}
- }
- c := &chain{link: make([]link, len(t)+1)}
- for i, tt := range t {
- c.link[i].t = tt
- }
- // Allocate intermediate buffers.
- b := make([][defaultBufSize]byte, len(t)-1)
- for i := range b {
- c.link[i+1].b = b[i][:]
- }
- return c
- }
- // Reset resets the state of Chain. It calls Reset on all the Transformers.
- func (c *chain) Reset() {
- for i, l := range c.link {
- if l.t != nil {
- l.t.Reset()
- }
- c.link[i].p, c.link[i].n = 0, 0
- }
- }
- // TODO: make chain use Span (is going to be fun to implement!)
- // Transform applies the transformers of c in sequence.
- func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- // Set up src and dst in the chain.
- srcL := &c.link[0]
- dstL := &c.link[len(c.link)-1]
- srcL.b, srcL.p, srcL.n = src, 0, len(src)
- dstL.b, dstL.n = dst, 0
- var lastFull, needProgress bool // for detecting progress
- // i is the index of the next Transformer to apply, for i in [low, high].
- // low is the lowest index for which c.link[low] may still produce bytes.
- // high is the highest index for which c.link[high] has a Transformer.
- // The error returned by Transform determines whether to increase or
- // decrease i. We try to completely fill a buffer before converting it.
- for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
- in, out := &c.link[i], &c.link[i+1]
- nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
- out.n += nDst
- in.p += nSrc
- if i > 0 && in.p == in.n {
- in.p, in.n = 0, 0
- }
- needProgress, lastFull = lastFull, false
- switch err0 {
- case ErrShortDst:
- // Process the destination buffer next. Return if we are already
- // at the high index.
- if i == high {
- return dstL.n, srcL.p, ErrShortDst
- }
- if out.n != 0 {
- i++
- // If the Transformer at the next index is not able to process any
- // source bytes there is nothing that can be done to make progress
- // and the bytes will remain unprocessed. lastFull is used to
- // detect this and break out of the loop with a fatal error.
- lastFull = true
- continue
- }
- // The destination buffer was too small, but is completely empty.
- // Return a fatal error as this transformation can never complete.
- c.fatalError(i, errShortInternal)
- case ErrShortSrc:
- if i == 0 {
- // Save ErrShortSrc in err. All other errors take precedence.
- err = ErrShortSrc
- break
- }
- // Source bytes were depleted before filling up the destination buffer.
- // Verify we made some progress, move the remaining bytes to the errStart
- // and try to get more source bytes.
- if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
- // There were not enough source bytes to proceed while the source
- // buffer cannot hold any more bytes. Return a fatal error as this
- // transformation can never complete.
- c.fatalError(i, errShortInternal)
- break
- }
- // in.b is an internal buffer and we can make progress.
- in.p, in.n = 0, copy(in.b, in.src())
- fallthrough
- case nil:
- // if i == low, we have depleted the bytes at index i or any lower levels.
- // In that case we increase low and i. In all other cases we decrease i to
- // fetch more bytes before proceeding to the next index.
- if i > low {
- i--
- continue
- }
- default:
- c.fatalError(i, err0)
- }
- // Exhausted level low or fatal error: increase low and continue
- // to process the bytes accepted so far.
- i++
- low = i
- }
- // If c.errStart > 0, this means we found a fatal error. We will clear
- // all upstream buffers. At this point, no more progress can be made
- // downstream, as Transform would have bailed while handling ErrShortDst.
- if c.errStart > 0 {
- for i := 1; i < c.errStart; i++ {
- c.link[i].p, c.link[i].n = 0, 0
- }
- err, c.errStart, c.err = c.err, 0, nil
- }
- return dstL.n, srcL.p, err
- }
- // Deprecated: Use runes.Remove instead.
- func RemoveFunc(f func(r rune) bool) Transformer {
- return removeF(f)
- }
- type removeF func(r rune) bool
- func (removeF) Reset() {}
- // Transform implements the Transformer interface.
- func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
- if r = rune(src[0]); r < utf8.RuneSelf {
- sz = 1
- } else {
- r, sz = utf8.DecodeRune(src)
- if sz == 1 {
- // Invalid rune.
- if !atEOF && !utf8.FullRune(src) {
- err = ErrShortSrc
- break
- }
- // We replace illegal bytes with RuneError. Not doing so might
- // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
- // The resulting byte sequence may subsequently contain runes
- // for which t(r) is true that were passed unnoticed.
- if !t(r) {
- if nDst+3 > len(dst) {
- err = ErrShortDst
- break
- }
- nDst += copy(dst[nDst:], "\uFFFD")
- }
- nSrc++
- continue
- }
- }
- if !t(r) {
- if nDst+sz > len(dst) {
- err = ErrShortDst
- break
- }
- nDst += copy(dst[nDst:], src[:sz])
- }
- nSrc += sz
- }
- return
- }
- // grow returns a new []byte that is longer than b, and copies the first n bytes
- // of b to the start of the new slice.
- func grow(b []byte, n int) []byte {
- m := len(b)
- if m <= 32 {
- m = 64
- } else if m <= 256 {
- m *= 2
- } else {
- m += m >> 1
- }
- buf := make([]byte, m)
- copy(buf, b[:n])
- return buf
- }
- const initialBufSize = 128
- // String returns a string with the result of converting s[:n] using t, where
- // n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
- func String(t Transformer, s string) (result string, n int, err error) {
- t.Reset()
- if s == "" {
- // Fast path for the common case for empty input. Results in about a
- // 86% reduction of running time for BenchmarkStringLowerEmpty.
- if _, _, err := t.Transform(nil, nil, true); err == nil {
- return "", 0, nil
- }
- }
- // Allocate only once. Note that both dst and src escape when passed to
- // Transform.
- buf := [2 * initialBufSize]byte{}
- dst := buf[:initialBufSize:initialBufSize]
- src := buf[initialBufSize : 2*initialBufSize]
- // The input string s is transformed in multiple chunks (starting with a
- // chunk size of initialBufSize). nDst and nSrc are per-chunk (or
- // per-Transform-call) indexes, pDst and pSrc are overall indexes.
- nDst, nSrc := 0, 0
- pDst, pSrc := 0, 0
- // pPrefix is the length of a common prefix: the first pPrefix bytes of the
- // result will equal the first pPrefix bytes of s. It is not guaranteed to
- // be the largest such value, but if pPrefix, len(result) and len(s) are
- // all equal after the final transform (i.e. calling Transform with atEOF
- // being true returned nil error) then we don't need to allocate a new
- // result string.
- pPrefix := 0
- for {
- // Invariant: pDst == pPrefix && pSrc == pPrefix.
- n := copy(src, s[pSrc:])
- nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
- pDst += nDst
- pSrc += nSrc
- // TODO: let transformers implement an optional Spanner interface, akin
- // to norm's QuickSpan. This would even allow us to avoid any allocation.
- if !bytes.Equal(dst[:nDst], src[:nSrc]) {
- break
- }
- pPrefix = pSrc
- if err == ErrShortDst {
- // A buffer can only be short if a transformer modifies its input.
- break
- } else if err == ErrShortSrc {
- if nSrc == 0 {
- // No progress was made.
- break
- }
- // Equal so far and !atEOF, so continue checking.
- } else if err != nil || pPrefix == len(s) {
- return string(s[:pPrefix]), pPrefix, err
- }
- }
- // Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.
- // We have transformed the first pSrc bytes of the input s to become pDst
- // transformed bytes. Those transformed bytes are discontiguous: the first
- // pPrefix of them equal s[:pPrefix] and the last nDst of them equal
- // dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
- // that they become one contiguous slice: dst[:pDst].
- if pPrefix != 0 {
- newDst := dst
- if pDst > len(newDst) {
- newDst = make([]byte, len(s)+nDst-nSrc)
- }
- copy(newDst[pPrefix:pDst], dst[:nDst])
- copy(newDst[:pPrefix], s[:pPrefix])
- dst = newDst
- }
- // Prevent duplicate Transform calls with atEOF being true at the end of
- // the input. Also return if we have an unrecoverable error.
- if (err == nil && pSrc == len(s)) ||
- (err != nil && err != ErrShortDst && err != ErrShortSrc) {
- return string(dst[:pDst]), pSrc, err
- }
- // Transform the remaining input, growing dst and src buffers as necessary.
- for {
- n := copy(src, s[pSrc:])
- nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
- pDst += nDst
- pSrc += nSrc
- // If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
- // make progress. This may avoid excessive allocations.
- if err == ErrShortDst {
- if nDst == 0 {
- dst = grow(dst, pDst)
- }
- } else if err == ErrShortSrc {
- if nSrc == 0 {
- src = grow(src, 0)
- }
- } else if err != nil || pSrc == len(s) {
- return string(dst[:pDst]), pSrc, err
- }
- }
- }
- // Bytes returns a new byte slice with the result of converting b[:n] using t,
- // where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
- func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
- return doAppend(t, 0, make([]byte, len(b)), b)
- }
- // Append appends the result of converting src[:n] using t to dst, where
- // n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
- func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
- if len(dst) == cap(dst) {
- n := len(src) + len(dst) // It is okay for this to be 0.
- b := make([]byte, n)
- dst = b[:copy(b, dst)]
- }
- return doAppend(t, len(dst), dst[:cap(dst)], src)
- }
- func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
- t.Reset()
- pSrc := 0
- for {
- nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
- pDst += nDst
- pSrc += nSrc
- if err != ErrShortDst {
- return dst[:pDst], pSrc, err
- }
- // Grow the destination buffer, but do not grow as long as we can make
- // progress. This may avoid excessive allocations.
- if nDst == 0 {
- dst = grow(dst, pDst)
- }
- }
- }
|