123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458 |
- // Copyright 2011 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package norm
- import (
- "fmt"
- "unicode/utf8"
- )
- // MaxSegmentSize is the maximum size of a byte buffer needed to consider any
- // sequence of starter and non-starter runes for the purpose of normalization.
- const MaxSegmentSize = maxByteBufferSize // same length as the scratch buffer Iter.buf
- // An Iter iterates over a string or byte slice, while normalizing it
- // to a given Form. The zero value has no next function set, so Init or InitString must be called before Next.
- type Iter struct {
- rb reorderBuffer // holds the source (src, nsrc), the form (f), the ss state and the reordering buffer used by this file
- buf [maxByteBufferSize]byte // scratch space backing the slices returned by Next when the input cannot be returned directly
- info Properties // first character saved from previous iteration
- next iterFunc // implementation of next depends on form
- asciiF iterFunc // ASCII fast path: nextASCIIBytes (set by Init) or nextASCIIString (set by InitString)
- p int // current position in input source
- multiSeg []byte // remainder of multi-segment decomposition
- }
- type iterFunc func(*Iter) []byte // one iteration step: returns the next segment and updates the Iter
- // Init initializes i to iterate over src after normalizing it to Form f.
- func (i *Iter) Init(f Form, src []byte) {
- i.p = 0
- if len(src) == 0 {
- i.setDone()
- i.rb.nsrc = 0
- return
- }
- i.multiSeg = nil
- i.rb.init(f, src)
- i.next = i.rb.f.nextMain
- i.asciiF = nextASCIIBytes
- i.info = i.rb.f.info(i.rb.src, i.p)
- i.rb.ss.first(i.info)
- }
- // InitString initializes i to iterate over src after normalizing it to Form f.
- func (i *Iter) InitString(f Form, src string) {
- i.p = 0
- if len(src) == 0 {
- i.setDone()
- i.rb.nsrc = 0
- return
- }
- i.multiSeg = nil
- i.rb.initString(f, src)
- i.next = i.rb.f.nextMain
- i.asciiF = nextASCIIString
- i.info = i.rb.f.info(i.rb.src, i.p)
- i.rb.ss.first(i.info)
- }
- // Seek sets the segment to be returned by the next call to Next to start
- // at position p. It is the responsibility of the caller to set p to the
- // start of a segment.
- func (i *Iter) Seek(offset int64, whence int) (int64, error) {
- var abs int64
- switch whence {
- case 0:
- abs = offset
- case 1:
- abs = int64(i.p) + offset
- case 2:
- abs = int64(i.rb.nsrc) + offset
- default:
- return 0, fmt.Errorf("norm: invalid whence")
- }
- if abs < 0 {
- return 0, fmt.Errorf("norm: negative position")
- }
- if int(abs) >= i.rb.nsrc {
- i.setDone()
- return int64(i.p), nil
- }
- i.p = int(abs)
- i.multiSeg = nil
- i.next = i.rb.f.nextMain
- i.info = i.rb.f.info(i.rb.src, i.p)
- i.rb.ss.first(i.info)
- return abs, nil
- }
- // returnSlice returns a slice of the underlying input type as a byte slice.
- // If the underlying is of type []byte, it will simply return a slice.
- // If the underlying is of type string, it will copy the slice to the buffer
- // and return that.
- func (i *Iter) returnSlice(a, b int) []byte {
- if i.rb.src.bytes == nil {
- return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
- }
- return i.rb.src.bytes[a:b]
- }
- // Pos returns the byte position at which the next call to Next will commence processing.
- func (i *Iter) Pos() int {
- return i.p
- }
- func (i *Iter) setDone() {
- i.next = nextDone
- i.p = i.rb.nsrc
- }
- // Done returns true if there is no more input to process.
- func (i *Iter) Done() bool {
- return i.p >= i.rb.nsrc
- }
- // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
- // For any input a and b for which f(a) == f(b), subsequent calls
- // to Next will return the same segments.
- // Modifying runes are grouped together with the preceding starter, if such a starter exists.
- // Although not guaranteed, n will typically be the smallest possible n.
- func (i *Iter) Next() []byte {
- return i.next(i)
- }
- func nextASCIIBytes(i *Iter) []byte {
- p := i.p + 1
- if p >= i.rb.nsrc {
- p0 := i.p
- i.setDone()
- return i.rb.src.bytes[p0:p]
- }
- if i.rb.src.bytes[p] < utf8.RuneSelf {
- p0 := i.p
- i.p = p
- return i.rb.src.bytes[p0:p]
- }
- i.info = i.rb.f.info(i.rb.src, i.p)
- i.next = i.rb.f.nextMain
- return i.next(i)
- }
- func nextASCIIString(i *Iter) []byte {
- p := i.p + 1
- if p >= i.rb.nsrc {
- i.buf[0] = i.rb.src.str[i.p]
- i.setDone()
- return i.buf[:1]
- }
- if i.rb.src.str[p] < utf8.RuneSelf {
- i.buf[0] = i.rb.src.str[i.p]
- i.p = p
- return i.buf[:1]
- }
- i.info = i.rb.f.info(i.rb.src, i.p)
- i.next = i.rb.f.nextMain
- return i.next(i)
- }
- func nextHangul(i *Iter) []byte {
- p := i.p
- next := p + hangulUTF8Size
- if next >= i.rb.nsrc {
- i.setDone()
- } else if i.rb.src.hangul(next) == 0 {
- i.rb.ss.next(i.info)
- i.info = i.rb.f.info(i.rb.src, i.p)
- i.next = i.rb.f.nextMain
- return i.next(i)
- }
- i.p = next
- return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
- }
- func nextDone(i *Iter) []byte {
- return nil
- }
- // nextMulti is used for iterating over multi-segment decompositions
- // for decomposing normal forms.
- func nextMulti(i *Iter) []byte {
- j := 0
- d := i.multiSeg
- // skip first rune
- for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
- }
- for j < len(d) {
- info := i.rb.f.info(input{bytes: d}, j)
- if info.BoundaryBefore() {
- i.multiSeg = d[j:]
- return d[:j]
- }
- j += int(info.size)
- }
- // treat last segment as normal decomposition
- i.next = i.rb.f.nextMain
- return i.next(i)
- }
- // nextMultiNorm is used for iterating over multi-segment decompositions
- // for composing normal forms.
- func nextMultiNorm(i *Iter) []byte {
- j := 0
- d := i.multiSeg
- for j < len(d) {
- info := i.rb.f.info(input{bytes: d}, j)
- if info.BoundaryBefore() {
- i.rb.compose()
- seg := i.buf[:i.rb.flushCopy(i.buf[:])]
- i.rb.insertUnsafe(input{bytes: d}, j, info)
- i.multiSeg = d[j+int(info.size):]
- return seg
- }
- i.rb.insertUnsafe(input{bytes: d}, j, info)
- j += int(info.size)
- }
- i.multiSeg = nil
- i.next = nextComposed
- return doNormComposed(i)
- }
- // nextDecomposed is the implementation of Next for forms NFD and NFKD. It returns the next segment either straight from the source (via returnSlice), as a decomposition slice, or assembled in i.buf, and advances i.p past the consumed input.
- func nextDecomposed(i *Iter) (next []byte) {
- outp := 0 // number of output bytes accounted for so far
- inCopyStart, outCopyStart := i.p, 0 // input range [inCopyStart, i.p) is still pending a copy into i.buf at outCopyStart
- for {
- if sz := int(i.info.size); sz <= 1 {
- i.rb.ss = 0
- p := i.p
- i.p++ // ASCII or illegal byte. Either way, advance by 1.
- if i.p >= i.rb.nsrc {
- i.setDone()
- return i.returnSlice(p, i.p)
- } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
- i.next = i.asciiF // the next byte is ASCII as well: continue on the ASCII fast path
- return i.returnSlice(p, i.p)
- }
- outp++ // the byte stays in the pending input range and is copied later
- } else if d := i.info.Decomposition(); d != nil {
- // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
- // Case 1: there is a leftover to copy. In this case the decomposition
- // must begin with a modifier and should always be appended.
- // Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
- p := outp + len(d)
- if outp > 0 {
- i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
- // TODO: this condition should not be possible, but we leave it
- // in for defensive purposes.
- if p > len(i.buf) {
- return i.buf[:outp]
- }
- } else if i.info.multiSegment() {
- // outp must be 0 as multi-segment decompositions always
- // start a new segment.
- if i.multiSeg == nil {
- i.multiSeg = d
- i.next = nextMulti
- return nextMulti(i)
- }
- // We are in the last segment. Treat as normal decomposition.
- d = i.multiSeg
- i.multiSeg = nil
- p = len(d)
- }
- prevCC := i.info.tccc
- if i.p += sz; i.p >= i.rb.nsrc {
- i.setDone()
- i.info = Properties{} // Force BoundaryBefore to succeed.
- } else {
- i.info = i.rb.f.info(i.rb.src, i.p)
- }
- switch i.rb.ss.next(i.info) {
- case ssOverflow:
- i.next = nextCGJDecompose
- fallthrough
- case ssStarter:
- if outp > 0 {
- copy(i.buf[outp:], d)
- return i.buf[:p]
- }
- return d // nothing pending in i.buf: return the decomposition itself without copying
- }
- copy(i.buf[outp:], d)
- outp = p
- inCopyStart, outCopyStart = i.p, outp
- if i.info.ccc < prevCC {
- goto doNorm // ccc is lower than the previous tccc: continue through the reorder buffer
- }
- continue
- } else if r := i.rb.src.hangul(i.p); r != 0 {
- outp = decomposeHangul(i.buf[:], r)
- i.p += hangulUTF8Size
- inCopyStart, outCopyStart = i.p, outp
- if i.p >= i.rb.nsrc {
- i.setDone()
- break
- } else if i.rb.src.hangul(i.p) != 0 {
- i.next = nextHangul // another Hangul syllable follows: continue with nextHangul
- return i.buf[:outp]
- }
- } else {
- p := outp + sz
- if p > len(i.buf) {
- break
- }
- outp = p
- i.p += sz
- }
- if i.p >= i.rb.nsrc {
- i.setDone()
- break
- }
- prevCC := i.info.tccc
- i.info = i.rb.f.info(i.rb.src, i.p)
- if v := i.rb.ss.next(i.info); v == ssStarter {
- break
- } else if v == ssOverflow {
- i.next = nextCGJDecompose
- break
- }
- if i.info.ccc < prevCC {
- goto doNorm
- }
- }
- if outCopyStart == 0 { // no output offset was recorded: return the pending input range itself
- return i.returnSlice(inCopyStart, i.p)
- } else if inCopyStart < i.p {
- i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
- }
- return i.buf[:outp]
- doNorm:
- // Insert what we have decomposed so far in the reorderBuffer.
- // As we will only reorder, there will always be enough room.
- i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
- i.rb.insertDecomposed(i.buf[0:outp])
- return doNormDecomposed(i)
- }
- func doNormDecomposed(i *Iter) []byte {
- for {
- i.rb.insertUnsafe(i.rb.src, i.p, i.info)
- if i.p += int(i.info.size); i.p >= i.rb.nsrc {
- i.setDone()
- break
- }
- i.info = i.rb.f.info(i.rb.src, i.p)
- if i.info.ccc == 0 {
- break
- }
- if s := i.rb.ss.next(i.info); s == ssOverflow {
- i.next = nextCGJDecompose
- break
- }
- }
- // new segment or too many combining characters: exit normalization
- return i.buf[:i.rb.flushCopy(i.buf[:])]
- }
- func nextCGJDecompose(i *Iter) []byte {
- i.rb.ss = 0
- i.rb.insertCGJ()
- i.next = nextDecomposed
- i.rb.ss.first(i.info)
- buf := doNormDecomposed(i)
- return buf
- }
- // nextComposed is the implementation of Next for forms NFC and NFKC. It first tries to return a run of the input unchanged; if a rune fails isYesC or a ccc value is lower than the preceding tccc, it restarts from the segment start and builds the segment through the reorder buffer (label doNorm).
- func nextComposed(i *Iter) []byte {
- outp, startp := 0, i.p // startp: where this segment starts in the input; outp: bytes accepted so far, bounded by len(i.buf)
- var prevCC uint8
- for {
- if !i.info.isYesC() {
- goto doNorm // rune is not isYesC: redo the segment through the reorder buffer
- }
- prevCC = i.info.tccc
- sz := int(i.info.size)
- if sz == 0 {
- sz = 1 // illegal rune: copy byte-by-byte
- }
- p := outp + sz
- if p > len(i.buf) {
- break
- }
- outp = p
- i.p += sz
- if i.p >= i.rb.nsrc {
- i.setDone()
- break
- } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
- i.rb.ss = 0
- i.next = i.asciiF // the next byte is ASCII: continue on the ASCII fast path
- break
- }
- i.info = i.rb.f.info(i.rb.src, i.p)
- if v := i.rb.ss.next(i.info); v == ssStarter {
- break
- } else if v == ssOverflow {
- i.next = nextCGJCompose
- break
- }
- if i.info.ccc < prevCC {
- goto doNorm
- }
- }
- return i.returnSlice(startp, i.p) // fast path: the input range is returned as is
- doNorm:
- // reset to start position
- i.p = startp
- i.info = i.rb.f.info(i.rb.src, i.p)
- i.rb.ss.first(i.info)
- if i.info.multiSegment() {
- d := i.info.Decomposition()
- info := i.rb.f.info(input{bytes: d}, 0)
- i.rb.insertUnsafe(input{bytes: d}, 0, info)
- i.multiSeg = d[int(info.size):]
- i.next = nextMultiNorm // the rest of the decomposition is handled by nextMultiNorm
- return nextMultiNorm(i)
- }
- i.rb.ss.first(i.info) // NOTE(review): same call as a few lines above with unchanged i.info; looks redundant, confirm before removing
- i.rb.insertUnsafe(i.rb.src, i.p, i.info)
- return doNormComposed(i)
- }
- func doNormComposed(i *Iter) []byte {
- // First rune should already be inserted.
- for {
- if i.p += int(i.info.size); i.p >= i.rb.nsrc {
- i.setDone()
- break
- }
- i.info = i.rb.f.info(i.rb.src, i.p)
- if s := i.rb.ss.next(i.info); s == ssStarter {
- break
- } else if s == ssOverflow {
- i.next = nextCGJCompose
- break
- }
- i.rb.insertUnsafe(i.rb.src, i.p, i.info)
- }
- i.rb.compose()
- seg := i.buf[:i.rb.flushCopy(i.buf[:])]
- return seg
- }
- func nextCGJCompose(i *Iter) []byte {
- i.rb.ss = 0 // instead of first
- i.rb.insertCGJ()
- i.next = nextComposed
- // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
- // even if they are not. This is particularly dubious for U+FF9E and UFF9A.
- // If we ever change that, insert a check here.
- i.rb.ss.first(i.info)
- i.rb.insertUnsafe(i.rb.src, i.p, i.info)
- return doNormComposed(i)
- }
|