123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275 |
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package colltab
- import (
- "unicode/utf8"
- "golang.org/x/text/unicode/norm"
- )
- // Table holds all collation data for a given collation ordering.
- type Table struct {
- Index Trie // main trie
- // expansion info
- ExpandElem []uint32
- // contraction info
- ContractTries ContractTrieSet
- ContractElem []uint32
- MaxContractLen int
- VariableTop uint32
- }
- func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
- return t.appendNext(w, source{bytes: b})
- }
- func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
- return t.appendNext(w, source{str: s})
- }
- func (t *Table) Start(p int, b []byte) int {
- // TODO: implement
- panic("not implemented")
- }
- func (t *Table) StartString(p int, s string) int {
- // TODO: implement
- panic("not implemented")
- }
- func (t *Table) Domain() []string {
- // TODO: implement
- panic("not implemented")
- }
- func (t *Table) Top() uint32 {
- return t.VariableTop
- }
- type source struct {
- str string
- bytes []byte
- }
- func (src *source) lookup(t *Table) (ce Elem, sz int) {
- if src.bytes == nil {
- return t.Index.lookupString(src.str)
- }
- return t.Index.lookup(src.bytes)
- }
- func (src *source) tail(sz int) {
- if src.bytes == nil {
- src.str = src.str[sz:]
- } else {
- src.bytes = src.bytes[sz:]
- }
- }
- func (src *source) nfd(buf []byte, end int) []byte {
- if src.bytes == nil {
- return norm.NFD.AppendString(buf[:0], src.str[:end])
- }
- return norm.NFD.Append(buf[:0], src.bytes[:end]...)
- }
- func (src *source) rune() (r rune, sz int) {
- if src.bytes == nil {
- return utf8.DecodeRuneInString(src.str)
- }
- return utf8.DecodeRune(src.bytes)
- }
- func (src *source) properties(f norm.Form) norm.Properties {
- if src.bytes == nil {
- return f.PropertiesString(src.str)
- }
- return f.Properties(src.bytes)
- }
- // appendNext appends the weights corresponding to the next rune or
- // contraction in s. If a contraction is matched to a discontinuous
- // sequence of runes, the weights for the interstitial runes are
- // appended as well. It returns a new slice that includes the appended
- // weights and the number of bytes consumed from s.
- func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
- ce, sz := src.lookup(t)
- tp := ce.ctype()
- if tp == ceNormal {
- if ce == 0 {
- r, _ := src.rune()
- const (
- hangulSize = 3
- firstHangul = 0xAC00
- lastHangul = 0xD7A3
- )
- if r >= firstHangul && r <= lastHangul {
- // TODO: performance can be considerably improved here.
- n = sz
- var buf [16]byte // Used for decomposing Hangul.
- for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
- ce, sz = t.Index.lookup(b)
- w = append(w, ce)
- }
- return w, n
- }
- ce = makeImplicitCE(implicitPrimary(r))
- }
- w = append(w, ce)
- } else if tp == ceExpansionIndex {
- w = t.appendExpansion(w, ce)
- } else if tp == ceContractionIndex {
- n := 0
- src.tail(sz)
- if src.bytes == nil {
- w, n = t.matchContractionString(w, ce, src.str)
- } else {
- w, n = t.matchContraction(w, ce, src.bytes)
- }
- sz += n
- } else if tp == ceDecompose {
- // Decompose using NFKD and replace tertiary weights.
- t1, t2 := splitDecompose(ce)
- i := len(w)
- nfkd := src.properties(norm.NFKD).Decomposition()
- for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
- w, p = t.appendNext(w, source{bytes: nfkd})
- }
- w[i] = w[i].updateTertiary(t1)
- if i++; i < len(w) {
- w[i] = w[i].updateTertiary(t2)
- for i++; i < len(w); i++ {
- w[i] = w[i].updateTertiary(maxTertiary)
- }
- }
- }
- return w, sz
- }
- func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
- i := splitExpandIndex(ce)
- n := int(t.ExpandElem[i])
- i++
- for _, ce := range t.ExpandElem[i : i+n] {
- w = append(w, Elem(ce))
- }
- return w
- }
- func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
- index, n, offset := splitContractIndex(ce)
- scan := t.ContractTries.scanner(index, n, suffix)
- buf := [norm.MaxSegmentSize]byte{}
- bufp := 0
- p := scan.scan(0)
- if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
- // By now we should have filtered most cases.
- p0 := p
- bufn := 0
- rune := norm.NFD.Properties(suffix[p:])
- p += rune.Size()
- if rune.LeadCCC() != 0 {
- prevCC := rune.TrailCCC()
- // A gap may only occur in the last normalization segment.
- // This also ensures that len(scan.s) < norm.MaxSegmentSize.
- if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
- scan.s = suffix[:p+end]
- }
- for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
- rune = norm.NFD.Properties(suffix[p:])
- if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
- break
- }
- prevCC = rune.TrailCCC()
- if pp := scan.scan(p); pp != p {
- // Copy the interstitial runes for later processing.
- bufn += copy(buf[bufn:], suffix[p0:p])
- if scan.pindex == pp {
- bufp = bufn
- }
- p, p0 = pp, pp
- } else {
- p += rune.Size()
- }
- }
- }
- }
- // Append weights for the matched contraction, which may be an expansion.
- i, n := scan.result()
- ce = Elem(t.ContractElem[i+offset])
- if ce.ctype() == ceNormal {
- w = append(w, ce)
- } else {
- w = t.appendExpansion(w, ce)
- }
- // Append weights for the runes in the segment not part of the contraction.
- for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
- w, p = t.appendNext(w, source{bytes: b})
- }
- return w, n
- }
- // TODO: unify the two implementations. This is best done after first simplifying
- // the algorithm taking into account the inclusion of both NFC and NFD forms
- // in the table.
- func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
- index, n, offset := splitContractIndex(ce)
- scan := t.ContractTries.scannerString(index, n, suffix)
- buf := [norm.MaxSegmentSize]byte{}
- bufp := 0
- p := scan.scan(0)
- if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
- // By now we should have filtered most cases.
- p0 := p
- bufn := 0
- rune := norm.NFD.PropertiesString(suffix[p:])
- p += rune.Size()
- if rune.LeadCCC() != 0 {
- prevCC := rune.TrailCCC()
- // A gap may only occur in the last normalization segment.
- // This also ensures that len(scan.s) < norm.MaxSegmentSize.
- if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
- scan.s = suffix[:p+end]
- }
- for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
- rune = norm.NFD.PropertiesString(suffix[p:])
- if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
- break
- }
- prevCC = rune.TrailCCC()
- if pp := scan.scan(p); pp != p {
- // Copy the interstitial runes for later processing.
- bufn += copy(buf[bufn:], suffix[p0:p])
- if scan.pindex == pp {
- bufp = bufn
- }
- p, p0 = pp, pp
- } else {
- p += rune.Size()
- }
- }
- }
- }
- // Append weights for the matched contraction, which may be an expansion.
- i, n := scan.result()
- ce = Elem(t.ContractElem[i+offset])
- if ce.ctype() == ceNormal {
- w = append(w, ce)
- } else {
- w = t.appendExpansion(w, ce)
- }
- // Append weights for the runes in the segment not part of the contraction.
- for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
- w, p = t.appendNext(w, source{bytes: b})
- }
- return w, n
- }
|