123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403 |
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // TODO: remove hard-coded versions when we have implemented fractional weights.
- // The current implementation is incompatible with later CLDR versions.
- //go:generate go run maketables.go -cldr=23 -unicode=6.2.0
- // Package collate contains types for comparing and sorting Unicode strings
- // according to a given collation order.
- package collate // import "golang.org/x/text/collate"
- import (
- "bytes"
- "strings"
- "golang.org/x/text/internal/colltab"
- "golang.org/x/text/language"
- )
- // Collator provides functionality for comparing strings for a given
- // collation order.
- type Collator struct {
- options
- sorter sorter
- _iter [2]iter
- }
- func (c *Collator) iter(i int) *iter {
- // TODO: evaluate performance for making the second iterator optional.
- return &c._iter[i]
- }
- // Supported returns the list of languages for which collating differs from its parent.
- func Supported() []language.Tag {
- // TODO: use language.Coverage instead.
- t := make([]language.Tag, len(tags))
- copy(t, tags)
- return t
- }
- func init() {
- ids := strings.Split(availableLocales, ",")
- tags = make([]language.Tag, len(ids))
- for i, s := range ids {
- tags[i] = language.Raw.MustParse(s)
- }
- }
- var tags []language.Tag
- // New returns a new Collator initialized for the given locale.
- func New(t language.Tag, o ...Option) *Collator {
- index := colltab.MatchLang(t, tags)
- c := newCollator(getTable(locales[index]))
- // Set options from the user-supplied tag.
- c.setFromTag(t)
- // Set the user-supplied options.
- c.setOptions(o)
- c.init()
- return c
- }
- // NewFromTable returns a new Collator for the given Weighter.
- func NewFromTable(w colltab.Weighter, o ...Option) *Collator {
- c := newCollator(w)
- c.setOptions(o)
- c.init()
- return c
- }
- func (c *Collator) init() {
- if c.numeric {
- c.t = colltab.NewNumericWeighter(c.t)
- }
- c._iter[0].init(c)
- c._iter[1].init(c)
- }
- // Buffer holds keys generated by Key and KeyString.
- type Buffer struct {
- buf [4096]byte
- key []byte
- }
- func (b *Buffer) init() {
- if b.key == nil {
- b.key = b.buf[:0]
- }
- }
- // Reset clears the buffer from previous results generated by Key and KeyString.
- func (b *Buffer) Reset() {
- b.key = b.key[:0]
- }
- // Compare returns an integer comparing the two byte slices.
- // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
- func (c *Collator) Compare(a, b []byte) int {
- // TODO: skip identical prefixes once we have a fast way to detect if a rune is
- // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
- c.iter(0).SetInput(a)
- c.iter(1).SetInput(b)
- if res := c.compare(); res != 0 {
- return res
- }
- if !c.ignore[colltab.Identity] {
- return bytes.Compare(a, b)
- }
- return 0
- }
- // CompareString returns an integer comparing the two strings.
- // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
- func (c *Collator) CompareString(a, b string) int {
- // TODO: skip identical prefixes once we have a fast way to detect if a rune is
- // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
- c.iter(0).SetInputString(a)
- c.iter(1).SetInputString(b)
- if res := c.compare(); res != 0 {
- return res
- }
- if !c.ignore[colltab.Identity] {
- if a < b {
- return -1
- } else if a > b {
- return 1
- }
- }
- return 0
- }
- func compareLevel(f func(i *iter) int, a, b *iter) int {
- a.pce = 0
- b.pce = 0
- for {
- va := f(a)
- vb := f(b)
- if va != vb {
- if va < vb {
- return -1
- }
- return 1
- } else if va == 0 {
- break
- }
- }
- return 0
- }
- func (c *Collator) compare() int {
- ia, ib := c.iter(0), c.iter(1)
- // Process primary level
- if c.alternate != altShifted {
- // TODO: implement script reordering
- if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
- return res
- }
- } else {
- // TODO: handle shifted
- }
- if !c.ignore[colltab.Secondary] {
- f := (*iter).nextSecondary
- if c.backwards {
- f = (*iter).prevSecondary
- }
- if res := compareLevel(f, ia, ib); res != 0 {
- return res
- }
- }
- // TODO: special case handling (Danish?)
- if !c.ignore[colltab.Tertiary] || c.caseLevel {
- if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
- return res
- }
- if !c.ignore[colltab.Quaternary] {
- if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
- return res
- }
- }
- }
- return 0
- }
- // Key returns the collation key for str.
- // Passing the buffer buf may avoid memory allocations.
- // The returned slice will point to an allocation in Buffer and will remain
- // valid until the next call to buf.Reset().
- func (c *Collator) Key(buf *Buffer, str []byte) []byte {
- // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
- buf.init()
- return c.key(buf, c.getColElems(str))
- }
- // KeyFromString returns the collation key for str.
- // Passing the buffer buf may avoid memory allocations.
- // The returned slice will point to an allocation in Buffer and will retain
- // valid until the next call to buf.ResetKeys().
- func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
- // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
- buf.init()
- return c.key(buf, c.getColElemsString(str))
- }
- func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte {
- processWeights(c.alternate, c.t.Top(), w)
- kn := len(buf.key)
- c.keyFromElems(buf, w)
- return buf.key[kn:]
- }
- func (c *Collator) getColElems(str []byte) []colltab.Elem {
- i := c.iter(0)
- i.SetInput(str)
- for i.Next() {
- }
- return i.Elems
- }
- func (c *Collator) getColElemsString(str string) []colltab.Elem {
- i := c.iter(0)
- i.SetInputString(str)
- for i.Next() {
- }
- return i.Elems
- }
- type iter struct {
- wa [512]colltab.Elem
- colltab.Iter
- pce int
- }
- func (i *iter) init(c *Collator) {
- i.Weighter = c.t
- i.Elems = i.wa[:0]
- }
- func (i *iter) nextPrimary() int {
- for {
- for ; i.pce < i.N; i.pce++ {
- if v := i.Elems[i.pce].Primary(); v != 0 {
- i.pce++
- return v
- }
- }
- if !i.Next() {
- return 0
- }
- }
- panic("should not reach here")
- }
- func (i *iter) nextSecondary() int {
- for ; i.pce < len(i.Elems); i.pce++ {
- if v := i.Elems[i.pce].Secondary(); v != 0 {
- i.pce++
- return v
- }
- }
- return 0
- }
- func (i *iter) prevSecondary() int {
- for ; i.pce < len(i.Elems); i.pce++ {
- if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
- i.pce++
- return v
- }
- }
- return 0
- }
- func (i *iter) nextTertiary() int {
- for ; i.pce < len(i.Elems); i.pce++ {
- if v := i.Elems[i.pce].Tertiary(); v != 0 {
- i.pce++
- return int(v)
- }
- }
- return 0
- }
- func (i *iter) nextQuaternary() int {
- for ; i.pce < len(i.Elems); i.pce++ {
- if v := i.Elems[i.pce].Quaternary(); v != 0 {
- i.pce++
- return v
- }
- }
- return 0
- }
- func appendPrimary(key []byte, p int) []byte {
- // Convert to variable length encoding; supports up to 23 bits.
- if p <= 0x7FFF {
- key = append(key, uint8(p>>8), uint8(p))
- } else {
- key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
- }
- return key
- }
- // keyFromElems converts the weights ws to a compact sequence of bytes.
- // The result will be appended to the byte buffer in buf.
- func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) {
- for _, v := range ws {
- if w := v.Primary(); w > 0 {
- buf.key = appendPrimary(buf.key, w)
- }
- }
- if !c.ignore[colltab.Secondary] {
- buf.key = append(buf.key, 0, 0)
- // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
- if !c.backwards {
- for _, v := range ws {
- if w := v.Secondary(); w > 0 {
- buf.key = append(buf.key, uint8(w>>8), uint8(w))
- }
- }
- } else {
- for i := len(ws) - 1; i >= 0; i-- {
- if w := ws[i].Secondary(); w > 0 {
- buf.key = append(buf.key, uint8(w>>8), uint8(w))
- }
- }
- }
- } else if c.caseLevel {
- buf.key = append(buf.key, 0, 0)
- }
- if !c.ignore[colltab.Tertiary] || c.caseLevel {
- buf.key = append(buf.key, 0, 0)
- for _, v := range ws {
- if w := v.Tertiary(); w > 0 {
- buf.key = append(buf.key, uint8(w))
- }
- }
- // Derive the quaternary weights from the options and other levels.
- // Note that we represent MaxQuaternary as 0xFF. The first byte of the
- // representation of a primary weight is always smaller than 0xFF,
- // so using this single byte value will compare correctly.
- if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted {
- if c.alternate == altShiftTrimmed {
- lastNonFFFF := len(buf.key)
- buf.key = append(buf.key, 0)
- for _, v := range ws {
- if w := v.Quaternary(); w == colltab.MaxQuaternary {
- buf.key = append(buf.key, 0xFF)
- } else if w > 0 {
- buf.key = appendPrimary(buf.key, w)
- lastNonFFFF = len(buf.key)
- }
- }
- buf.key = buf.key[:lastNonFFFF]
- } else {
- buf.key = append(buf.key, 0)
- for _, v := range ws {
- if w := v.Quaternary(); w == colltab.MaxQuaternary {
- buf.key = append(buf.key, 0xFF)
- } else if w > 0 {
- buf.key = appendPrimary(buf.key, w)
- }
- }
- }
- }
- }
- }
- func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) {
- ignore := false
- vtop := int(top)
- switch vw {
- case altShifted, altShiftTrimmed:
- for i := range wa {
- if p := wa[i].Primary(); p <= vtop && p != 0 {
- wa[i] = colltab.MakeQuaternary(p)
- ignore = true
- } else if p == 0 {
- if ignore {
- wa[i] = colltab.Ignore
- }
- } else {
- ignore = false
- }
- }
- case altBlanked:
- for i := range wa {
- if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
- wa[i] = colltab.Ignore
- ignore = true
- } else {
- ignore = false
- }
- }
- }
- }
|