123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 |
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package colltab
- import (
- "fmt"
- "unicode"
- )
- // Level identifies the collation comparison level.
- // The primary level corresponds to the basic sorting of text.
- // The secondary level corresponds to accents and related linguistic elements.
- // The tertiary level corresponds to casing and related concepts.
- // The quaternary level is derived from the other levels by the
- // various algorithms for handling variable elements.
- type Level int
- const (
- Primary Level = iota
- Secondary
- Tertiary
- Quaternary
- Identity
- NumLevels
- )
- const (
- defaultSecondary = 0x20
- defaultTertiary = 0x2
- maxTertiary = 0x1F
- MaxQuaternary = 0x1FFFFF // 21 bits.
- )
- // Elem is a representation of a collation element. This API provides ways to encode
- // and decode Elems. Implementations of collation tables may use values greater
- // or equal to PrivateUse for their own purposes. However, these should never be
- // returned by AppendNext.
- type Elem uint32
- const (
- maxCE Elem = 0xAFFFFFFF
- PrivateUse = minContract
- minContract = 0xC0000000
- maxContract = 0xDFFFFFFF
- minExpand = 0xE0000000
- maxExpand = 0xEFFFFFFF
- minDecomp = 0xF0000000
- )
- type ceType int
- const (
- ceNormal ceType = iota // ceNormal includes implicits (ce == 0)
- ceContractionIndex // rune can be a start of a contraction
- ceExpansionIndex // rune expands into a sequence of collation elements
- ceDecompose // rune expands using NFKC decomposition
- )
- func (ce Elem) ctype() ceType {
- if ce <= maxCE {
- return ceNormal
- }
- if ce <= maxContract {
- return ceContractionIndex
- } else {
- if ce <= maxExpand {
- return ceExpansionIndex
- }
- return ceDecompose
- }
- panic("should not reach here")
- return ceType(-1)
- }
- // For normal collation elements, we assume that a collation element either has
- // a primary or non-default secondary value, not both.
- // Collation elements with a primary value are of the form
- // 01pppppp pppppppp ppppppp0 ssssssss
- // - p* is primary collation value
- // - s* is the secondary collation value
- // 00pppppp pppppppp ppppppps sssttttt, where
- // - p* is primary collation value
- // - s* offset of secondary from default value.
- // - t* is the tertiary collation value
- // 100ttttt cccccccc pppppppp pppppppp
- // - t* is the tertiar collation value
- // - c* is the canonical combining class
- // - p* is the primary collation value
- // Collation elements with a secondary value are of the form
- // 1010cccc ccccssss ssssssss tttttttt, where
- // - c* is the canonical combining class
- // - s* is the secondary collation value
- // - t* is the tertiary collation value
- // 11qqqqqq qqqqqqqq qqqqqqq0 00000000
- // - q* quaternary value
- const (
- ceTypeMask = 0xC0000000
- ceTypeMaskExt = 0xE0000000
- ceIgnoreMask = 0xF00FFFFF
- ceType1 = 0x40000000
- ceType2 = 0x00000000
- ceType3or4 = 0x80000000
- ceType4 = 0xA0000000
- ceTypeQ = 0xC0000000
- Ignore = ceType4
- firstNonPrimary = 0x80000000
- lastSpecialPrimary = 0xA0000000
- secondaryMask = 0x80000000
- hasTertiaryMask = 0x40000000
- primaryValueMask = 0x3FFFFE00
- maxPrimaryBits = 21
- compactPrimaryBits = 16
- maxSecondaryBits = 12
- maxTertiaryBits = 8
- maxCCCBits = 8
- maxSecondaryCompactBits = 8
- maxSecondaryDiffBits = 4
- maxTertiaryCompactBits = 5
- primaryShift = 9
- compactSecondaryShift = 5
- minCompactSecondary = defaultSecondary - 4
- )
- func makeImplicitCE(primary int) Elem {
- return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
- }
- // MakeElem returns an Elem for the given values. It will return an error
- // if the given combination of values is invalid.
- func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
- if w := primary; w >= 1<<maxPrimaryBits || w < 0 {
- return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
- }
- if w := secondary; w >= 1<<maxSecondaryBits || w < 0 {
- return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
- }
- if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 {
- return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
- }
- ce := Elem(0)
- if primary != 0 {
- if ccc != 0 {
- if primary >= 1<<compactPrimaryBits {
- return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits)
- }
- if secondary != defaultSecondary {
- return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc)
- }
- ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits))
- ce |= Elem(ccc) << compactPrimaryBits
- ce |= Elem(primary)
- ce |= ceType3or4
- } else if tertiary == defaultTertiary {
- if secondary >= 1<<maxSecondaryCompactBits {
- return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits)
- }
- ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary)
- ce |= ceType1
- } else {
- d := secondary - defaultSecondary + maxSecondaryDiffBits
- if d >= 1<<maxSecondaryDiffBits || d < 0 {
- return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
- }
- if tertiary >= 1<<maxTertiaryCompactBits {
- return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits)
- }
- ce = Elem(primary<<maxSecondaryDiffBits + d)
- ce = ce<<maxTertiaryCompactBits + Elem(tertiary)
- }
- } else {
- ce = Elem(secondary<<maxTertiaryBits + tertiary)
- ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits)
- ce |= ceType4
- }
- return ce, nil
- }
- // MakeQuaternary returns an Elem with the given quaternary value.
- func MakeQuaternary(v int) Elem {
- return ceTypeQ | Elem(v<<primaryShift)
- }
- // Mask sets weights for any level smaller than l to 0.
- // The resulting Elem can be used to test for equality with
- // other Elems to which the same mask has been applied.
- func (ce Elem) Mask(l Level) uint32 {
- return 0
- }
- // CCC returns the canonical combining class associated with the underlying character,
- // if applicable, or 0 otherwise.
- func (ce Elem) CCC() uint8 {
- if ce&ceType3or4 != 0 {
- if ce&ceType4 == ceType3or4 {
- return uint8(ce >> 16)
- }
- return uint8(ce >> 20)
- }
- return 0
- }
- // Primary returns the primary collation weight for ce.
- func (ce Elem) Primary() int {
- if ce >= firstNonPrimary {
- if ce > lastSpecialPrimary {
- return 0
- }
- return int(uint16(ce))
- }
- return int(ce&primaryValueMask) >> primaryShift
- }
- // Secondary returns the secondary collation weight for ce.
- func (ce Elem) Secondary() int {
- switch ce & ceTypeMask {
- case ceType1:
- return int(uint8(ce))
- case ceType2:
- return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
- case ceType3or4:
- if ce < ceType4 {
- return defaultSecondary
- }
- return int(ce>>8) & 0xFFF
- case ceTypeQ:
- return 0
- }
- panic("should not reach here")
- }
- // Tertiary returns the tertiary collation weight for ce.
- func (ce Elem) Tertiary() uint8 {
- if ce&hasTertiaryMask == 0 {
- if ce&ceType3or4 == 0 {
- return uint8(ce & 0x1F)
- }
- if ce&ceType4 == ceType4 {
- return uint8(ce)
- }
- return uint8(ce>>24) & 0x1F // type 2
- } else if ce&ceTypeMask == ceType1 {
- return defaultTertiary
- }
- // ce is a quaternary value.
- return 0
- }
- func (ce Elem) updateTertiary(t uint8) Elem {
- if ce&ceTypeMask == ceType1 {
- // convert to type 4
- nce := ce & primaryValueMask
- nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
- ce = nce
- } else if ce&ceTypeMaskExt == ceType3or4 {
- ce &= ^Elem(maxTertiary << 24)
- return ce | (Elem(t) << 24)
- } else {
- // type 2 or 4
- ce &= ^Elem(maxTertiary)
- }
- return ce | Elem(t)
- }
- // Quaternary returns the quaternary value if explicitly specified,
- // 0 if ce == Ignore, or MaxQuaternary otherwise.
- // Quaternary values are used only for shifted variants.
- func (ce Elem) Quaternary() int {
- if ce&ceTypeMask == ceTypeQ {
- return int(ce&primaryValueMask) >> primaryShift
- } else if ce&ceIgnoreMask == Ignore {
- return 0
- }
- return MaxQuaternary
- }
- // Weight returns the collation weight for the given level.
- func (ce Elem) Weight(l Level) int {
- switch l {
- case Primary:
- return ce.Primary()
- case Secondary:
- return ce.Secondary()
- case Tertiary:
- return int(ce.Tertiary())
- case Quaternary:
- return ce.Quaternary()
- }
- return 0 // return 0 (ignore) for undefined levels.
- }
- // For contractions, collation elements are of the form
- // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
- // - n* is the size of the first node in the contraction trie.
- // - i* is the index of the first node in the contraction trie.
- // - b* is the offset into the contraction collation element table.
- // See contract.go for details on the contraction trie.
- const (
- maxNBits = 4
- maxTrieIndexBits = 12
- maxContractOffsetBits = 13
- )
- func splitContractIndex(ce Elem) (index, n, offset int) {
- n = int(ce & (1<<maxNBits - 1))
- ce >>= maxNBits
- index = int(ce & (1<<maxTrieIndexBits - 1))
- ce >>= maxTrieIndexBits
- offset = int(ce & (1<<maxContractOffsetBits - 1))
- return
- }
- // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
- // where b* is the index into the expansion sequence table.
- const maxExpandIndexBits = 16
- func splitExpandIndex(ce Elem) (index int) {
- return int(uint16(ce))
- }
- // Some runes can be expanded using NFKD decomposition. Instead of storing the full
- // sequence of collation elements, we decompose the rune and lookup the collation
- // elements for each rune in the decomposition and modify the tertiary weights.
- // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
- // - v* is the replacement tertiary weight for the first rune,
- // - w* is the replacement tertiary weight for the second rune,
- // Tertiary weights of subsequent runes should be replaced with maxTertiary.
- // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
- func splitDecompose(ce Elem) (t1, t2 uint8) {
- return uint8(ce), uint8(ce >> 8)
- }
- const (
- // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
- minUnified rune = 0x4E00
- maxUnified = 0x9FFF
- minCompatibility = 0xF900
- maxCompatibility = 0xFAFF
- minRare = 0x3400
- maxRare = 0x4DBF
- )
- const (
- commonUnifiedOffset = 0x10000
- rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
- otherOffset = 0x50000 // largest rune in rare is U+2FA1D
- illegalOffset = otherOffset + int(unicode.MaxRune)
- maxPrimary = illegalOffset + 1
- )
- // implicitPrimary returns the primary weight for the a rune
- // for which there is no entry for the rune in the collation table.
- // We take a different approach from the one specified in
- // https://unicode.org/reports/tr10/#Implicit_Weights,
- // but preserve the resulting relative ordering of the runes.
- func implicitPrimary(r rune) int {
- if unicode.Is(unicode.Ideographic, r) {
- if r >= minUnified && r <= maxUnified {
- // The most common case for CJK.
- return int(r) + commonUnifiedOffset
- }
- if r >= minCompatibility && r <= maxCompatibility {
- // This will typically not hit. The DUCET explicitly specifies mappings
- // for all characters that do not decompose.
- return int(r) + commonUnifiedOffset
- }
- return int(r) + rareUnifiedOffset
- }
- return int(r) + otherOffset
- }
|