123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294 |
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package build
- import (
- "fmt"
- "unicode"
- "golang.org/x/text/internal/colltab"
- )
- const (
- defaultSecondary = 0x20
- defaultTertiary = 0x2
- maxTertiary = 0x1F
- )
- type rawCE struct {
- w []int
- ccc uint8
- }
- func makeRawCE(w []int, ccc uint8) rawCE {
- ce := rawCE{w: make([]int, 4), ccc: ccc}
- copy(ce.w, w)
- return ce
- }
- // A collation element is represented as an uint32.
- // In the typical case, a rune maps to a single collation element. If a rune
- // can be the start of a contraction or expands into multiple collation elements,
- // then the collation element that is associated with a rune will have a special
- // form to represent such m to n mappings. Such special collation elements
- // have a value >= 0x80000000.
- const (
- maxPrimaryBits = 21
- maxSecondaryBits = 12
- maxTertiaryBits = 8
- )
- func makeCE(ce rawCE) (uint32, error) {
- v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
- return uint32(v), e
- }
- // For contractions, collation elements are of the form
- // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
- // - n* is the size of the first node in the contraction trie.
- // - i* is the index of the first node in the contraction trie.
- // - b* is the offset into the contraction collation element table.
- // See contract.go for details on the contraction trie.
- const (
- contractID = 0xC0000000
- maxNBits = 4
- maxTrieIndexBits = 12
- maxContractOffsetBits = 13
- )
- func makeContractIndex(h ctHandle, offset int) (uint32, error) {
- if h.n >= 1<<maxNBits {
- return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
- }
- if h.index >= 1<<maxTrieIndexBits {
- return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
- }
- if offset >= 1<<maxContractOffsetBits {
- return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
- }
- ce := uint32(contractID)
- ce += uint32(offset << (maxNBits + maxTrieIndexBits))
- ce += uint32(h.index << maxNBits)
- ce += uint32(h.n)
- return ce, nil
- }
- // For expansions, collation elements are of the form
- // 11100000 00000000 bbbbbbbb bbbbbbbb,
- // where b* is the index into the expansion sequence table.
- const (
- expandID = 0xE0000000
- maxExpandIndexBits = 16
- )
- func makeExpandIndex(index int) (uint32, error) {
- if index >= 1<<maxExpandIndexBits {
- return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
- }
- return expandID + uint32(index), nil
- }
- // Each list of collation elements corresponding to an expansion starts with
- // a header indicating the length of the sequence.
- func makeExpansionHeader(n int) (uint32, error) {
- return uint32(n), nil
- }
- // Some runes can be expanded using NFKD decomposition. Instead of storing the full
- // sequence of collation elements, we decompose the rune and lookup the collation
- // elements for each rune in the decomposition and modify the tertiary weights.
- // The collation element, in this case, is of the form
- // 11110000 00000000 wwwwwwww vvvvvvvv, where
- // - v* is the replacement tertiary weight for the first rune,
- // - w* is the replacement tertiary weight for the second rune,
- // Tertiary weights of subsequent runes should be replaced with maxTertiary.
- // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
- const (
- decompID = 0xF0000000
- )
- func makeDecompose(t1, t2 int) (uint32, error) {
- if t1 >= 256 || t1 < 0 {
- return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
- }
- if t2 >= 256 || t2 < 0 {
- return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
- }
- return uint32(t2<<8+t1) + decompID, nil
- }
- const (
- // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
- minUnified rune = 0x4E00
- maxUnified = 0x9FFF
- minCompatibility = 0xF900
- maxCompatibility = 0xFAFF
- minRare = 0x3400
- maxRare = 0x4DBF
- )
- const (
- commonUnifiedOffset = 0x10000
- rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
- otherOffset = 0x50000 // largest rune in rare is U+2FA1D
- illegalOffset = otherOffset + int(unicode.MaxRune)
- maxPrimary = illegalOffset + 1
- )
- // implicitPrimary returns the primary weight for the a rune
- // for which there is no entry for the rune in the collation table.
- // We take a different approach from the one specified in
- // https://unicode.org/reports/tr10/#Implicit_Weights,
- // but preserve the resulting relative ordering of the runes.
- func implicitPrimary(r rune) int {
- if unicode.Is(unicode.Ideographic, r) {
- if r >= minUnified && r <= maxUnified {
- // The most common case for CJK.
- return int(r) + commonUnifiedOffset
- }
- if r >= minCompatibility && r <= maxCompatibility {
- // This will typically not hit. The DUCET explicitly specifies mappings
- // for all characters that do not decompose.
- return int(r) + commonUnifiedOffset
- }
- return int(r) + rareUnifiedOffset
- }
- return int(r) + otherOffset
- }
- // convertLargeWeights converts collation elements with large
- // primaries (either double primaries or for illegal runes)
- // to our own representation.
- // A CJK character C is represented in the DUCET as
- // [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
- // We will rewrite these characters to a single CE.
- // We assume the CJK values start at 0x8000.
- // See https://unicode.org/reports/tr10/#Implicit_Weights
- func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
- const (
- cjkPrimaryStart = 0xFB40
- rarePrimaryStart = 0xFB80
- otherPrimaryStart = 0xFBC0
- illegalPrimary = 0xFFFE
- highBitsMask = 0x3F
- lowBitsMask = 0x7FFF
- lowBitsFlag = 0x8000
- shiftBits = 15
- )
- for i := 0; i < len(elems); i++ {
- ce := elems[i].w
- p := ce[0]
- if p < cjkPrimaryStart {
- continue
- }
- if p > 0xFFFF {
- return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
- }
- if p >= illegalPrimary {
- ce[0] = illegalOffset + p - illegalPrimary
- } else {
- if i+1 >= len(elems) {
- return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
- }
- if elems[i+1].w[0]&lowBitsFlag == 0 {
- return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
- }
- np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
- switch {
- case p < rarePrimaryStart:
- np += commonUnifiedOffset
- case p < otherPrimaryStart:
- np += rareUnifiedOffset
- default:
- p += otherOffset
- }
- ce[0] = np
- for j := i + 1; j+1 < len(elems); j++ {
- elems[j] = elems[j+1]
- }
- elems = elems[:len(elems)-1]
- }
- }
- return elems, nil
- }
- // nextWeight computes the first possible collation weights following elems
- // for the given level.
- func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
- if level == colltab.Identity {
- next := make([]rawCE, len(elems))
- copy(next, elems)
- return next
- }
- next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
- next[0].w[level]++
- if level < colltab.Secondary {
- next[0].w[colltab.Secondary] = defaultSecondary
- }
- if level < colltab.Tertiary {
- next[0].w[colltab.Tertiary] = defaultTertiary
- }
- // Filter entries that cannot influence ordering.
- for _, ce := range elems[1:] {
- skip := true
- for i := colltab.Primary; i < level; i++ {
- skip = skip && ce.w[i] == 0
- }
- if !skip {
- next = append(next, ce)
- }
- }
- return next
- }
- func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
- for ; i < len(elems) && elems[i].w[level] == 0; i++ {
- }
- if i < len(elems) {
- return i, elems[i].w[level]
- }
- return i, 0
- }
- // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
- // It also returns the collation level at which the difference is found.
- func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
- for level := colltab.Primary; level < colltab.Identity; level++ {
- var va, vb int
- for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
- ia, va = nextVal(a, ia, level)
- ib, vb = nextVal(b, ib, level)
- if va != vb {
- if va < vb {
- return -1, level
- } else {
- return 1, level
- }
- }
- }
- }
- return 0, colltab.Identity
- }
- func equalCE(a, b rawCE) bool {
- for i := 0; i < 3; i++ {
- if b.w[i] != a.w[i] {
- return false
- }
- }
- return true
- }
- func equalCEArrays(a, b []rawCE) bool {
- if len(a) != len(b) {
- return false
- }
- for i := range a {
- if !equalCE(a[i], b[i]) {
- return false
- }
- }
- return true
- }
|