colelem.go 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package build
  5. import (
  6. "fmt"
  7. "unicode"
  8. "golang.org/x/text/internal/colltab"
  9. )
  10. const (
  11. defaultSecondary = 0x20
  12. defaultTertiary = 0x2
  13. maxTertiary = 0x1F
  14. )
  15. type rawCE struct {
  16. w []int
  17. ccc uint8
  18. }
  19. func makeRawCE(w []int, ccc uint8) rawCE {
  20. ce := rawCE{w: make([]int, 4), ccc: ccc}
  21. copy(ce.w, w)
  22. return ce
  23. }
  24. // A collation element is represented as an uint32.
  25. // In the typical case, a rune maps to a single collation element. If a rune
  26. // can be the start of a contraction or expands into multiple collation elements,
  27. // then the collation element that is associated with a rune will have a special
  28. // form to represent such m to n mappings. Such special collation elements
  29. // have a value >= 0x80000000.
  30. const (
  31. maxPrimaryBits = 21
  32. maxSecondaryBits = 12
  33. maxTertiaryBits = 8
  34. )
  35. func makeCE(ce rawCE) (uint32, error) {
  36. v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
  37. return uint32(v), e
  38. }
  39. // For contractions, collation elements are of the form
  40. // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
  41. // - n* is the size of the first node in the contraction trie.
  42. // - i* is the index of the first node in the contraction trie.
  43. // - b* is the offset into the contraction collation element table.
  44. // See contract.go for details on the contraction trie.
  45. const (
  46. contractID = 0xC0000000
  47. maxNBits = 4
  48. maxTrieIndexBits = 12
  49. maxContractOffsetBits = 13
  50. )
  51. func makeContractIndex(h ctHandle, offset int) (uint32, error) {
  52. if h.n >= 1<<maxNBits {
  53. return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
  54. }
  55. if h.index >= 1<<maxTrieIndexBits {
  56. return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
  57. }
  58. if offset >= 1<<maxContractOffsetBits {
  59. return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
  60. }
  61. ce := uint32(contractID)
  62. ce += uint32(offset << (maxNBits + maxTrieIndexBits))
  63. ce += uint32(h.index << maxNBits)
  64. ce += uint32(h.n)
  65. return ce, nil
  66. }
  67. // For expansions, collation elements are of the form
  68. // 11100000 00000000 bbbbbbbb bbbbbbbb,
  69. // where b* is the index into the expansion sequence table.
  70. const (
  71. expandID = 0xE0000000
  72. maxExpandIndexBits = 16
  73. )
  74. func makeExpandIndex(index int) (uint32, error) {
  75. if index >= 1<<maxExpandIndexBits {
  76. return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
  77. }
  78. return expandID + uint32(index), nil
  79. }
  80. // Each list of collation elements corresponding to an expansion starts with
  81. // a header indicating the length of the sequence.
  82. func makeExpansionHeader(n int) (uint32, error) {
  83. return uint32(n), nil
  84. }
  85. // Some runes can be expanded using NFKD decomposition. Instead of storing the full
  86. // sequence of collation elements, we decompose the rune and lookup the collation
  87. // elements for each rune in the decomposition and modify the tertiary weights.
  88. // The collation element, in this case, is of the form
  89. // 11110000 00000000 wwwwwwww vvvvvvvv, where
  90. // - v* is the replacement tertiary weight for the first rune,
  91. // - w* is the replacement tertiary weight for the second rune,
  92. // Tertiary weights of subsequent runes should be replaced with maxTertiary.
  93. // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
  94. const (
  95. decompID = 0xF0000000
  96. )
  97. func makeDecompose(t1, t2 int) (uint32, error) {
  98. if t1 >= 256 || t1 < 0 {
  99. return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
  100. }
  101. if t2 >= 256 || t2 < 0 {
  102. return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
  103. }
  104. return uint32(t2<<8+t1) + decompID, nil
  105. }
  106. const (
  107. // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
  108. minUnified rune = 0x4E00
  109. maxUnified = 0x9FFF
  110. minCompatibility = 0xF900
  111. maxCompatibility = 0xFAFF
  112. minRare = 0x3400
  113. maxRare = 0x4DBF
  114. )
  115. const (
  116. commonUnifiedOffset = 0x10000
  117. rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
  118. otherOffset = 0x50000 // largest rune in rare is U+2FA1D
  119. illegalOffset = otherOffset + int(unicode.MaxRune)
  120. maxPrimary = illegalOffset + 1
  121. )
  122. // implicitPrimary returns the primary weight for the a rune
  123. // for which there is no entry for the rune in the collation table.
  124. // We take a different approach from the one specified in
  125. // https://unicode.org/reports/tr10/#Implicit_Weights,
  126. // but preserve the resulting relative ordering of the runes.
  127. func implicitPrimary(r rune) int {
  128. if unicode.Is(unicode.Ideographic, r) {
  129. if r >= minUnified && r <= maxUnified {
  130. // The most common case for CJK.
  131. return int(r) + commonUnifiedOffset
  132. }
  133. if r >= minCompatibility && r <= maxCompatibility {
  134. // This will typically not hit. The DUCET explicitly specifies mappings
  135. // for all characters that do not decompose.
  136. return int(r) + commonUnifiedOffset
  137. }
  138. return int(r) + rareUnifiedOffset
  139. }
  140. return int(r) + otherOffset
  141. }
  142. // convertLargeWeights converts collation elements with large
  143. // primaries (either double primaries or for illegal runes)
  144. // to our own representation.
  145. // A CJK character C is represented in the DUCET as
  146. // [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
  147. // We will rewrite these characters to a single CE.
  148. // We assume the CJK values start at 0x8000.
  149. // See https://unicode.org/reports/tr10/#Implicit_Weights
  150. func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
  151. const (
  152. cjkPrimaryStart = 0xFB40
  153. rarePrimaryStart = 0xFB80
  154. otherPrimaryStart = 0xFBC0
  155. illegalPrimary = 0xFFFE
  156. highBitsMask = 0x3F
  157. lowBitsMask = 0x7FFF
  158. lowBitsFlag = 0x8000
  159. shiftBits = 15
  160. )
  161. for i := 0; i < len(elems); i++ {
  162. ce := elems[i].w
  163. p := ce[0]
  164. if p < cjkPrimaryStart {
  165. continue
  166. }
  167. if p > 0xFFFF {
  168. return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
  169. }
  170. if p >= illegalPrimary {
  171. ce[0] = illegalOffset + p - illegalPrimary
  172. } else {
  173. if i+1 >= len(elems) {
  174. return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
  175. }
  176. if elems[i+1].w[0]&lowBitsFlag == 0 {
  177. return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
  178. }
  179. np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
  180. switch {
  181. case p < rarePrimaryStart:
  182. np += commonUnifiedOffset
  183. case p < otherPrimaryStart:
  184. np += rareUnifiedOffset
  185. default:
  186. p += otherOffset
  187. }
  188. ce[0] = np
  189. for j := i + 1; j+1 < len(elems); j++ {
  190. elems[j] = elems[j+1]
  191. }
  192. elems = elems[:len(elems)-1]
  193. }
  194. }
  195. return elems, nil
  196. }
  197. // nextWeight computes the first possible collation weights following elems
  198. // for the given level.
  199. func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
  200. if level == colltab.Identity {
  201. next := make([]rawCE, len(elems))
  202. copy(next, elems)
  203. return next
  204. }
  205. next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
  206. next[0].w[level]++
  207. if level < colltab.Secondary {
  208. next[0].w[colltab.Secondary] = defaultSecondary
  209. }
  210. if level < colltab.Tertiary {
  211. next[0].w[colltab.Tertiary] = defaultTertiary
  212. }
  213. // Filter entries that cannot influence ordering.
  214. for _, ce := range elems[1:] {
  215. skip := true
  216. for i := colltab.Primary; i < level; i++ {
  217. skip = skip && ce.w[i] == 0
  218. }
  219. if !skip {
  220. next = append(next, ce)
  221. }
  222. }
  223. return next
  224. }
  225. func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
  226. for ; i < len(elems) && elems[i].w[level] == 0; i++ {
  227. }
  228. if i < len(elems) {
  229. return i, elems[i].w[level]
  230. }
  231. return i, 0
  232. }
  233. // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
  234. // It also returns the collation level at which the difference is found.
  235. func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
  236. for level := colltab.Primary; level < colltab.Identity; level++ {
  237. var va, vb int
  238. for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
  239. ia, va = nextVal(a, ia, level)
  240. ib, vb = nextVal(b, ib, level)
  241. if va != vb {
  242. if va < vb {
  243. return -1, level
  244. } else {
  245. return 1, level
  246. }
  247. }
  248. }
  249. }
  250. return 0, colltab.Identity
  251. }
  252. func equalCE(a, b rawCE) bool {
  253. for i := 0; i < 3; i++ {
  254. if b.w[i] != a.w[i] {
  255. return false
  256. }
  257. }
  258. return true
  259. }
  260. func equalCEArrays(a, b []rawCE) bool {
  261. if len(a) != len(b) {
  262. return false
  263. }
  264. for i := range a {
  265. if !equalCE(a[i], b[i]) {
  266. return false
  267. }
  268. }
  269. return true
  270. }