collelem.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package colltab
  5. import (
  6. "fmt"
  7. "unicode"
  8. )
  9. // Level identifies the collation comparison level.
  10. // The primary level corresponds to the basic sorting of text.
  11. // The secondary level corresponds to accents and related linguistic elements.
  12. // The tertiary level corresponds to casing and related concepts.
  13. // The quaternary level is derived from the other levels by the
  14. // various algorithms for handling variable elements.
  15. type Level int
  16. const (
  17. Primary Level = iota
  18. Secondary
  19. Tertiary
  20. Quaternary
  21. Identity
  22. NumLevels
  23. )
  24. const (
  25. defaultSecondary = 0x20
  26. defaultTertiary = 0x2
  27. maxTertiary = 0x1F
  28. MaxQuaternary = 0x1FFFFF // 21 bits.
  29. )
  30. // Elem is a representation of a collation element. This API provides ways to encode
  31. // and decode Elems. Implementations of collation tables may use values greater
  32. // or equal to PrivateUse for their own purposes. However, these should never be
  33. // returned by AppendNext.
  34. type Elem uint32
  35. const (
  36. maxCE Elem = 0xAFFFFFFF
  37. PrivateUse = minContract
  38. minContract = 0xC0000000
  39. maxContract = 0xDFFFFFFF
  40. minExpand = 0xE0000000
  41. maxExpand = 0xEFFFFFFF
  42. minDecomp = 0xF0000000
  43. )
  44. type ceType int
  45. const (
  46. ceNormal ceType = iota // ceNormal includes implicits (ce == 0)
  47. ceContractionIndex // rune can be a start of a contraction
  48. ceExpansionIndex // rune expands into a sequence of collation elements
  49. ceDecompose // rune expands using NFKC decomposition
  50. )
  51. func (ce Elem) ctype() ceType {
  52. if ce <= maxCE {
  53. return ceNormal
  54. }
  55. if ce <= maxContract {
  56. return ceContractionIndex
  57. } else {
  58. if ce <= maxExpand {
  59. return ceExpansionIndex
  60. }
  61. return ceDecompose
  62. }
  63. panic("should not reach here")
  64. return ceType(-1)
  65. }
  66. // For normal collation elements, we assume that a collation element either has
  67. // a primary or non-default secondary value, not both.
  68. // Collation elements with a primary value are of the form
  69. // 01pppppp pppppppp ppppppp0 ssssssss
  70. // - p* is primary collation value
  71. // - s* is the secondary collation value
  72. // 00pppppp pppppppp ppppppps sssttttt, where
  73. // - p* is primary collation value
  74. // - s* offset of secondary from default value.
  75. // - t* is the tertiary collation value
  76. // 100ttttt cccccccc pppppppp pppppppp
  77. // - t* is the tertiar collation value
  78. // - c* is the canonical combining class
  79. // - p* is the primary collation value
  80. // Collation elements with a secondary value are of the form
  81. // 1010cccc ccccssss ssssssss tttttttt, where
  82. // - c* is the canonical combining class
  83. // - s* is the secondary collation value
  84. // - t* is the tertiary collation value
  85. // 11qqqqqq qqqqqqqq qqqqqqq0 00000000
  86. // - q* quaternary value
  87. const (
  88. ceTypeMask = 0xC0000000
  89. ceTypeMaskExt = 0xE0000000
  90. ceIgnoreMask = 0xF00FFFFF
  91. ceType1 = 0x40000000
  92. ceType2 = 0x00000000
  93. ceType3or4 = 0x80000000
  94. ceType4 = 0xA0000000
  95. ceTypeQ = 0xC0000000
  96. Ignore = ceType4
  97. firstNonPrimary = 0x80000000
  98. lastSpecialPrimary = 0xA0000000
  99. secondaryMask = 0x80000000
  100. hasTertiaryMask = 0x40000000
  101. primaryValueMask = 0x3FFFFE00
  102. maxPrimaryBits = 21
  103. compactPrimaryBits = 16
  104. maxSecondaryBits = 12
  105. maxTertiaryBits = 8
  106. maxCCCBits = 8
  107. maxSecondaryCompactBits = 8
  108. maxSecondaryDiffBits = 4
  109. maxTertiaryCompactBits = 5
  110. primaryShift = 9
  111. compactSecondaryShift = 5
  112. minCompactSecondary = defaultSecondary - 4
  113. )
  114. func makeImplicitCE(primary int) Elem {
  115. return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
  116. }
  117. // MakeElem returns an Elem for the given values. It will return an error
  118. // if the given combination of values is invalid.
  119. func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
  120. if w := primary; w >= 1<<maxPrimaryBits || w < 0 {
  121. return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
  122. }
  123. if w := secondary; w >= 1<<maxSecondaryBits || w < 0 {
  124. return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
  125. }
  126. if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 {
  127. return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
  128. }
  129. ce := Elem(0)
  130. if primary != 0 {
  131. if ccc != 0 {
  132. if primary >= 1<<compactPrimaryBits {
  133. return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits)
  134. }
  135. if secondary != defaultSecondary {
  136. return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc)
  137. }
  138. ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits))
  139. ce |= Elem(ccc) << compactPrimaryBits
  140. ce |= Elem(primary)
  141. ce |= ceType3or4
  142. } else if tertiary == defaultTertiary {
  143. if secondary >= 1<<maxSecondaryCompactBits {
  144. return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits)
  145. }
  146. ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary)
  147. ce |= ceType1
  148. } else {
  149. d := secondary - defaultSecondary + maxSecondaryDiffBits
  150. if d >= 1<<maxSecondaryDiffBits || d < 0 {
  151. return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
  152. }
  153. if tertiary >= 1<<maxTertiaryCompactBits {
  154. return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits)
  155. }
  156. ce = Elem(primary<<maxSecondaryDiffBits + d)
  157. ce = ce<<maxTertiaryCompactBits + Elem(tertiary)
  158. }
  159. } else {
  160. ce = Elem(secondary<<maxTertiaryBits + tertiary)
  161. ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits)
  162. ce |= ceType4
  163. }
  164. return ce, nil
  165. }
  166. // MakeQuaternary returns an Elem with the given quaternary value.
  167. func MakeQuaternary(v int) Elem {
  168. return ceTypeQ | Elem(v<<primaryShift)
  169. }
  170. // Mask sets weights for any level smaller than l to 0.
  171. // The resulting Elem can be used to test for equality with
  172. // other Elems to which the same mask has been applied.
  173. func (ce Elem) Mask(l Level) uint32 {
  174. return 0
  175. }
  176. // CCC returns the canonical combining class associated with the underlying character,
  177. // if applicable, or 0 otherwise.
  178. func (ce Elem) CCC() uint8 {
  179. if ce&ceType3or4 != 0 {
  180. if ce&ceType4 == ceType3or4 {
  181. return uint8(ce >> 16)
  182. }
  183. return uint8(ce >> 20)
  184. }
  185. return 0
  186. }
  187. // Primary returns the primary collation weight for ce.
  188. func (ce Elem) Primary() int {
  189. if ce >= firstNonPrimary {
  190. if ce > lastSpecialPrimary {
  191. return 0
  192. }
  193. return int(uint16(ce))
  194. }
  195. return int(ce&primaryValueMask) >> primaryShift
  196. }
  197. // Secondary returns the secondary collation weight for ce.
  198. func (ce Elem) Secondary() int {
  199. switch ce & ceTypeMask {
  200. case ceType1:
  201. return int(uint8(ce))
  202. case ceType2:
  203. return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
  204. case ceType3or4:
  205. if ce < ceType4 {
  206. return defaultSecondary
  207. }
  208. return int(ce>>8) & 0xFFF
  209. case ceTypeQ:
  210. return 0
  211. }
  212. panic("should not reach here")
  213. }
  214. // Tertiary returns the tertiary collation weight for ce.
  215. func (ce Elem) Tertiary() uint8 {
  216. if ce&hasTertiaryMask == 0 {
  217. if ce&ceType3or4 == 0 {
  218. return uint8(ce & 0x1F)
  219. }
  220. if ce&ceType4 == ceType4 {
  221. return uint8(ce)
  222. }
  223. return uint8(ce>>24) & 0x1F // type 2
  224. } else if ce&ceTypeMask == ceType1 {
  225. return defaultTertiary
  226. }
  227. // ce is a quaternary value.
  228. return 0
  229. }
  230. func (ce Elem) updateTertiary(t uint8) Elem {
  231. if ce&ceTypeMask == ceType1 {
  232. // convert to type 4
  233. nce := ce & primaryValueMask
  234. nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
  235. ce = nce
  236. } else if ce&ceTypeMaskExt == ceType3or4 {
  237. ce &= ^Elem(maxTertiary << 24)
  238. return ce | (Elem(t) << 24)
  239. } else {
  240. // type 2 or 4
  241. ce &= ^Elem(maxTertiary)
  242. }
  243. return ce | Elem(t)
  244. }
  245. // Quaternary returns the quaternary value if explicitly specified,
  246. // 0 if ce == Ignore, or MaxQuaternary otherwise.
  247. // Quaternary values are used only for shifted variants.
  248. func (ce Elem) Quaternary() int {
  249. if ce&ceTypeMask == ceTypeQ {
  250. return int(ce&primaryValueMask) >> primaryShift
  251. } else if ce&ceIgnoreMask == Ignore {
  252. return 0
  253. }
  254. return MaxQuaternary
  255. }
  256. // Weight returns the collation weight for the given level.
  257. func (ce Elem) Weight(l Level) int {
  258. switch l {
  259. case Primary:
  260. return ce.Primary()
  261. case Secondary:
  262. return ce.Secondary()
  263. case Tertiary:
  264. return int(ce.Tertiary())
  265. case Quaternary:
  266. return ce.Quaternary()
  267. }
  268. return 0 // return 0 (ignore) for undefined levels.
  269. }
  270. // For contractions, collation elements are of the form
  271. // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
  272. // - n* is the size of the first node in the contraction trie.
  273. // - i* is the index of the first node in the contraction trie.
  274. // - b* is the offset into the contraction collation element table.
  275. // See contract.go for details on the contraction trie.
  276. const (
  277. maxNBits = 4
  278. maxTrieIndexBits = 12
  279. maxContractOffsetBits = 13
  280. )
  281. func splitContractIndex(ce Elem) (index, n, offset int) {
  282. n = int(ce & (1<<maxNBits - 1))
  283. ce >>= maxNBits
  284. index = int(ce & (1<<maxTrieIndexBits - 1))
  285. ce >>= maxTrieIndexBits
  286. offset = int(ce & (1<<maxContractOffsetBits - 1))
  287. return
  288. }
  289. // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
  290. // where b* is the index into the expansion sequence table.
  291. const maxExpandIndexBits = 16
  292. func splitExpandIndex(ce Elem) (index int) {
  293. return int(uint16(ce))
  294. }
  295. // Some runes can be expanded using NFKD decomposition. Instead of storing the full
  296. // sequence of collation elements, we decompose the rune and lookup the collation
  297. // elements for each rune in the decomposition and modify the tertiary weights.
  298. // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
  299. // - v* is the replacement tertiary weight for the first rune,
  300. // - w* is the replacement tertiary weight for the second rune,
  301. // Tertiary weights of subsequent runes should be replaced with maxTertiary.
  302. // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
  303. func splitDecompose(ce Elem) (t1, t2 uint8) {
  304. return uint8(ce), uint8(ce >> 8)
  305. }
  306. const (
  307. // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
  308. minUnified rune = 0x4E00
  309. maxUnified = 0x9FFF
  310. minCompatibility = 0xF900
  311. maxCompatibility = 0xFAFF
  312. minRare = 0x3400
  313. maxRare = 0x4DBF
  314. )
  315. const (
  316. commonUnifiedOffset = 0x10000
  317. rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
  318. otherOffset = 0x50000 // largest rune in rare is U+2FA1D
  319. illegalOffset = otherOffset + int(unicode.MaxRune)
  320. maxPrimary = illegalOffset + 1
  321. )
  322. // implicitPrimary returns the primary weight for the a rune
  323. // for which there is no entry for the rune in the collation table.
  324. // We take a different approach from the one specified in
  325. // https://unicode.org/reports/tr10/#Implicit_Weights,
  326. // but preserve the resulting relative ordering of the runes.
  327. func implicitPrimary(r rune) int {
  328. if unicode.Is(unicode.Ideographic, r) {
  329. if r >= minUnified && r <= maxUnified {
  330. // The most common case for CJK.
  331. return int(r) + commonUnifiedOffset
  332. }
  333. if r >= minCompatibility && r <= maxCompatibility {
  334. // This will typically not hit. The DUCET explicitly specifies mappings
  335. // for all characters that do not decompose.
  336. return int(r) + commonUnifiedOffset
  337. }
  338. return int(r) + rareUnifiedOffset
  339. }
  340. return int(r) + otherOffset
  341. }