collate.go 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // TODO: remove hard-coded versions when we have implemented fractional weights.
  5. // The current implementation is incompatible with later CLDR versions.
  6. //go:generate go run maketables.go -cldr=23 -unicode=6.2.0
  7. // Package collate contains types for comparing and sorting Unicode strings
  8. // according to a given collation order.
  9. package collate // import "golang.org/x/text/collate"
  10. import (
  11. "bytes"
  12. "strings"
  13. "golang.org/x/text/internal/colltab"
  14. "golang.org/x/text/language"
  15. )
  16. // Collator provides functionality for comparing strings for a given
  17. // collation order.
  18. type Collator struct {
  19. options
  20. sorter sorter
  21. _iter [2]iter
  22. }
  23. func (c *Collator) iter(i int) *iter {
  24. // TODO: evaluate performance for making the second iterator optional.
  25. return &c._iter[i]
  26. }
  27. // Supported returns the list of languages for which collating differs from its parent.
  28. func Supported() []language.Tag {
  29. // TODO: use language.Coverage instead.
  30. t := make([]language.Tag, len(tags))
  31. copy(t, tags)
  32. return t
  33. }
  34. func init() {
  35. ids := strings.Split(availableLocales, ",")
  36. tags = make([]language.Tag, len(ids))
  37. for i, s := range ids {
  38. tags[i] = language.Raw.MustParse(s)
  39. }
  40. }
  41. var tags []language.Tag
  42. // New returns a new Collator initialized for the given locale.
  43. func New(t language.Tag, o ...Option) *Collator {
  44. index := colltab.MatchLang(t, tags)
  45. c := newCollator(getTable(locales[index]))
  46. // Set options from the user-supplied tag.
  47. c.setFromTag(t)
  48. // Set the user-supplied options.
  49. c.setOptions(o)
  50. c.init()
  51. return c
  52. }
  53. // NewFromTable returns a new Collator for the given Weighter.
  54. func NewFromTable(w colltab.Weighter, o ...Option) *Collator {
  55. c := newCollator(w)
  56. c.setOptions(o)
  57. c.init()
  58. return c
  59. }
  60. func (c *Collator) init() {
  61. if c.numeric {
  62. c.t = colltab.NewNumericWeighter(c.t)
  63. }
  64. c._iter[0].init(c)
  65. c._iter[1].init(c)
  66. }
  67. // Buffer holds keys generated by Key and KeyString.
  68. type Buffer struct {
  69. buf [4096]byte
  70. key []byte
  71. }
  72. func (b *Buffer) init() {
  73. if b.key == nil {
  74. b.key = b.buf[:0]
  75. }
  76. }
  77. // Reset clears the buffer from previous results generated by Key and KeyString.
  78. func (b *Buffer) Reset() {
  79. b.key = b.key[:0]
  80. }
  81. // Compare returns an integer comparing the two byte slices.
  82. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
  83. func (c *Collator) Compare(a, b []byte) int {
  84. // TODO: skip identical prefixes once we have a fast way to detect if a rune is
  85. // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
  86. c.iter(0).SetInput(a)
  87. c.iter(1).SetInput(b)
  88. if res := c.compare(); res != 0 {
  89. return res
  90. }
  91. if !c.ignore[colltab.Identity] {
  92. return bytes.Compare(a, b)
  93. }
  94. return 0
  95. }
  96. // CompareString returns an integer comparing the two strings.
  97. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
  98. func (c *Collator) CompareString(a, b string) int {
  99. // TODO: skip identical prefixes once we have a fast way to detect if a rune is
  100. // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
  101. c.iter(0).SetInputString(a)
  102. c.iter(1).SetInputString(b)
  103. if res := c.compare(); res != 0 {
  104. return res
  105. }
  106. if !c.ignore[colltab.Identity] {
  107. if a < b {
  108. return -1
  109. } else if a > b {
  110. return 1
  111. }
  112. }
  113. return 0
  114. }
  115. func compareLevel(f func(i *iter) int, a, b *iter) int {
  116. a.pce = 0
  117. b.pce = 0
  118. for {
  119. va := f(a)
  120. vb := f(b)
  121. if va != vb {
  122. if va < vb {
  123. return -1
  124. }
  125. return 1
  126. } else if va == 0 {
  127. break
  128. }
  129. }
  130. return 0
  131. }
  132. func (c *Collator) compare() int {
  133. ia, ib := c.iter(0), c.iter(1)
  134. // Process primary level
  135. if c.alternate != altShifted {
  136. // TODO: implement script reordering
  137. if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
  138. return res
  139. }
  140. } else {
  141. // TODO: handle shifted
  142. }
  143. if !c.ignore[colltab.Secondary] {
  144. f := (*iter).nextSecondary
  145. if c.backwards {
  146. f = (*iter).prevSecondary
  147. }
  148. if res := compareLevel(f, ia, ib); res != 0 {
  149. return res
  150. }
  151. }
  152. // TODO: special case handling (Danish?)
  153. if !c.ignore[colltab.Tertiary] || c.caseLevel {
  154. if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
  155. return res
  156. }
  157. if !c.ignore[colltab.Quaternary] {
  158. if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
  159. return res
  160. }
  161. }
  162. }
  163. return 0
  164. }
  165. // Key returns the collation key for str.
  166. // Passing the buffer buf may avoid memory allocations.
  167. // The returned slice will point to an allocation in Buffer and will remain
  168. // valid until the next call to buf.Reset().
  169. func (c *Collator) Key(buf *Buffer, str []byte) []byte {
  170. // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
  171. buf.init()
  172. return c.key(buf, c.getColElems(str))
  173. }
  174. // KeyFromString returns the collation key for str.
  175. // Passing the buffer buf may avoid memory allocations.
  176. // The returned slice will point to an allocation in Buffer and will retain
  177. // valid until the next call to buf.ResetKeys().
  178. func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
  179. // See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
  180. buf.init()
  181. return c.key(buf, c.getColElemsString(str))
  182. }
  183. func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte {
  184. processWeights(c.alternate, c.t.Top(), w)
  185. kn := len(buf.key)
  186. c.keyFromElems(buf, w)
  187. return buf.key[kn:]
  188. }
  189. func (c *Collator) getColElems(str []byte) []colltab.Elem {
  190. i := c.iter(0)
  191. i.SetInput(str)
  192. for i.Next() {
  193. }
  194. return i.Elems
  195. }
  196. func (c *Collator) getColElemsString(str string) []colltab.Elem {
  197. i := c.iter(0)
  198. i.SetInputString(str)
  199. for i.Next() {
  200. }
  201. return i.Elems
  202. }
  203. type iter struct {
  204. wa [512]colltab.Elem
  205. colltab.Iter
  206. pce int
  207. }
  208. func (i *iter) init(c *Collator) {
  209. i.Weighter = c.t
  210. i.Elems = i.wa[:0]
  211. }
  212. func (i *iter) nextPrimary() int {
  213. for {
  214. for ; i.pce < i.N; i.pce++ {
  215. if v := i.Elems[i.pce].Primary(); v != 0 {
  216. i.pce++
  217. return v
  218. }
  219. }
  220. if !i.Next() {
  221. return 0
  222. }
  223. }
  224. panic("should not reach here")
  225. }
  226. func (i *iter) nextSecondary() int {
  227. for ; i.pce < len(i.Elems); i.pce++ {
  228. if v := i.Elems[i.pce].Secondary(); v != 0 {
  229. i.pce++
  230. return v
  231. }
  232. }
  233. return 0
  234. }
  235. func (i *iter) prevSecondary() int {
  236. for ; i.pce < len(i.Elems); i.pce++ {
  237. if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
  238. i.pce++
  239. return v
  240. }
  241. }
  242. return 0
  243. }
  244. func (i *iter) nextTertiary() int {
  245. for ; i.pce < len(i.Elems); i.pce++ {
  246. if v := i.Elems[i.pce].Tertiary(); v != 0 {
  247. i.pce++
  248. return int(v)
  249. }
  250. }
  251. return 0
  252. }
  253. func (i *iter) nextQuaternary() int {
  254. for ; i.pce < len(i.Elems); i.pce++ {
  255. if v := i.Elems[i.pce].Quaternary(); v != 0 {
  256. i.pce++
  257. return v
  258. }
  259. }
  260. return 0
  261. }
  262. func appendPrimary(key []byte, p int) []byte {
  263. // Convert to variable length encoding; supports up to 23 bits.
  264. if p <= 0x7FFF {
  265. key = append(key, uint8(p>>8), uint8(p))
  266. } else {
  267. key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
  268. }
  269. return key
  270. }
  271. // keyFromElems converts the weights ws to a compact sequence of bytes.
  272. // The result will be appended to the byte buffer in buf.
  273. func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) {
  274. for _, v := range ws {
  275. if w := v.Primary(); w > 0 {
  276. buf.key = appendPrimary(buf.key, w)
  277. }
  278. }
  279. if !c.ignore[colltab.Secondary] {
  280. buf.key = append(buf.key, 0, 0)
  281. // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
  282. if !c.backwards {
  283. for _, v := range ws {
  284. if w := v.Secondary(); w > 0 {
  285. buf.key = append(buf.key, uint8(w>>8), uint8(w))
  286. }
  287. }
  288. } else {
  289. for i := len(ws) - 1; i >= 0; i-- {
  290. if w := ws[i].Secondary(); w > 0 {
  291. buf.key = append(buf.key, uint8(w>>8), uint8(w))
  292. }
  293. }
  294. }
  295. } else if c.caseLevel {
  296. buf.key = append(buf.key, 0, 0)
  297. }
  298. if !c.ignore[colltab.Tertiary] || c.caseLevel {
  299. buf.key = append(buf.key, 0, 0)
  300. for _, v := range ws {
  301. if w := v.Tertiary(); w > 0 {
  302. buf.key = append(buf.key, uint8(w))
  303. }
  304. }
  305. // Derive the quaternary weights from the options and other levels.
  306. // Note that we represent MaxQuaternary as 0xFF. The first byte of the
  307. // representation of a primary weight is always smaller than 0xFF,
  308. // so using this single byte value will compare correctly.
  309. if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted {
  310. if c.alternate == altShiftTrimmed {
  311. lastNonFFFF := len(buf.key)
  312. buf.key = append(buf.key, 0)
  313. for _, v := range ws {
  314. if w := v.Quaternary(); w == colltab.MaxQuaternary {
  315. buf.key = append(buf.key, 0xFF)
  316. } else if w > 0 {
  317. buf.key = appendPrimary(buf.key, w)
  318. lastNonFFFF = len(buf.key)
  319. }
  320. }
  321. buf.key = buf.key[:lastNonFFFF]
  322. } else {
  323. buf.key = append(buf.key, 0)
  324. for _, v := range ws {
  325. if w := v.Quaternary(); w == colltab.MaxQuaternary {
  326. buf.key = append(buf.key, 0xFF)
  327. } else if w > 0 {
  328. buf.key = appendPrimary(buf.key, w)
  329. }
  330. }
  331. }
  332. }
  333. }
  334. }
  335. func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) {
  336. ignore := false
  337. vtop := int(top)
  338. switch vw {
  339. case altShifted, altShiftTrimmed:
  340. for i := range wa {
  341. if p := wa[i].Primary(); p <= vtop && p != 0 {
  342. wa[i] = colltab.MakeQuaternary(p)
  343. ignore = true
  344. } else if p == 0 {
  345. if ignore {
  346. wa[i] = colltab.Ignore
  347. }
  348. } else {
  349. ignore = false
  350. }
  351. }
  352. case altBlanked:
  353. for i := range wa {
  354. if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
  355. wa[i] = colltab.Ignore
  356. ignore = true
  357. } else {
  358. ignore = false
  359. }
  360. }
  361. }
  362. }