collate_test.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package collate
  5. import (
  6. "bytes"
  7. "testing"
  8. "golang.org/x/text/internal/colltab"
  9. "golang.org/x/text/language"
  10. )
  11. type weightsTest struct {
  12. opt opts
  13. in, out ColElems
  14. }
  15. type opts struct {
  16. lev int
  17. alt alternateHandling
  18. top int
  19. backwards bool
  20. caseLevel bool
  21. }
  22. // ignore returns an initialized boolean array based on the given Level.
  23. // A negative value means using the default setting of quaternary.
  24. func ignore(level colltab.Level) (ignore [colltab.NumLevels]bool) {
  25. if level < 0 {
  26. level = colltab.Quaternary
  27. }
  28. for i := range ignore {
  29. ignore[i] = level < colltab.Level(i)
  30. }
  31. return ignore
  32. }
  33. func makeCE(w []int) colltab.Elem {
  34. ce, err := colltab.MakeElem(w[0], w[1], w[2], uint8(w[3]))
  35. if err != nil {
  36. panic(err)
  37. }
  38. return ce
  39. }
  40. func (o opts) collator() *Collator {
  41. c := &Collator{
  42. options: options{
  43. ignore: ignore(colltab.Level(o.lev - 1)),
  44. alternate: o.alt,
  45. backwards: o.backwards,
  46. caseLevel: o.caseLevel,
  47. variableTop: uint32(o.top),
  48. },
  49. }
  50. return c
  51. }
  52. const (
  53. maxQ = 0x1FFFFF
  54. )
  55. func wpq(p, q int) Weights {
  56. return W(p, defaults.Secondary, defaults.Tertiary, q)
  57. }
  58. func wsq(s, q int) Weights {
  59. return W(0, s, defaults.Tertiary, q)
  60. }
  61. func wq(q int) Weights {
  62. return W(0, 0, 0, q)
  63. }
  64. var zero = W(0, 0, 0, 0)
// processTests lists the cases for TestProcessWeights. Each case runs
// processWeights with the case's alternate handling (opt.alt) and variable
// top (opt.top) on in and expects the elements to equal out afterwards.
// In the expected values, a non-zero primary smaller than top (for example
// 200 with top 250) is treated as variable, while a larger one is not.
var processTests = []weightsTest{
	// Shifted
	{ // simple sequence of non-variables
		opt: opts{alt: altShifted, top: 100},
		in:  ColElems{W(200), W(300), W(400)},
		out: ColElems{wpq(200, maxQ), wpq(300, maxQ), wpq(400, maxQ)},
	},
	{ // first is a variable
		opt: opts{alt: altShifted, top: 250},
		in:  ColElems{W(200), W(300), W(400)},
		out: ColElems{wq(200), wpq(300, maxQ), wpq(400, maxQ)},
	},
	{ // all but first are variable
		opt: opts{alt: altShifted, top: 999},
		in:  ColElems{W(1000), W(200), W(300), W(400)},
		out: ColElems{wpq(1000, maxQ), wq(200), wq(300), wq(400)},
	},
	{ // first is a modifier
		opt: opts{alt: altShifted, top: 999},
		in:  ColElems{W(0, 10), W(1000)},
		out: ColElems{wsq(10, maxQ), wpq(1000, maxQ)},
	},
	{ // primary ignorables: the one following a variable is expected to be zeroed
		opt: opts{alt: altShifted, top: 250},
		in:  ColElems{W(200), W(0, 10), W(300), W(0, 15), W(400)},
		out: ColElems{wq(200), zero, wpq(300, maxQ), wsq(15, maxQ), wpq(400, maxQ)},
	},
	{ // secondary ignorables: the one following a variable is expected to be zeroed
		opt: opts{alt: altShifted, top: 250},
		in:  ColElems{W(200), W(0, 0, 10), W(300), W(0, 0, 15), W(400)},
		out: ColElems{wq(200), zero, wpq(300, maxQ), W(0, 0, 15, maxQ), wpq(400, maxQ)},
	},
	{ // tertiary ignorables, no change
		opt: opts{alt: altShifted, top: 250},
		in:  ColElems{W(200), zero, W(300), zero, W(400)},
		out: ColElems{wq(200), zero, wpq(300, maxQ), zero, wpq(400, maxQ)},
	},
	// ShiftTrimmed (same as Shifted)
	{ // simple sequence of non-variables
		opt: opts{alt: altShiftTrimmed, top: 100},
		in:  ColElems{W(200), W(300), W(400)},
		out: ColElems{wpq(200, maxQ), wpq(300, maxQ), wpq(400, maxQ)},
	},
	{ // first is a variable
		opt: opts{alt: altShiftTrimmed, top: 250},
		in:  ColElems{W(200), W(300), W(400)},
		out: ColElems{wq(200), wpq(300, maxQ), wpq(400, maxQ)},
	},
	{ // all but first are variable
		opt: opts{alt: altShiftTrimmed, top: 999},
		in:  ColElems{W(1000), W(200), W(300), W(400)},
		out: ColElems{wpq(1000, maxQ), wq(200), wq(300), wq(400)},
	},
	{ // first is a modifier
		opt: opts{alt: altShiftTrimmed, top: 999},
		in:  ColElems{W(0, 10), W(1000)},
		out: ColElems{wsq(10, maxQ), wpq(1000, maxQ)},
	},
	{ // primary ignorables: the one following a variable is expected to be zeroed
		opt: opts{alt: altShiftTrimmed, top: 250},
		in:  ColElems{W(200), W(0, 10), W(300), W(0, 15), W(400)},
		out: ColElems{wq(200), zero, wpq(300, maxQ), wsq(15, maxQ), wpq(400, maxQ)},
	},
	{ // secondary ignorables: the one following a variable is expected to be zeroed
		opt: opts{alt: altShiftTrimmed, top: 250},
		in:  ColElems{W(200), W(0, 0, 10), W(300), W(0, 0, 15), W(400)},
		out: ColElems{wq(200), zero, wpq(300, maxQ), W(0, 0, 15, maxQ), wpq(400, maxQ)},
	},
	{ // tertiary ignorables, no change
		opt: opts{alt: altShiftTrimmed, top: 250},
		in:  ColElems{W(200), zero, W(300), zero, W(400)},
		out: ColElems{wq(200), zero, wpq(300, maxQ), zero, wpq(400, maxQ)},
	},
	// Blanked: variables, and ignorables following them, are expected to
	// become zero; everything else is unchanged.
	{ // simple sequence of non-variables
		opt: opts{alt: altBlanked, top: 100},
		in:  ColElems{W(200), W(300), W(400)},
		out: ColElems{W(200), W(300), W(400)},
	},
	{ // first is a variable
		opt: opts{alt: altBlanked, top: 250},
		in:  ColElems{W(200), W(300), W(400)},
		out: ColElems{zero, W(300), W(400)},
	},
	{ // all but first are variable
		opt: opts{alt: altBlanked, top: 999},
		in:  ColElems{W(1000), W(200), W(300), W(400)},
		out: ColElems{W(1000), zero, zero, zero},
	},
	{ // first is a modifier
		opt: opts{alt: altBlanked, top: 999},
		in:  ColElems{W(0, 10), W(1000)},
		out: ColElems{W(0, 10), W(1000)},
	},
	{ // primary ignorables
		opt: opts{alt: altBlanked, top: 250},
		in:  ColElems{W(200), W(0, 10), W(300), W(0, 15), W(400)},
		out: ColElems{zero, zero, W(300), W(0, 15), W(400)},
	},
	{ // secondary ignorables
		opt: opts{alt: altBlanked, top: 250},
		in:  ColElems{W(200), W(0, 0, 10), W(300), W(0, 0, 15), W(400)},
		out: ColElems{zero, zero, W(300), W(0, 0, 15), W(400)},
	},
	{ // tertiary ignorables, no change
		opt: opts{alt: altBlanked, top: 250},
		in:  ColElems{W(200), zero, W(300), zero, W(400)},
		out: ColElems{zero, zero, W(300), zero, W(400)},
	},
	// Non-ignorable: input is always equal to output.
	{ // all but first are variable
		opt: opts{alt: altNonIgnorable, top: 999},
		in:  ColElems{W(1000), W(200), W(300), W(400)},
		out: ColElems{W(1000), W(200), W(300), W(400)},
	},
	{ // primary ignorables
		opt: opts{alt: altNonIgnorable, top: 250},
		in:  ColElems{W(200), W(0, 10), W(300), W(0, 15), W(400)},
		out: ColElems{W(200), W(0, 10), W(300), W(0, 15), W(400)},
	},
	{ // secondary ignorables
		opt: opts{alt: altNonIgnorable, top: 250},
		in:  ColElems{W(200), W(0, 0, 10), W(300), W(0, 0, 15), W(400)},
		out: ColElems{W(200), W(0, 0, 10), W(300), W(0, 0, 15), W(400)},
	},
	{ // tertiary ignorables, no change
		opt: opts{alt: altNonIgnorable, top: 250},
		in:  ColElems{W(200), zero, W(300), zero, W(400)},
		out: ColElems{W(200), zero, W(300), zero, W(400)},
	},
}
  196. func TestProcessWeights(t *testing.T) {
  197. for i, tt := range processTests {
  198. in := convertFromWeights(tt.in)
  199. out := convertFromWeights(tt.out)
  200. processWeights(tt.opt.alt, uint32(tt.opt.top), in)
  201. for j, w := range in {
  202. if w != out[j] {
  203. t.Errorf("%d: Weights %d was %v; want %v", i, j, w, out[j])
  204. }
  205. }
  206. }
  207. }
// keyFromElemTest describes one TestKeyFromElems case.
type keyFromElemTest struct {
	opt opts     // settings used for processWeights and to build the Collator
	in  ColElems // input collation elements
	out []byte   // expected contents of the key buffer
}
  213. var defS = byte(defaults.Secondary)
  214. var defT = byte(defaults.Tertiary)
  215. const sep = 0 // separator byte
  216. var keyFromElemTests = []keyFromElemTest{
  217. { // simple primary and secondary weights.
  218. opts{alt: altShifted},
  219. ColElems{W(0x200), W(0x7FFF), W(0, 0x30), W(0x100)},
  220. []byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
  221. sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
  222. sep, sep, defT, defT, defT, defT, // tertiary
  223. sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
  224. },
  225. },
  226. { // same as first, but with zero element that need to be removed
  227. opts{alt: altShifted},
  228. ColElems{W(0x200), zero, W(0x7FFF), W(0, 0x30), zero, W(0x100)},
  229. []byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
  230. sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
  231. sep, sep, defT, defT, defT, defT, // tertiary
  232. sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
  233. },
  234. },
  235. { // same as first, with large primary values
  236. opts{alt: altShifted},
  237. ColElems{W(0x200), W(0x8000), W(0, 0x30), W(0x12345)},
  238. []byte{0x2, 0, 0x80, 0x80, 0x00, 0x81, 0x23, 0x45, // primary
  239. sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
  240. sep, sep, defT, defT, defT, defT, // tertiary
  241. sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
  242. },
  243. },
  244. { // same as first, but with the secondary level backwards
  245. opts{alt: altShifted, backwards: true},
  246. ColElems{W(0x200), W(0x7FFF), W(0, 0x30), W(0x100)},
  247. []byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
  248. sep, sep, 0, defS, 0, 0x30, 0, defS, 0, defS, // secondary
  249. sep, sep, defT, defT, defT, defT, // tertiary
  250. sep, 0xFF, 0xFF, 0xFF, 0xFF, // quaternary
  251. },
  252. },
  253. { // same as first, ignoring quaternary level
  254. opts{alt: altShifted, lev: 3},
  255. ColElems{W(0x200), zero, W(0x7FFF), W(0, 0x30), zero, W(0x100)},
  256. []byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
  257. sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
  258. sep, sep, defT, defT, defT, defT, // tertiary
  259. },
  260. },
  261. { // same as first, ignoring tertiary level
  262. opts{alt: altShifted, lev: 2},
  263. ColElems{W(0x200), zero, W(0x7FFF), W(0, 0x30), zero, W(0x100)},
  264. []byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
  265. sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
  266. },
  267. },
  268. { // same as first, ignoring secondary level
  269. opts{alt: altShifted, lev: 1},
  270. ColElems{W(0x200), zero, W(0x7FFF), W(0, 0x30), zero, W(0x100)},
  271. []byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00},
  272. },
  273. { // simple primary and secondary weights.
  274. opts{alt: altShiftTrimmed, top: 0x250},
  275. ColElems{W(0x300), W(0x200), W(0x7FFF), W(0, 0x30), W(0x800)},
  276. []byte{0x3, 0, 0x7F, 0xFF, 0x8, 0x00, // primary
  277. sep, sep, 0, defS, 0, defS, 0, 0x30, 0, defS, // secondary
  278. sep, sep, defT, defT, defT, defT, // tertiary
  279. sep, 0xFF, 0x2, 0, // quaternary
  280. },
  281. },
  282. { // as first, primary with case level enabled
  283. opts{alt: altShifted, lev: 1, caseLevel: true},
  284. ColElems{W(0x200), W(0x7FFF), W(0, 0x30), W(0x100)},
  285. []byte{0x2, 0, 0x7F, 0xFF, 0x1, 0x00, // primary
  286. sep, sep, // secondary
  287. sep, sep, defT, defT, defT, defT, // tertiary
  288. },
  289. },
  290. }
  291. func TestKeyFromElems(t *testing.T) {
  292. buf := Buffer{}
  293. for i, tt := range keyFromElemTests {
  294. buf.Reset()
  295. in := convertFromWeights(tt.in)
  296. processWeights(tt.opt.alt, uint32(tt.opt.top), in)
  297. tt.opt.collator().keyFromElems(&buf, in)
  298. res := buf.key
  299. if len(res) != len(tt.out) {
  300. t.Errorf("%d: len(ws) was %d; want %d (%X should be %X)", i, len(res), len(tt.out), res, tt.out)
  301. }
  302. n := len(res)
  303. if len(tt.out) < n {
  304. n = len(tt.out)
  305. }
  306. for j, c := range res[:n] {
  307. if c != tt.out[j] {
  308. t.Errorf("%d: byte %d was %X; want %X", i, j, c, tt.out[j])
  309. }
  310. }
  311. }
  312. }
  313. func TestGetColElems(t *testing.T) {
  314. for i, tt := range appendNextTests {
  315. c, err := makeTable(tt.in)
  316. if err != nil {
  317. // error is reported in TestAppendNext
  318. continue
  319. }
  320. // Create one large test per table
  321. str := make([]byte, 0, 4000)
  322. out := ColElems{}
  323. for len(str) < 3000 {
  324. for _, chk := range tt.chk {
  325. str = append(str, chk.in[:chk.n]...)
  326. out = append(out, chk.out...)
  327. }
  328. }
  329. for j, chk := range append(tt.chk, check{string(str), len(str), out}) {
  330. out := convertFromWeights(chk.out)
  331. ce := c.getColElems([]byte(chk.in)[:chk.n])
  332. if len(ce) != len(out) {
  333. t.Errorf("%d:%d: len(ws) was %d; want %d", i, j, len(ce), len(out))
  334. continue
  335. }
  336. cnt := 0
  337. for k, w := range ce {
  338. w, _ = colltab.MakeElem(w.Primary(), w.Secondary(), int(w.Tertiary()), 0)
  339. if w != out[k] {
  340. t.Errorf("%d:%d: Weights %d was %X; want %X", i, j, k, w, out[k])
  341. cnt++
  342. }
  343. if cnt > 10 {
  344. break
  345. }
  346. }
  347. }
  348. }
  349. }
// keyTest pairs an input string with the collation key TestKey expects for it.
type keyTest struct {
	in  string // input passed to Key (as bytes) and to KeyFromString
	out []byte // expected key
}
// keyTests lists the inputs and expected keys for TestKey, which builds its
// collator from appendNextTests[4] with altShifted handling and the ignore
// flags for colltab.Quaternary.
var keyTests = []keyTest{
	{"abc",
		[]byte{0, 100, 0, 200, 1, 44, 0, 0, 0, 32, 0, 32, 0, 32, 0, 0, 2, 2, 2, 0, 255, 255, 255},
	},
	{"a\u0301",
		[]byte{0, 102, 0, 0, 0, 32, 0, 0, 2, 0, 255},
	},
	{"aaaaa",
		[]byte{0, 100, 0, 100, 0, 100, 0, 100, 0, 100, 0, 0,
			0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 0,
			2, 2, 2, 2, 2, 0,
			255, 255, 255, 255, 255,
		},
	},
	// Issue 16391: incomplete rune at end of UTF-8 sequence.
	{"\xc2", []byte{133, 255, 253, 0, 0, 0, 32, 0, 0, 2, 0, 255}},
	{"\xc2a", []byte{133, 255, 253, 0, 100, 0, 0, 0, 32, 0, 32, 0, 0, 2, 2, 0, 255, 255}},
}
  372. func TestKey(t *testing.T) {
  373. c, _ := makeTable(appendNextTests[4].in)
  374. c.alternate = altShifted
  375. c.ignore = ignore(colltab.Quaternary)
  376. buf := Buffer{}
  377. keys1 := [][]byte{}
  378. keys2 := [][]byte{}
  379. for _, tt := range keyTests {
  380. keys1 = append(keys1, c.Key(&buf, []byte(tt.in)))
  381. keys2 = append(keys2, c.KeyFromString(&buf, tt.in))
  382. }
  383. // Separate generation from testing to ensure buffers are not overwritten.
  384. for i, tt := range keyTests {
  385. if !bytes.Equal(keys1[i], tt.out) {
  386. t.Errorf("%d: Key(%q) = %d; want %d", i, tt.in, keys1[i], tt.out)
  387. }
  388. if !bytes.Equal(keys2[i], tt.out) {
  389. t.Errorf("%d: KeyFromString(%q) = %d; want %d", i, tt.in, keys2[i], tt.out)
  390. }
  391. }
  392. }
  393. type compareTest struct {
  394. a, b string
  395. res int // comparison result
  396. }
// compareTests lists string pairs together with the result that TestCompare
// expects from both Compare and CompareString.
var compareTests = []compareTest{
	{"a\u0301", "a", 1},
	{"a\u0301b", "ab", 1},
	{"a", "a\u0301", -1},
	{"ab", "a\u0301b", -1},
	{"bc", "a\u0301c", 1},
	{"ab", "aB", -1},
	{"a\u0301", "a\u0301", 0},
	{"a", "a", 0},
	// Only clip prefixes of whole runes.
	{"\u302E", "\u302F", 1},
	// Don't clip prefixes when last rune of prefix may be part of contraction.
	{"a\u035E", "a\u0301\u035F", -1},
	{"a\u0301\u035Fb", "a\u0301\u035F", -1},
}
  412. func TestCompare(t *testing.T) {
  413. c, _ := makeTable(appendNextTests[4].in)
  414. for i, tt := range compareTests {
  415. if res := c.Compare([]byte(tt.a), []byte(tt.b)); res != tt.res {
  416. t.Errorf("%d: Compare(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
  417. }
  418. if res := c.CompareString(tt.a, tt.b); res != tt.res {
  419. t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
  420. }
  421. }
  422. }
  423. func TestNumeric(t *testing.T) {
  424. c := New(language.English, Loose, Numeric)
  425. for i, tt := range []struct {
  426. a, b string
  427. want int
  428. }{
  429. {"1", "2", -1},
  430. {"2", "12", -1},
  431. {"2", "12", -1}, // Fullwidth is sorted as usual.
  432. {"₂", "₁₂", 1}, // Subscript is not sorted as numbers.
  433. {"②", "①②", 1}, // Circled is not sorted as numbers.
  434. { // Imperial Aramaic, is not sorted as number.
  435. "\U00010859",
  436. "\U00010858\U00010859",
  437. 1,
  438. },
  439. {"12", "2", 1},
  440. {"A-1", "A-2", -1},
  441. {"A-2", "A-12", -1},
  442. {"A-12", "A-2", 1},
  443. {"A-0001", "A-1", 0},
  444. } {
  445. if got := c.CompareString(tt.a, tt.b); got != tt.want {
  446. t.Errorf("%d: CompareString(%s, %s) = %d; want %d", i, tt.a, tt.b, got, tt.want)
  447. }
  448. }
  449. }