pattern_test.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package search
  5. import (
  6. "reflect"
  7. "strings"
  8. "testing"
  9. "golang.org/x/text/language"
  10. )
  11. func TestCompile(t *testing.T) {
  12. for i, tc := range []struct {
  13. desc string
  14. pattern string
  15. options []Option
  16. n int
  17. }{{
  18. desc: "empty",
  19. pattern: "",
  20. n: 0,
  21. }, {
  22. desc: "single",
  23. pattern: "a",
  24. n: 1,
  25. }, {
  26. desc: "keep modifier",
  27. pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
  28. n: 2,
  29. }, {
  30. desc: "remove modifier",
  31. pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
  32. options: []Option{IgnoreDiacritics},
  33. n: 1,
  34. }, {
  35. desc: "single with double collation element",
  36. pattern: "ä",
  37. n: 2,
  38. }, {
  39. desc: "leading variable",
  40. pattern: " a",
  41. n: 2,
  42. }, {
  43. desc: "trailing variable",
  44. pattern: "aa ",
  45. n: 3,
  46. }, {
  47. desc: "leading and trailing variable",
  48. pattern: " äb ",
  49. n: 5,
  50. }, {
  51. desc: "keep interior variable",
  52. pattern: " ä b ",
  53. n: 6,
  54. }, {
  55. desc: "keep interior variables",
  56. pattern: " b ä ",
  57. n: 7,
  58. }, {
  59. desc: "remove ignoreables (zero-weights across the board)",
  60. pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND
  61. n: 3,
  62. }} {
  63. m := New(language.Und, tc.options...)
  64. p := m.CompileString(tc.pattern)
  65. if len(p.ce) != tc.n {
  66. t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n)
  67. }
  68. }
  69. }
  70. func TestNorm(t *testing.T) {
  71. // U+0300: COMBINING GRAVE ACCENT (CCC=230)
  72. // U+031B: COMBINING HORN (CCC=216)
  73. for _, tc := range []struct {
  74. desc string
  75. a string
  76. b string
  77. want bool // a and b compile into the same pattern?
  78. }{{
  79. "simple",
  80. "eee\u0300\u031b",
  81. "eee\u031b\u0300",
  82. true,
  83. }, {
  84. "large number of modifiers in pattern",
  85. strings.Repeat("\u0300", 29) + "\u0318",
  86. "\u0318" + strings.Repeat("\u0300", 29),
  87. true,
  88. }, {
  89. "modifier overflow in pattern",
  90. strings.Repeat("\u0300", 30) + "\u0318",
  91. "\u0318" + strings.Repeat("\u0300", 30),
  92. false,
  93. }} {
  94. m := New(language.Und)
  95. a := m.CompileString(tc.a)
  96. b := m.CompileString(tc.b)
  97. if got := reflect.DeepEqual(a, b); got != tc.want {
  98. t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want)
  99. }
  100. }
  101. }
  102. func TestForwardSearch(t *testing.T) {
  103. for i, tc := range []struct {
  104. desc string
  105. tag string
  106. options []Option
  107. pattern string
  108. text string
  109. want []int
  110. }{{
  111. // The semantics of an empty search is to match nothing.
  112. // TODO: change this to be in line with strings.Index? It is quite a
  113. // different beast, so not sure yet.
  114. desc: "empty pattern and text",
  115. tag: "und",
  116. pattern: "",
  117. text: "",
  118. want: nil, // TODO: consider: []int{0, 0},
  119. }, {
  120. desc: "non-empty pattern and empty text",
  121. tag: "und",
  122. pattern: " ",
  123. text: "",
  124. want: nil,
  125. }, {
  126. desc: "empty pattern and non-empty text",
  127. tag: "und",
  128. pattern: "",
  129. text: "abc",
  130. want: nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3},
  131. }, {
  132. // Variable-only patterns. We don't support variables at the moment,
  133. // but verify that, given this, the behavior is indeed as expected.
  134. desc: "exact match of variable",
  135. tag: "und",
  136. pattern: " ",
  137. text: " ",
  138. want: []int{0, 1},
  139. }, {
  140. desc: "variables not handled by default",
  141. tag: "und",
  142. pattern: "- ",
  143. text: " -",
  144. want: nil, // Would be (1, 2) for a median match with variable}.
  145. }, {
  146. desc: "multiple subsequent identical variables",
  147. tag: "und",
  148. pattern: " ",
  149. text: " ",
  150. want: []int{0, 1, 1, 2, 2, 3, 3, 4},
  151. }, {
  152. desc: "text with variables",
  153. tag: "und",
  154. options: []Option{IgnoreDiacritics},
  155. pattern: "abc",
  156. text: "3 abc 3",
  157. want: []int{2, 5},
  158. }, {
  159. desc: "pattern with interior variables",
  160. tag: "und",
  161. options: []Option{IgnoreDiacritics},
  162. pattern: "a b c",
  163. text: "3 a b c abc a b c 3",
  164. want: []int{2, 7}, // Would have 3 matches using variable.
  165. // TODO: Different variable handling settings.
  166. }, {
  167. // Options.
  168. desc: "match all levels",
  169. tag: "und",
  170. pattern: "Abc",
  171. text: "abcAbcABCÁbcábc",
  172. want: []int{3, 6},
  173. }, {
  174. desc: "ignore diacritics in text",
  175. tag: "und",
  176. options: []Option{IgnoreDiacritics},
  177. pattern: "Abc",
  178. text: "Ábc",
  179. want: []int{0, 4},
  180. }, {
  181. desc: "ignore diacritics in pattern",
  182. tag: "und",
  183. options: []Option{IgnoreDiacritics},
  184. pattern: "Ábc",
  185. text: "Abc",
  186. want: []int{0, 3},
  187. }, {
  188. desc: "ignore diacritics",
  189. tag: "und",
  190. options: []Option{IgnoreDiacritics},
  191. pattern: "Abc",
  192. text: "abcAbcABCÁbcábc",
  193. want: []int{3, 6, 9, 13},
  194. }, {
  195. desc: "ignore case",
  196. tag: "und",
  197. options: []Option{IgnoreCase},
  198. pattern: "Abc",
  199. text: "abcAbcABCÁbcábc",
  200. want: []int{0, 3, 3, 6, 6, 9},
  201. }, {
  202. desc: "ignore case and diacritics",
  203. tag: "und",
  204. options: []Option{IgnoreCase, IgnoreDiacritics},
  205. pattern: "Abc",
  206. text: "abcAbcABCÁbcábc",
  207. want: []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17},
  208. }, {
  209. desc: "ignore width to fullwidth",
  210. tag: "und",
  211. options: []Option{IgnoreWidth},
  212. pattern: "abc",
  213. text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
  214. want: []int{4, 13},
  215. }, {
  216. // TODO: distinguish between case and width.
  217. desc: "don't ignore width to fullwidth, ignoring only case",
  218. tag: "und",
  219. options: []Option{IgnoreCase},
  220. pattern: "abc",
  221. text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
  222. want: []int{4, 13},
  223. }, {
  224. desc: "ignore width to fullwidth and diacritics",
  225. tag: "und",
  226. options: []Option{IgnoreWidth, IgnoreDiacritics},
  227. pattern: "abc",
  228. text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
  229. want: []int{4, 13},
  230. }, {
  231. desc: "whole grapheme, single rune",
  232. tag: "und",
  233. pattern: "eee",
  234. text: "123 eeé 123",
  235. want: nil,
  236. }, {
  237. // Note: rules on when to apply contractions may, for certain languages,
  238. // differ between search and collation. For example, "ch" is not
  239. // considered a contraction for the purpose of searching in Spanish.
  240. // Therefore, be careful picking this test.
  241. desc: "whole grapheme, contractions",
  242. tag: "da",
  243. pattern: "aba",
  244. // Fails at the primary level, because "aa" is a contraction.
  245. text: "123 abaa 123",
  246. want: []int{},
  247. }, {
  248. desc: "whole grapheme, trailing modifier",
  249. tag: "und",
  250. pattern: "eee",
  251. text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
  252. want: nil,
  253. }, {
  254. // Language-specific matching.
  255. desc: "",
  256. tag: "da",
  257. options: []Option{IgnoreCase},
  258. pattern: "Århus",
  259. text: "AarhusÅrhus Århus ",
  260. want: []int{0, 6, 6, 12, 14, 20},
  261. }, {
  262. desc: "",
  263. tag: "da",
  264. options: []Option{IgnoreCase},
  265. pattern: "Aarhus",
  266. text: "Århus Aarhus",
  267. want: []int{0, 6, 7, 13},
  268. }, {
  269. desc: "",
  270. tag: "en", // Å does not match A for English.
  271. options: []Option{IgnoreCase},
  272. pattern: "Aarhus",
  273. text: "Århus",
  274. want: nil,
  275. }, {
  276. desc: "ignore modifier in text",
  277. options: []Option{IgnoreDiacritics},
  278. tag: "und",
  279. pattern: "eee",
  280. text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
  281. want: []int{4, 9}, // Matches on grapheme boundary.
  282. }, {
  283. desc: "ignore multiple modifiers in text",
  284. options: []Option{IgnoreDiacritics},
  285. tag: "und",
  286. pattern: "eee",
  287. text: "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT
  288. want: []int{4, 11}, // Matches on grapheme boundary.
  289. }, {
  290. desc: "ignore modifier in pattern",
  291. options: []Option{IgnoreDiacritics},
  292. tag: "und",
  293. pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT
  294. text: "123 eee 123",
  295. want: []int{4, 7},
  296. }, {
  297. desc: "ignore multiple modifiers in pattern",
  298. options: []Option{IgnoreDiacritics},
  299. tag: "und",
  300. pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT
  301. text: "123 eee 123",
  302. want: []int{4, 7},
  303. }, {
  304. desc: "match non-normalized pattern",
  305. tag: "und",
  306. // U+0300: COMBINING GRAVE ACCENT (CCC=230)
  307. // U+031B: COMBINING HORN (CCC=216)
  308. pattern: "eee\u0300\u031b",
  309. text: "123 eee\u031b\u0300 123",
  310. want: []int{4, 11},
  311. }, {
  312. desc: "match non-normalized text",
  313. tag: "und",
  314. // U+0300: COMBINING GRAVE ACCENT (CCC=230)
  315. // U+031B: COMBINING HORN (CCC=216)
  316. pattern: "eee\u031b\u0300",
  317. text: "123 eee\u0300\u031b 123",
  318. want: []int{4, 11},
  319. }} {
  320. m := New(language.MustParse(tc.tag), tc.options...)
  321. p := m.CompileString(tc.pattern)
  322. for j := 0; j < len(tc.text); {
  323. start, end := p.IndexString(tc.text[j:])
  324. if start == -1 && end == -1 {
  325. j++
  326. continue
  327. }
  328. start += j
  329. end += j
  330. j = end
  331. if len(tc.want) == 0 {
  332. t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end)
  333. break
  334. }
  335. if tc.want[0] != start || tc.want[1] != end {
  336. t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2])
  337. tc.want = tc.want[2:]
  338. break
  339. }
  340. tc.want = tc.want[2:]
  341. }
  342. if len(tc.want) != 0 {
  343. t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2)
  344. }
  345. }
  346. }