normalize_test.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import (
	"bytes"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"testing"
	"unicode/utf8"

	"golang.org/x/text/internal/testtext"
	"golang.org/x/text/transform"
)

var (
	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
)

// pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
func pc(s string) []byte {
	b := bytes.NewBuffer(make([]byte, 0, len(s)))
	for i := 0; i < len(s); {
		r, sz := utf8.DecodeRuneInString(s[i:])
		n := 0
		if sz == 1 {
			// Special-case one-byte case to handle repetition for invalid UTF-8.
			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
			}
		} else {
			for _, r2 := range s[i:] {
				if r2 != r {
					break
				}
				n++
			}
		}
		b.WriteString(s[i : i+sz])
		if n > 1 {
			fmt.Fprintf(b, "{%d}", n)
		}
		i += sz * n
	}
	return b.Bytes()
}

// pidx finds the index from which two strings start to differ, plus context.
// It returns the index and ellipsis if the index is greater than 0.
func pidx(a, b string) (i int, prefix string) {
	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
	}
	if i < 8 {
		return 0, ""
	}
	i -= 3 // ensure taking at least one full rune before the difference.
	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
	}
	return i, "..."
}

type PositionTest struct {
	input  string
	pos    int
	buffer string // expected contents of reorderBuffer, if applicable
}

type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
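
// runPosTests runs each PositionTest through fn and checks both the returned
// position and the resulting buffer contents.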
func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
	rb := reorderBuffer{}
	rb.init(f, nil)
	for i, test := range tests {
		rb.reset()
		rb.src = inputString(test.input)
		rb.nsrc = len(test.input)
		pos, out := fn(&rb, test.input)
		if pos != test.pos {
			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
		}
		if outs := string(out); outs != test.buffer {
			k, pfx := pidx(outs, test.buffer)
			t.Errorf("%s:%d: buffer \nwas %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
		}
	}
}
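
// grave returns n copies of U+0300 COMBINING GRAVE ACCENT; rep repeats an
// arbitrary rune n times.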
func grave(n int) string {
	return rep(0x0300, n)
}

func rep(r rune, n int) string {
	return strings.Repeat(string(r), n)
}

const segSize = maxByteBufferSize

var cgj = GraphemeJoiner

var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},
	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},
	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},
	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}

func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
	rb.initString(NFD, s)
	rb.setFlusher(nil, appendFlush)
	p := decomposeSegment(rb, 0, true)
	return p, rb.out
}

func TestDecomposeSegment(t *testing.T) {
	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
}

var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}

func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundary([]byte(s)), nil
}

func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.FirstBoundaryInString(s), nil
}

func TestFirstBoundary(t *testing.T) {
	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
}

func TestNextBoundary(t *testing.T) {
	testCases := []struct {
		input string
		atEOF bool
		want  int
	}{
		// no boundary
		{"", true, 0},
		{"", false, -1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\x80\x80", true, 1},
		{"\x80\x80", false, 1},
		// illegal runes
		{"\xff", false, 1},
		{"\u0300\xff", false, 2},
		{"\u0300\xc0\x80\x80", false, 2},
		{"\xc2\x80\x80", false, 2},
		{"\xc2", false, -1},
		{"\xc2", true, 1},
		{"a\u0300\xc2", false, -1},
		{"a\u0300\xc2", true, 3},
		// boundaries
		{"a", true, 1},
		{"a", false, -1},
		{"aa", false, 1},
		{"\u0300", true, 2},
		{"\u0300", false, -1},
		{"\u0300a", false, 2},
		// Hangul
		{"\u1103\u1161", true, 6},
		{"\u1103\u1161", false, -1},
		{"\u110B\u1173\u11B7", false, -1},
		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
		{"\u1161\u110B\u1173\u11B7", false, 3},
		{"\u1173\u11B7\u1103\u1161", false, 6},
		// too many combining characters.
		{grave(maxNonStarters - 1), false, -1},
		{grave(maxNonStarters), false, 60},
		{grave(maxNonStarters + 1), false, 60},
	}
	for _, tc := range testCases {
		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
		}
	}
}

var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}

func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
	rb.setFlusher([]byte(s), appendFlush)
	decomposeToLastBoundary(rb)
	buf := rb.flush(nil)
	return len(rb.out), buf
}

func TestDecomposeToLastBoundary(t *testing.T) {
	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
}

var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},
	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}

func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
	return rb.f.form.LastBoundary([]byte(s)), nil
}

func TestLastBoundary(t *testing.T) {
	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
}

type spanTest struct {
	input string
	atEOF bool
	n     int
	err   error
}

var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}

var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}

var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether the last
	// character combines. Probably not worth it.
  410. {"ab\u0300", true, 1, transform.ErrEndOfSpan},
  411. {"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
  412. {"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
  413. {"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
  414. {"\u00C0\u035D", true, 4, nil},
  415. // we do not special case leading combining characters
  416. {"\u0300cd", true, 0, transform.ErrEndOfSpan},
  417. {"\u0300", true, 0, transform.ErrEndOfSpan},
  418. {"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
  419. {"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
  420. // incorrectly ordered combining characters
  421. {"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
  422. {"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
  423. // Hangul
  424. {"같은", true, 6, nil},
  425. {"같은", false, 3, transform.ErrShortSrc},
  426. // We return the start of the violating segment in case of overflow.
  427. {grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
  428. {grave(30), true, 0, transform.ErrEndOfSpan},
  429. }
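
// runSpanTests exercises both the Span and SpanString variants of f on each
// test case; the String variant is skipped when the Bytes variant fails.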
func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
	for i, tc := range testCases {
		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		ok := testtext.Run(t, s, func(t *testing.T) {
			n, err := f.Span([]byte(tc.input), tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
		if !ok {
			continue // Don't do the String variant if the Bytes variant failed.
		}
		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
		testtext.Run(t, s, func(t *testing.T) {
			n, err := f.SpanString(tc.input, tc.atEOF)
			if n != tc.n || err != tc.err {
				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
			}
		})
	}
}

func TestSpan(t *testing.T) {
	runSpanTests(t, "NFD", NFD, quickSpanTests)
	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
	runSpanTests(t, "NFC", NFC, quickSpanTests)
	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
}

var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}

var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}

var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}

var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}

func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormal([]byte(s)) {
		return 1, nil
	}
	return 0, nil
}

func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
	if rb.f.form.IsNormalString(s) {
		return 1, nil
	}
	return 0, nil
}

func TestIsNormal(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
}

func TestIsNormalString(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
}

type AppendTest struct {
	left  string
	right string
	out   string
}

type appendFunc func(f Form, out []byte, s string) []byte

var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
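
// runNormTests runs fn against the test cases for each of the four
// normalization forms.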
func runNormTests(t *testing.T, name string, fn appendFunc) {
	for f := NFC; f <= NFKD; f++ {
		runAppendTests(t, name, f, fn, normTests[f])
	}
}

func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
	for i, test := range tests {
		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
			id := pc(test.left + test.right)
			if *testn >= 0 && i != *testn {
				return
			}
			t.Run("fn", func(t *testing.T) {
				out := []byte(test.left)
				have := string(fn(f, out, test.right))
				if len(have) != len(test.out) {
					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
				}
				if have != test.out {
					k, pf := pidx(have, test.out)
					t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
				}
			})
			// Bootstrap by normalizing input. Ensures that the various variants
			// behave the same.
			for g := NFC; g <= NFKD; g++ {
				if f == g {
					continue
				}
				t.Run(fstr[g], func(t *testing.T) {
					want := g.String(test.left + test.right)
					have := string(fn(g, g.AppendString(nil, test.left), test.right))
					if len(have) != len(want) {
						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
					}
					if have != want {
						k, pf := pidx(have, want)
						t.Errorf("%+q:\nwas %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
					}
				})
			}
		})
	}
}

var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}

var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},
	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},
	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},
	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},
	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},
	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},
	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},
	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},
	{ // https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},
	{ // https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},
	{ // https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}

var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}

var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	//   inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},
	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},
	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}

var appendTestsNFKD = []AppendTest{
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},
	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}

func TestAppend(t *testing.T) {
	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
		return f.Append(out, []byte(s)...)
	})
}

func TestAppendString(t *testing.T) {
	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
		return f.AppendString(out, s)
	})
}

func TestBytes(t *testing.T) {
	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
		buf := []byte{}
		buf = append(buf, out...)
		buf = append(buf, s...)
		return f.Bytes(buf)
	})
}

func TestString(t *testing.T) {
	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
		outs := string(out) + s
		return []byte(f.String(outs))
	})
}
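
// runNM writes code to a temporary main package, builds it, and returns the
// output of go tool nm on the resulting binary.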
func runNM(code string) (string, error) {
	// Write the file.
	tmpdir, err := ioutil.TempDir(os.TempDir(), "normalize_test")
	if err != nil {
		return "", fmt.Errorf("failed to create tmpdir: %v", err)
	}
	defer os.RemoveAll(tmpdir)
	goTool := filepath.Join(runtime.GOROOT(), "bin", "go")
	filename := filepath.Join(tmpdir, "main.go")
	if err := ioutil.WriteFile(filename, []byte(code), 0644); err != nil {
		return "", fmt.Errorf("failed to write main.go: %v", err)
	}
	outputFile := filepath.Join(tmpdir, "main")
	// Build the binary.
	out, err := exec.Command(goTool, "build", "-o", outputFile, filename).CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("failed to execute command: %v", err)
	}
	// Get the symbols.
	out, err = exec.Command(goTool, "tool", "nm", outputFile).CombinedOutput()
	return string(out), err
}
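
// TestLinking verifies that the normalization tables are not linked into a
// binary that only references MaxSegmentSize, but are linked in once
// NFC.String is used.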
func TestLinking(t *testing.T) {
	const prog = `
	package main
	import "fmt"
	import "golang.org/x/text/unicode/norm"
	func main() { fmt.Println(norm.%s) }
	`
	baseline, errB := runNM(fmt.Sprintf(prog, "MaxSegmentSize"))
	withTables, errT := runNM(fmt.Sprintf(prog, `NFC.String("")`))
	if errB != nil || errT != nil {
		t.Skipf("TestLinking failed: %v and %v", errB, errT)
	}
	symbols := []string{"norm.formTable", "norm.nfkcValues", "norm.decomps"}
	for _, symbol := range symbols {
		if strings.Contains(baseline, symbol) {
			t.Errorf("found: %q unexpectedly", symbol)
		}
		if !strings.Contains(withTables, symbol) {
			t.Errorf("didn't find: %q unexpectedly", symbol)
		}
	}
}

func appendBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		f.Append(buf, in...)
	}
}

func bytesBench(f Form, in []byte) func() {
	return func() {
		f.Bytes(in)
	}
}

func iterBench(f Form, in []byte) func() {
	iter := Iter{}
	return func() {
		iter.Init(f, in)
		for !iter.Done() {
			iter.Next()
		}
	}
}

func transformBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
			log.Panic(n, len(in), err)
		}
	}
}

func readerBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		r := f.Reader(bytes.NewReader(in))
		var err error
		for err == nil {
			_, err = r.Read(buf)
		}
		if err != io.EOF {
			panic("")
		}
	}
}

func writerBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		r := f.Writer(bytes.NewBuffer(buf))
		if _, err := r.Write(in); err != nil {
			panic("")
		}
	}
}

func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
	bm = append(bm, appendBench(f, in))
	bm = append(bm, iterBench(f, in))
	bm = append(bm, transformBench(f, in))
	bm = append(bm, readerBench(f, in))
	bm = append(bm, writerBench(f, in))
	return bm
}
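
// doFormBenchmark normalizes s to the form inf and then benchmarks converting
// it to form f using all benchmark variants.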
func doFormBenchmark(b *testing.B, inf, f Form, s string) {
	b.StopTimer()
	in := inf.Bytes([]byte(s))
	bm := appendBenchmarks(nil, f, in)
	b.SetBytes(int64(len(in) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, fn := range bm {
			fn()
		}
	}
}
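
// doSingle benchmarks the closure produced by f for NFC on input s.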
func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
	b.StopTimer()
	fn := f(NFC, s)
	b.SetBytes(int64(len(s)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		fn()
	}
}

var (
	smallNoChange = []byte("nörmalization")
	smallChange   = []byte("No\u0308rmalization")
	ascii         = strings.Repeat("There is nothing to change here! ", 500)
)

func lowerBench(f Form, in []byte) func() {
	// Use package strings instead of bytes as it doesn't allocate memory
	// if there aren't any changes.
	s := string(in)
	return func() {
		strings.ToLower(s)
	}
}

func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}

func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}

func quickSpanBench(f Form, in []byte) func() {
	return func() {
		f.QuickSpan(in)
	}
}

func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}

func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}

func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}

func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}

func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}

func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}

func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}

func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}

func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}

func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}

func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}

func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}

func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}

func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}

func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}

func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}

func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}

func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}

func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}

func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}

// Hangul is often special-cased, so we test it separately.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}

func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}

func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}

func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}

var forms = []Form{NFC, NFD, NFKC, NFKD}
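
// doTextBenchmark runs every benchmark variant for every normalization form
// on the given text.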
func doTextBenchmark(b *testing.B, s string) {
	b.StopTimer()
	in := []byte(s)
	bm := []func(){}
	for _, f := range forms {
		bm = appendBenchmarks(bm, f, in)
	}
	b.SetBytes(int64(len(s) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, f := range bm {
			f()
		}
	}
}

func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}

func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}

func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}

func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}

func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}

func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}

func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}

func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}

var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"

// Tests sampled from the Canonical ordering tests (Part 2) of
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
\u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`

const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il

// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`

const threeByteUtf8 = txt_th

// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`

// http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`

const txt_cjk = txt_cn + txt_jp + txt_kr
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk

var txt_all_bytes = []byte(txt_all)