ucd_test.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package norm
  5. import (
  6. "bufio"
  7. "bytes"
  8. "fmt"
  9. "regexp"
  10. "runtime"
  11. "strconv"
  12. "strings"
  13. "sync"
  14. "testing"
  15. "time"
  16. "unicode/utf8"
  17. "golang.org/x/text/internal/gen"
  18. "golang.org/x/text/internal/testtext"
  19. )
  20. var once sync.Once
  21. func skipShort(t *testing.T) {
  22. testtext.SkipIfNotLong(t)
  23. once.Do(func() { loadTestData(t) })
  24. }
  25. // This regression test runs the test set in NormalizationTest.txt
  26. // (taken from https://www.unicode.org/Public/<unicode.Version>/ucd/).
  27. //
  28. // NormalizationTest.txt has form:
  29. // @Part0 # Specific cases
  30. // #
  31. // 1E0A;1E0A;0044 0307;1E0A;0044 0307; # (Ḋ; Ḋ; D◌̇; Ḋ; D◌̇; ) LATIN CAPITAL LETTER D WITH DOT ABOVE
  32. // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # (Ḍ; Ḍ; D◌̣; Ḍ; D◌̣; ) LATIN CAPITAL LETTER D WITH DOT BELOW
  33. //
  34. // Each test has 5 columns (c1, c2, c3, c4, c5), where
  35. // (c1, c2, c3, c4, c5) == (c1, NFC(c1), NFD(c1), NFKC(c1), NFKD(c1))
  36. //
  37. // CONFORMANCE:
  38. // 1. The following invariants must be true for all conformant implementations
  39. //
  40. // NFC
  41. // c2 == NFC(c1) == NFC(c2) == NFC(c3)
  42. // c4 == NFC(c4) == NFC(c5)
  43. //
  44. // NFD
  45. // c3 == NFD(c1) == NFD(c2) == NFD(c3)
  46. // c5 == NFD(c4) == NFD(c5)
  47. //
  48. // NFKC
  49. // c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
  50. //
  51. // NFKD
  52. // c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
  53. //
  54. // 2. For every code point X assigned in this version of Unicode that is not
  55. // specifically listed in Part 1, the following invariants must be true
  56. // for all conformant implementations:
  57. //
  58. // X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
  59. //
// Column types. These are the indices into Test.cols, in the order in which
// the columns c1..c5 appear on a NormalizationTest.txt data line.
const (
	cRaw        = iota // c1: the source string
	cNFC               // c2: NFC(c1)
	cNFD               // c3: NFD(c1)
	cNFKC              // c4: NFKC(c1)
	cNFKD              // c5: NFKD(c1)
	cMaxColumns        // number of columns; used as the length of Test.cols
)
// part holds the data from NormalizationTest.txt, one entry per @Part
// header in file order. It is filled by loadTestData and indexed by
// Test.partnr.
var part []Part
// Part is one "@PartN" section of NormalizationTest.txt together with the
// tests that were read after its header line.
type Part struct {
	name   string // description taken from the text after "# " on the header line
	number int    // the digit N parsed from "@PartN"
	tests  []Test // data lines read between this header and the next one
}
// Test is a single data line of NormalizationTest.txt.
type Test struct {
	name   string              // text after " # " on the data line
	partnr int                 // index into part of the Part this test belongs to
	number int                 // sequence number shown by Name; a negative value suppresses it
	r      rune                // first code point parsed from the line; used for character by character test
	cols   [cMaxColumns]string // the five columns decoded to UTF-8, indexed by cRaw..cNFKD
}
  83. func (t Test) Name() string {
  84. if t.number < 0 {
  85. return part[t.partnr].name
  86. }
  87. return fmt.Sprintf("%s:%d", part[t.partnr].name, t.number)
  88. }
  89. var partRe = regexp.MustCompile(`@Part(\d) # (.*)$`)
  90. var testRe = regexp.MustCompile(`^` + strings.Repeat(`([\dA-F ]+);`, 5) + ` # (.*)$`)
  91. var counter int
  92. // Load the data form NormalizationTest.txt
  93. func loadTestData(t *testing.T) {
  94. f := gen.OpenUCDFile("NormalizationTest.txt")
  95. defer f.Close()
  96. scanner := bufio.NewScanner(f)
  97. for scanner.Scan() {
  98. line := scanner.Text()
  99. if len(line) == 0 || line[0] == '#' {
  100. continue
  101. }
  102. m := partRe.FindStringSubmatch(line)
  103. if m != nil {
  104. if len(m) < 3 {
  105. t.Fatal("Failed to parse Part: ", line)
  106. }
  107. i, err := strconv.Atoi(m[1])
  108. if err != nil {
  109. t.Fatal(err)
  110. }
  111. name := m[2]
  112. part = append(part, Part{name: name[:len(name)-1], number: i})
  113. continue
  114. }
  115. m = testRe.FindStringSubmatch(line)
  116. if m == nil || len(m) < 7 {
  117. t.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
  118. }
  119. test := Test{name: m[6], partnr: len(part) - 1, number: counter}
  120. counter++
  121. for j := 1; j < len(m)-1; j++ {
  122. for _, split := range strings.Split(m[j], " ") {
  123. r, err := strconv.ParseUint(split, 16, 64)
  124. if err != nil {
  125. t.Fatal(err)
  126. }
  127. if test.r == 0 {
  128. // save for CharacterByCharacterTests
  129. test.r = rune(r)
  130. }
  131. var buf [utf8.UTFMax]byte
  132. sz := utf8.EncodeRune(buf[:], rune(r))
  133. test.cols[j-1] += string(buf[:sz])
  134. }
  135. }
  136. part := &part[len(part)-1]
  137. part.tests = append(part.tests, test)
  138. }
  139. if scanner.Err() != nil {
  140. t.Fatal(scanner.Err())
  141. }
  142. }
  143. func cmpResult(t *testing.T, tc *Test, name string, f Form, gold, test, result string) {
  144. if gold != result {
  145. t.Errorf("%s:%s: %s(%+q)=%+q; want %+q: %s",
  146. tc.Name(), name, fstr[f], test, result, gold, tc.name)
  147. }
  148. }
  149. func cmpIsNormal(t *testing.T, tc *Test, name string, f Form, test string, result, want bool) {
  150. if result != want {
  151. t.Errorf("%s:%s: %s(%+q)=%v; want %v", tc.Name(), name, fstr[f], test, result, want)
  152. }
  153. }
  154. func doTest(t *testing.T, tc *Test, f Form, gold, test string) {
  155. testb := []byte(test)
  156. result := f.Bytes(testb)
  157. cmpResult(t, tc, "Bytes", f, gold, test, string(result))
  158. sresult := f.String(test)
  159. cmpResult(t, tc, "String", f, gold, test, sresult)
  160. acc := []byte{}
  161. i := Iter{}
  162. i.InitString(f, test)
  163. for !i.Done() {
  164. acc = append(acc, i.Next()...)
  165. }
  166. cmpResult(t, tc, "Iter.Next", f, gold, test, string(acc))
  167. buf := make([]byte, 128)
  168. acc = nil
  169. for p := 0; p < len(testb); {
  170. nDst, nSrc, _ := f.Transform(buf, testb[p:], true)
  171. acc = append(acc, buf[:nDst]...)
  172. p += nSrc
  173. }
  174. cmpResult(t, tc, "Transform", f, gold, test, string(acc))
  175. for i := range test {
  176. out := f.Append(f.Bytes([]byte(test[:i])), []byte(test[i:])...)
  177. cmpResult(t, tc, fmt.Sprintf(":Append:%d", i), f, gold, test, string(out))
  178. }
  179. cmpIsNormal(t, tc, "IsNormal", f, test, f.IsNormal([]byte(test)), test == gold)
  180. cmpIsNormal(t, tc, "IsNormalString", f, test, f.IsNormalString(test), test == gold)
  181. }
  182. func doConformanceTests(t *testing.T, tc *Test, partn int) {
  183. for i := 0; i <= 2; i++ {
  184. doTest(t, tc, NFC, tc.cols[1], tc.cols[i])
  185. doTest(t, tc, NFD, tc.cols[2], tc.cols[i])
  186. doTest(t, tc, NFKC, tc.cols[3], tc.cols[i])
  187. doTest(t, tc, NFKD, tc.cols[4], tc.cols[i])
  188. }
  189. for i := 3; i <= 4; i++ {
  190. doTest(t, tc, NFC, tc.cols[3], tc.cols[i])
  191. doTest(t, tc, NFD, tc.cols[4], tc.cols[i])
  192. doTest(t, tc, NFKC, tc.cols[3], tc.cols[i])
  193. doTest(t, tc, NFKD, tc.cols[4], tc.cols[i])
  194. }
  195. }
  196. func TestCharacterByCharacter(t *testing.T) {
  197. skipShort(t)
  198. tests := part[1].tests
  199. var last rune = 0
  200. for i := 0; i <= len(tests); i++ { // last one is special case
  201. var r rune
  202. if i == len(tests) {
  203. r = 0x2FA1E // Don't have to go to 0x10FFFF
  204. } else {
  205. r = tests[i].r
  206. }
  207. for last++; last < r; last++ {
  208. // Check all characters that were not explicitly listed in the test.
  209. tc := &Test{partnr: 1, number: -1}
  210. char := string(last)
  211. doTest(t, tc, NFC, char, char)
  212. doTest(t, tc, NFD, char, char)
  213. doTest(t, tc, NFKC, char, char)
  214. doTest(t, tc, NFKD, char, char)
  215. }
  216. if i < len(tests) {
  217. doConformanceTests(t, &tests[i], 1)
  218. }
  219. }
  220. }
  221. func TestStandardTests(t *testing.T) {
  222. skipShort(t)
  223. for _, j := range []int{0, 2, 3} {
  224. for _, test := range part[j].tests {
  225. doConformanceTests(t, &test, j)
  226. }
  227. }
  228. }
  229. // TestPerformance verifies that normalization is O(n). If any of the
  230. // code does not properly check for maxCombiningChars, normalization
  231. // may exhibit O(n**2) behavior.
  232. func TestPerformance(t *testing.T) {
  233. skipShort(t)
  234. runtime.GOMAXPROCS(2)
  235. success := make(chan bool, 1)
  236. go func() {
  237. buf := bytes.Repeat([]byte("\u035D"), 1024*1024)
  238. buf = append(buf, "\u035B"...)
  239. NFC.Append(nil, buf...)
  240. success <- true
  241. }()
  242. timeout := time.After(1 * time.Second)
  243. select {
  244. case <-success:
  245. // test completed before the timeout
  246. case <-timeout:
  247. t.Errorf(`unexpectedly long time to complete PerformanceTest`)
  248. }
  249. }