reg_test.go 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package collate
  5. import (
  6. "archive/zip"
  7. "bufio"
  8. "bytes"
  9. "flag"
  10. "io"
  11. "io/ioutil"
  12. "log"
  13. "path"
  14. "regexp"
  15. "strconv"
  16. "strings"
  17. "testing"
  18. "unicode/utf8"
  19. "golang.org/x/text/collate/build"
  20. "golang.org/x/text/internal/gen"
  21. "golang.org/x/text/language"
  22. )
  23. var long = flag.Bool("long", false,
  24. "run time-consuming tests, such as tests that fetch data online")
// This regression test runs tests for the test files in CollationTest.zip
// (taken from https://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/).
//
// The test files have the following form:
// # header
// 0009 0021; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 025E]
// 0009 003F; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 0263]
// 000A 0021; # ('\u000A') <LINE FEED (LF)> [| | | 0202 025E]
// 000A 003F; # ('\u000A') <LINE FEED (LF)> [| | | 0202 0263]
//
// The part before the semicolon is the hex representation of a sequence
// of runes. After the hash mark is a comment. The strings
// represented by the rune sequences are in the file in sorted order, as
// defined by the DUCET.
//
// Test holds the entries loaded from one such file.
type Test struct {
	// name is the base name of the file inside the archive.
	name string
	// str holds the UTF-8 encoding of each entry's rune sequence, in file
	// order. Entries that contain an invalid rune are left out.
	str [][]byte
	// comment holds, for each element of str, the text that followed the
	// hash mark on the same line.
	comment []string
}
  44. var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)
  45. var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
  46. func TestCollation(t *testing.T) {
  47. if !gen.IsLocal() && !*long {
  48. t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source")
  49. }
  50. t.Skip("must first update to new file format to support test")
  51. for _, test := range loadTestData() {
  52. doTest(t, test)
  53. }
  54. }
  55. func Error(e error) {
  56. if e != nil {
  57. log.Fatal(e)
  58. }
  59. }
  60. // parseUCA parses a Default Unicode Collation Element Table of the format
  61. // specified in https://www.unicode.org/reports/tr10/#File_Format.
  62. // It returns the variable top.
  63. func parseUCA(builder *build.Builder) {
  64. r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt")
  65. defer r.Close()
  66. input := bufio.NewReader(r)
  67. colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
  68. for i := 1; true; i++ {
  69. l, prefix, err := input.ReadLine()
  70. if err == io.EOF {
  71. break
  72. }
  73. Error(err)
  74. line := string(l)
  75. if prefix {
  76. log.Fatalf("%d: buffer overflow", i)
  77. }
  78. if len(line) == 0 || line[0] == '#' {
  79. continue
  80. }
  81. if line[0] == '@' {
  82. if strings.HasPrefix(line[1:], "version ") {
  83. if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() {
  84. log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion())
  85. }
  86. }
  87. } else {
  88. // parse entries
  89. part := strings.Split(line, " ; ")
  90. if len(part) != 2 {
  91. log.Fatalf("%d: production rule without ';': %v", i, line)
  92. }
  93. lhs := []rune{}
  94. for _, v := range strings.Split(part[0], " ") {
  95. if v != "" {
  96. lhs = append(lhs, rune(convHex(i, v)))
  97. }
  98. }
  99. vars := []int{}
  100. rhs := [][]int{}
  101. for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
  102. if m[1] == "*" {
  103. vars = append(vars, i)
  104. }
  105. elem := []int{}
  106. for _, h := range strings.Split(m[2], ".") {
  107. elem = append(elem, convHex(i, h))
  108. }
  109. rhs = append(rhs, elem)
  110. }
  111. builder.Add(lhs, rhs, vars)
  112. }
  113. }
  114. }
  115. func convHex(line int, s string) int {
  116. r, e := strconv.ParseInt(s, 16, 32)
  117. if e != nil {
  118. log.Fatalf("%d: %v", line, e)
  119. }
  120. return int(r)
  121. }
  122. func loadTestData() []Test {
  123. f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip")
  124. buffer, err := ioutil.ReadAll(f)
  125. f.Close()
  126. Error(err)
  127. archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
  128. Error(err)
  129. tests := []Test{}
  130. for _, f := range archive.File {
  131. // Skip the short versions, which are simply duplicates of the long versions.
  132. if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
  133. continue
  134. }
  135. ff, err := f.Open()
  136. Error(err)
  137. defer ff.Close()
  138. scanner := bufio.NewScanner(ff)
  139. test := Test{name: path.Base(f.Name)}
  140. for scanner.Scan() {
  141. line := scanner.Text()
  142. if len(line) <= 1 || line[0] == '#' {
  143. if m := versionRe.FindStringSubmatch(line); m != nil {
  144. if m[1] != gen.UnicodeVersion() {
  145. log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion())
  146. }
  147. }
  148. continue
  149. }
  150. m := testRe.FindStringSubmatch(line)
  151. if m == nil || len(m) < 3 {
  152. log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
  153. }
  154. str := []byte{}
  155. // In the regression test data (unpaired) surrogates are assigned a weight
  156. // corresponding to their code point value. However, utf8.DecodeRune,
  157. // which is used to compute the implicit weight, assigns FFFD to surrogates.
  158. // We therefore skip tests with surrogates. This skips about 35 entries
  159. // per test.
  160. valid := true
  161. for _, split := range strings.Split(m[1], " ") {
  162. r, err := strconv.ParseUint(split, 16, 64)
  163. Error(err)
  164. valid = valid && utf8.ValidRune(rune(r))
  165. str = append(str, string(rune(r))...)
  166. }
  167. if valid {
  168. test.str = append(test.str, str)
  169. test.comment = append(test.comment, m[2])
  170. }
  171. }
  172. if scanner.Err() != nil {
  173. log.Fatal(scanner.Err())
  174. }
  175. tests = append(tests, test)
  176. }
  177. return tests
  178. }
// errorCount is not read or written anywhere in the visible part of this
// file. NOTE(review): confirm whether other files in the package use it.
var errorCount int
  180. func runes(b []byte) []rune {
  181. return []rune(string(b))
  182. }
// shifted is the language tag doTest passes to OptionsFromTag for every test
// file whose name does not contain "NON_IGNOR".
var shifted = language.MustParse("und-u-ka-shifted-ks-level4")
  184. func doTest(t *testing.T, tc Test) {
  185. bld := build.NewBuilder()
  186. parseUCA(bld)
  187. w, err := bld.Build()
  188. Error(err)
  189. var tag language.Tag
  190. if !strings.Contains(tc.name, "NON_IGNOR") {
  191. tag = shifted
  192. }
  193. c := NewFromTable(w, OptionsFromTag(tag))
  194. b := &Buffer{}
  195. prev := tc.str[0]
  196. for i := 1; i < len(tc.str); i++ {
  197. b.Reset()
  198. s := tc.str[i]
  199. ka := c.Key(b, prev)
  200. kb := c.Key(b, s)
  201. if r := bytes.Compare(ka, kb); r == 1 {
  202. t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r)
  203. prev = s
  204. continue
  205. }
  206. if r := c.Compare(prev, s); r == 1 {
  207. t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r)
  208. }
  209. if r := c.Compare(s, prev); r == -1 {
  210. t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r)
  211. }
  212. prev = s
  213. }
  214. }