parse_test.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "bytes"
  7. "strings"
  8. "testing"
  9. "golang.org/x/text/internal/tag"
  10. )
  11. type scanTest struct {
  12. ok bool // true if scanning does not result in an error
  13. in string
  14. tok []string // the expected tokens
  15. }
  16. var tests = []scanTest{
  17. {true, "", []string{}},
  18. {true, "1", []string{"1"}},
  19. {true, "en", []string{"en"}},
  20. {true, "root", []string{"root"}},
  21. {true, "maxchars", []string{"maxchars"}},
  22. {false, "bad/", []string{}},
  23. {false, "morethan8", []string{}},
  24. {false, "-", []string{}},
  25. {false, "----", []string{}},
  26. {false, "_", []string{}},
  27. {true, "en-US", []string{"en", "US"}},
  28. {true, "en_US", []string{"en", "US"}},
  29. {false, "en-US-", []string{"en", "US"}},
  30. {false, "en-US--", []string{"en", "US"}},
  31. {false, "en-US---", []string{"en", "US"}},
  32. {false, "en--US", []string{"en", "US"}},
  33. {false, "-en-US", []string{"en", "US"}},
  34. {false, "-en--US-", []string{"en", "US"}},
  35. {false, "-en--US-", []string{"en", "US"}},
  36. {false, "en-.-US", []string{"en", "US"}},
  37. {false, ".-en--US-.", []string{"en", "US"}},
  38. {false, "en-u.-US", []string{"en", "US"}},
  39. {true, "en-u1-US", []string{"en", "u1", "US"}},
  40. {true, "maxchar1_maxchar2-maxchar3", []string{"maxchar1", "maxchar2", "maxchar3"}},
  41. {false, "moreThan8-moreThan8-e", []string{"e"}},
  42. }
  43. func TestScan(t *testing.T) {
  44. for i, tt := range tests {
  45. scan := makeScannerString(tt.in)
  46. for j := 0; !scan.done; j++ {
  47. if j >= len(tt.tok) {
  48. t.Errorf("%d: extra token %q", i, scan.token)
  49. } else if tag.Compare(tt.tok[j], scan.token) != 0 {
  50. t.Errorf("%d: token %d: found %q; want %q", i, j, scan.token, tt.tok[j])
  51. break
  52. }
  53. scan.scan()
  54. }
  55. if s := strings.Join(tt.tok, "-"); tag.Compare(s, bytes.Replace(scan.b, b("_"), b("-"), -1)) != 0 {
  56. t.Errorf("%d: input: found %q; want %q", i, scan.b, s)
  57. }
  58. if (scan.err == nil) != tt.ok {
  59. t.Errorf("%d: ok: found %v; want %v", i, scan.err == nil, tt.ok)
  60. }
  61. }
  62. }
  63. func TestAcceptMinSize(t *testing.T) {
  64. for i, tt := range tests {
  65. // count number of successive tokens with a minimum size.
  66. for sz := 1; sz <= 8; sz++ {
  67. scan := makeScannerString(tt.in)
  68. scan.end, scan.next = 0, 0
  69. end := scan.acceptMinSize(sz)
  70. n := 0
  71. for i := 0; i < len(tt.tok) && len(tt.tok[i]) >= sz; i++ {
  72. n += len(tt.tok[i])
  73. if i > 0 {
  74. n++
  75. }
  76. }
  77. if end != n {
  78. t.Errorf("%d:%d: found len %d; want %d", i, sz, end, n)
  79. }
  80. }
  81. }
  82. }
  83. type parseTest struct {
  84. i int // the index of this test
  85. in string
  86. lang, script, region string
  87. variants, ext string
  88. extList []string // only used when more than one extension is present
  89. invalid bool
  90. rewrite bool // special rewrite not handled by parseTag
  91. changed bool // string needed to be reformatted
  92. }
  93. func parseTests() []parseTest {
  94. tests := []parseTest{
  95. {in: "root", lang: "und"},
  96. {in: "und", lang: "und"},
  97. {in: "en", lang: "en"},
  98. {in: "xy", lang: "und", invalid: true},
  99. {in: "en-ZY", lang: "en", invalid: true},
  100. {in: "gsw", lang: "gsw"},
  101. {in: "sr_Latn", lang: "sr", script: "Latn"},
  102. {in: "af-Arab", lang: "af", script: "Arab"},
  103. {in: "nl-BE", lang: "nl", region: "BE"},
  104. {in: "es-419", lang: "es", region: "419"},
  105. {in: "und-001", lang: "und", region: "001"},
  106. {in: "de-latn-be", lang: "de", script: "Latn", region: "BE"},
  107. // Variants
  108. {in: "de-1901", lang: "de", variants: "1901"},
  109. // Accept with unsuppressed script.
  110. {in: "de-Latn-1901", lang: "de", script: "Latn", variants: "1901"},
  111. // Specialized.
  112. {in: "sl-rozaj", lang: "sl", variants: "rozaj"},
  113. {in: "sl-rozaj-lipaw", lang: "sl", variants: "rozaj-lipaw"},
  114. {in: "sl-rozaj-biske", lang: "sl", variants: "rozaj-biske"},
  115. {in: "sl-rozaj-biske-1994", lang: "sl", variants: "rozaj-biske-1994"},
  116. {in: "sl-rozaj-1994", lang: "sl", variants: "rozaj-1994"},
  117. // Maximum number of variants while adhering to prefix rules.
  118. {in: "sl-rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp"},
  119. // Sorting.
  120. {in: "sl-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
  121. {in: "sl-rozaj-biske-1994-alalc97-fonupa-fonipa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", changed: true},
  122. {in: "nl-fonxsamp-alalc97-fonipa-fonupa", lang: "nl", variants: "alalc97-fonipa-fonupa-fonxsamp", changed: true},
  123. // Duplicates variants are removed, but not an error.
  124. {in: "nl-fonupa-fonupa", lang: "nl", variants: "fonupa"},
  125. // Variants that do not have correct prefixes. We still accept these.
  126. {in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
  127. {in: "sl-rozaj-lipaw-1994", lang: "sl", variants: "rozaj-lipaw-1994"},
  128. {in: "sl-1994-biske-rozaj-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
  129. {in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
  130. // Invalid variant.
  131. {in: "de-1902", lang: "de", variants: "", invalid: true},
  132. {in: "EN_CYRL", lang: "en", script: "Cyrl"},
  133. // private use and extensions
  134. {in: "x-a-b-c-d", ext: "x-a-b-c-d"},
  135. {in: "x_A.-B-C_D", ext: "x-b-c-d", invalid: true, changed: true},
  136. {in: "x-aa-bbbb-cccccccc-d", ext: "x-aa-bbbb-cccccccc-d"},
  137. {in: "en-c_cc-b-bbb-a-aaa", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc"}},
  138. {in: "en-x_cc-b-bbb-a-aaa", lang: "en", ext: "x-cc-b-bbb-a-aaa", changed: true},
  139. {in: "en-c_cc-b-bbb-a-aaa-x-x", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc", "x-x"}},
  140. {in: "en-v-c", lang: "en", ext: "", invalid: true},
  141. {in: "en-v-abcdefghi", lang: "en", ext: "", invalid: true},
  142. {in: "en-v-abc-x", lang: "en", ext: "v-abc", invalid: true},
  143. {in: "en-v-abc-x-", lang: "en", ext: "v-abc", invalid: true},
  144. {in: "en-v-abc-w-x-xx", lang: "en", extList: []string{"v-abc", "x-xx"}, invalid: true, changed: true},
  145. {in: "en-v-abc-w-y-yx", lang: "en", extList: []string{"v-abc", "y-yx"}, invalid: true, changed: true},
  146. {in: "en-v-c-abc", lang: "en", ext: "c-abc", invalid: true, changed: true},
  147. {in: "en-v-w-abc", lang: "en", ext: "w-abc", invalid: true, changed: true},
  148. {in: "en-v-x-abc", lang: "en", ext: "x-abc", invalid: true, changed: true},
  149. {in: "en-v-x-a", lang: "en", ext: "x-a", invalid: true, changed: true},
  150. {in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
  151. {in: "en-u-c", lang: "en", ext: "", invalid: true},
  152. {in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
  153. {in: "en-u-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk", changed: true},
  154. {in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", changed: true},
  155. {in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
  156. {in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
  157. {in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
  158. {in: "en-u-co-phonebook", lang: "en", ext: "u-co", invalid: true},
  159. {in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-co-cu-xau", invalid: true, changed: true},
  160. {in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
  161. {in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
  162. {in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
  163. {in: "en-scotland-u-co-phonebk", lang: "en", variants: "scotland", ext: "u-co-phonebk"},
  164. {in: "en-u-cu-xua-co-phonebk", lang: "en", ext: "u-co-phonebk-cu-xua", changed: true},
  165. {in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-abc-def-co-phonebk-cu-xua", changed: true},
  166. {in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
  167. {in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
  168. {in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk-cu"}, changed: true},
  169. {in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-co-cu-xau"}, changed: true},
  170. // LDML spec is not specific about it, but remove duplicates and return an error if the values differ.
  171. {in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
  172. // No change as the result is a substring of the original!
  173. {in: "en-US-u-cu-xau-cu-eur", lang: "en", region: "US", ext: "u-cu-xau", invalid: true, changed: false},
  174. {in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
  175. {in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
  176. {in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
  177. // Not necessary to have changed here.
  178. {in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
  179. {in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
  180. {in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
  181. // invalid
  182. {in: "", lang: "und", invalid: true},
  183. {in: "-", lang: "und", invalid: true},
  184. {in: "x", lang: "und", invalid: true},
  185. {in: "x-", lang: "und", invalid: true},
  186. {in: "x--", lang: "und", invalid: true},
  187. {in: "a-a-b-c-d", lang: "und", invalid: true},
  188. {in: "en-", lang: "en", invalid: true},
  189. {in: "enne-", lang: "und", invalid: true},
  190. {in: "en.", lang: "und", invalid: true},
  191. {in: "en.-latn", lang: "und", invalid: true},
  192. {in: "en.-en", lang: "en", invalid: true},
  193. {in: "x-a-tooManyChars-c-d", ext: "x-a-c-d", invalid: true, changed: true},
  194. {in: "a-tooManyChars-c-d", lang: "und", invalid: true},
  195. // TODO: check key-value validity
  196. // { in: "en-u-cu-xd", lang: "en", ext: "u-cu-xd", invalid: true },
  197. {in: "en-t-abcd", lang: "en", invalid: true},
  198. {in: "en-Latn-US-en", lang: "en", script: "Latn", region: "US", invalid: true},
  199. // rewrites (more tests in TestGrandfathered)
  200. {in: "zh-min-nan", lang: "nan"},
  201. {in: "zh-yue", lang: "yue"},
  202. {in: "zh-xiang", lang: "hsn", rewrite: true},
  203. {in: "zh-guoyu", lang: "cmn", rewrite: true},
  204. {in: "iw", lang: "iw"},
  205. {in: "sgn-BE-FR", lang: "sfb", rewrite: true},
  206. {in: "i-klingon", lang: "tlh", rewrite: true},
  207. }
  208. for i, tt := range tests {
  209. tests[i].i = i
  210. if tt.extList != nil {
  211. tests[i].ext = strings.Join(tt.extList, "-")
  212. }
  213. if tt.ext != "" && tt.extList == nil {
  214. tests[i].extList = []string{tt.ext}
  215. }
  216. }
  217. return tests
  218. }
  219. func TestParseExtensions(t *testing.T) {
  220. for i, tt := range parseTests() {
  221. if tt.ext == "" || tt.rewrite {
  222. continue
  223. }
  224. scan := makeScannerString(tt.in)
  225. if len(scan.b) > 1 && scan.b[1] != '-' {
  226. scan.end = nextExtension(string(scan.b), 0)
  227. scan.next = scan.end + 1
  228. scan.scan()
  229. }
  230. start := scan.start
  231. scan.toLower(start, len(scan.b))
  232. parseExtensions(&scan)
  233. ext := string(scan.b[start:])
  234. if ext != tt.ext {
  235. t.Errorf("%d(%s): ext was %v; want %v", i, tt.in, ext, tt.ext)
  236. }
  237. if changed := !strings.HasPrefix(tt.in[start:], ext); changed != tt.changed {
  238. t.Errorf("%d(%s): changed was %v; want %v", i, tt.in, changed, tt.changed)
  239. }
  240. }
  241. }
  242. // partChecks runs checks for each part by calling the function returned by f.
  243. func partChecks(t *testing.T, f func(*testing.T, *parseTest) (Tag, bool)) {
  244. for i, tt := range parseTests() {
  245. t.Run(tt.in, func(t *testing.T) {
  246. tag, skip := f(t, &tt)
  247. if skip {
  248. return
  249. }
  250. if l, _ := getLangID(b(tt.lang)); l != tag.LangID {
  251. t.Errorf("%d: lang was %q; want %q", i, tag.LangID, l)
  252. }
  253. if sc, _ := getScriptID(script, b(tt.script)); sc != tag.ScriptID {
  254. t.Errorf("%d: script was %q; want %q", i, tag.ScriptID, sc)
  255. }
  256. if r, _ := getRegionID(b(tt.region)); r != tag.RegionID {
  257. t.Errorf("%d: region was %q; want %q", i, tag.RegionID, r)
  258. }
  259. if tag.str == "" {
  260. return
  261. }
  262. p := int(tag.pVariant)
  263. if p < int(tag.pExt) {
  264. p++
  265. }
  266. if s, g := tag.str[p:tag.pExt], tt.variants; s != g {
  267. t.Errorf("%d: variants was %q; want %q", i, s, g)
  268. }
  269. p = int(tag.pExt)
  270. if p > 0 && p < len(tag.str) {
  271. p++
  272. }
  273. if s, g := (tag.str)[p:], tt.ext; s != g {
  274. t.Errorf("%d: extensions were %q; want %q", i, s, g)
  275. }
  276. })
  277. }
  278. }
  279. func TestParseTag(t *testing.T) {
  280. partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
  281. if strings.HasPrefix(tt.in, "x-") || tt.rewrite {
  282. return Tag{}, true
  283. }
  284. scan := makeScannerString(tt.in)
  285. id, end := parseTag(&scan)
  286. id.str = string(scan.b[:end])
  287. tt.ext = ""
  288. tt.extList = []string{}
  289. return id, false
  290. })
  291. }
  292. func TestParse(t *testing.T) {
  293. partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
  294. id, err := Parse(tt.in)
  295. ext := ""
  296. if id.str != "" {
  297. if strings.HasPrefix(id.str, "x-") {
  298. ext = id.str
  299. } else if int(id.pExt) < len(id.str) && id.pExt > 0 {
  300. ext = id.str[id.pExt+1:]
  301. }
  302. }
  303. if tag, _ := Parse(id.String()); tag.String() != id.String() {
  304. t.Errorf("%d:%s: reparse was %q; want %q", tt.i, tt.in, id.String(), tag.String())
  305. }
  306. if ext != tt.ext {
  307. t.Errorf("%d:%s: ext was %q; want %q", tt.i, tt.in, ext, tt.ext)
  308. }
  309. changed := id.str != "" && !strings.HasPrefix(tt.in, id.str)
  310. if changed != tt.changed {
  311. t.Errorf("%d:%s: changed was %v; want %v", tt.i, tt.in, changed, tt.changed)
  312. }
  313. if (err != nil) != tt.invalid {
  314. t.Errorf("%d:%s: invalid was %v; want %v. Error: %v", tt.i, tt.in, err != nil, tt.invalid, err)
  315. }
  316. return id, false
  317. })
  318. }
  319. func TestErrors(t *testing.T) {
  320. mkInvalid := func(s string) error {
  321. return NewValueError([]byte(s))
  322. }
  323. tests := []struct {
  324. in string
  325. out error
  326. }{
  327. // invalid subtags.
  328. {"ac", mkInvalid("ac")},
  329. {"AC", mkInvalid("ac")},
  330. {"aa-Uuuu", mkInvalid("Uuuu")},
  331. {"aa-AB", mkInvalid("AB")},
  332. // ill-formed wins over invalid.
  333. {"ac-u", ErrSyntax},
  334. {"ac-u-ca", mkInvalid("ac")},
  335. {"ac-u-ca-co-pinyin", mkInvalid("ac")},
  336. {"noob", ErrSyntax},
  337. }
  338. for _, tt := range tests {
  339. _, err := Parse(tt.in)
  340. if err != tt.out {
  341. t.Errorf("%s: was %q; want %q", tt.in, err, tt.out)
  342. }
  343. }
  344. }