parse_test.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "strings"
  7. "testing"
  8. "golang.org/x/text/internal/language"
  9. )
  10. // equalTags compares language, script and region subtags only.
  11. func (t Tag) equalTags(a Tag) bool {
  12. return t.lang() == a.lang() &&
  13. t.script() == a.script() &&
  14. t.region() == a.region()
  15. }
  16. var errSyntax = language.ErrSyntax
  17. type parseTest struct {
  18. i int // the index of this test
  19. in string
  20. lang, script, region string
  21. variants, ext string
  22. extList []string // only used when more than one extension is present
  23. invalid bool
  24. rewrite bool // special rewrite not handled by parseTag
  25. changed bool // string needed to be reformatted
  26. }
  27. func parseTests() []parseTest {
  28. tests := []parseTest{
  29. {in: "root", lang: "und"},
  30. {in: "und", lang: "und"},
  31. {in: "en", lang: "en"},
  32. {in: "en-US-u-va-posix", lang: "en", region: "US", ext: "u-va-posix"},
  33. {in: "ca-ES-valencia", lang: "ca", region: "ES", variants: "valencia"},
  34. {in: "en-US-u-rg-gbzzzz", lang: "en", region: "US", ext: "u-rg-gbzzzz"},
  35. {in: "xy", lang: "und", invalid: true},
  36. {in: "en-ZY", lang: "en", invalid: true},
  37. {in: "gsw", lang: "gsw"},
  38. {in: "sr_Latn", lang: "sr", script: "Latn"},
  39. {in: "af-Arab", lang: "af", script: "Arab"},
  40. {in: "nl-BE", lang: "nl", region: "BE"},
  41. {in: "es-419", lang: "es", region: "419"},
  42. {in: "und-001", lang: "und", region: "001"},
  43. {in: "de-latn-be", lang: "de", script: "Latn", region: "BE"},
  44. // Variants
  45. {in: "de-1901", lang: "de", variants: "1901"},
  46. // Accept with unsuppressed script.
  47. {in: "de-Latn-1901", lang: "de", script: "Latn", variants: "1901"},
  48. // Specialized.
  49. {in: "sl-rozaj", lang: "sl", variants: "rozaj"},
  50. {in: "sl-rozaj-lipaw", lang: "sl", variants: "rozaj-lipaw"},
  51. {in: "sl-rozaj-biske", lang: "sl", variants: "rozaj-biske"},
  52. {in: "sl-rozaj-biske-1994", lang: "sl", variants: "rozaj-biske-1994"},
  53. {in: "sl-rozaj-1994", lang: "sl", variants: "rozaj-1994"},
  54. // Maximum number of variants while adhering to prefix rules.
  55. {in: "sl-rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp"},
  56. // Sorting.
  57. {in: "sl-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
  58. {in: "sl-rozaj-biske-1994-alalc97-fonupa-fonipa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", changed: true},
  59. {in: "nl-fonxsamp-alalc97-fonipa-fonupa", lang: "nl", variants: "alalc97-fonipa-fonupa-fonxsamp", changed: true},
  60. // Duplicates variants are removed, but not an error.
  61. {in: "nl-fonupa-fonupa", lang: "nl", variants: "fonupa"},
  62. // Variants that do not have correct prefixes. We still accept these.
  63. {in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
  64. {in: "sl-rozaj-lipaw-1994", lang: "sl", variants: "rozaj-lipaw-1994"},
  65. {in: "sl-1994-biske-rozaj-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
  66. {in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
  67. // Invalid variant.
  68. {in: "de-1902", lang: "de", variants: "", invalid: true},
  69. {in: "EN_CYRL", lang: "en", script: "Cyrl"},
  70. // private use and extensions
  71. {in: "x-a-b-c-d", ext: "x-a-b-c-d"},
  72. {in: "x_A.-B-C_D", ext: "x-b-c-d", invalid: true, changed: true},
  73. {in: "x-aa-bbbb-cccccccc-d", ext: "x-aa-bbbb-cccccccc-d"},
  74. {in: "en-c_cc-b-bbb-a-aaa", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc"}},
  75. {in: "en-x_cc-b-bbb-a-aaa", lang: "en", ext: "x-cc-b-bbb-a-aaa", changed: true},
  76. {in: "en-c_cc-b-bbb-a-aaa-x-x", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc", "x-x"}},
  77. {in: "en-v-c", lang: "en", ext: "", invalid: true},
  78. {in: "en-v-abcdefghi", lang: "en", ext: "", invalid: true},
  79. {in: "en-v-abc-x", lang: "en", ext: "v-abc", invalid: true},
  80. {in: "en-v-abc-x-", lang: "en", ext: "v-abc", invalid: true},
  81. {in: "en-v-abc-w-x-xx", lang: "en", extList: []string{"v-abc", "x-xx"}, invalid: true, changed: true},
  82. {in: "en-v-abc-w-y-yx", lang: "en", extList: []string{"v-abc", "y-yx"}, invalid: true, changed: true},
  83. {in: "en-v-c-abc", lang: "en", ext: "c-abc", invalid: true, changed: true},
  84. {in: "en-v-w-abc", lang: "en", ext: "w-abc", invalid: true, changed: true},
  85. {in: "en-v-x-abc", lang: "en", ext: "x-abc", invalid: true, changed: true},
  86. {in: "en-v-x-a", lang: "en", ext: "x-a", invalid: true, changed: true},
  87. {in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
  88. {in: "en-u-c", lang: "en", ext: "", invalid: true},
  89. {in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
  90. {in: "en-u-co-phonebk-ca", lang: "en", ext: "u-co-phonebk", invalid: true},
  91. {in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
  92. {in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
  93. {in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
  94. {in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
  95. {in: "en-u-co-phonebook", lang: "en", ext: "", invalid: true},
  96. {in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-cu-xau", invalid: true, changed: true},
  97. {in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
  98. {in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
  99. {in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
  100. {in: "en-scotland-u-co-phonebk", lang: "en", variants: "scotland", ext: "u-co-phonebk"},
  101. {in: "en-u-cu-xua-co-phonebk", lang: "en", ext: "u-co-phonebk-cu-xua", changed: true},
  102. {in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-abc-def-co-phonebk-cu-xua", changed: true},
  103. {in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
  104. {in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
  105. // Invalid "u" extension. Drop invalid parts.
  106. {in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
  107. {in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
  108. // We allow duplicate keys as the LDML spec does not explicitly prohibit it.
  109. // TODO: Consider eliminating duplicates and returning an error.
  110. {in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
  111. {in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
  112. {in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
  113. {in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
  114. // Not necessary to have changed here.
  115. {in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
  116. {in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
  117. {in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
  118. // invalid
  119. {in: "", lang: "und", invalid: true},
  120. {in: "-", lang: "und", invalid: true},
  121. {in: "x", lang: "und", invalid: true},
  122. {in: "x-", lang: "und", invalid: true},
  123. {in: "x--", lang: "und", invalid: true},
  124. {in: "a-a-b-c-d", lang: "und", invalid: true},
  125. {in: "en-", lang: "en", invalid: true},
  126. {in: "enne-", lang: "und", invalid: true},
  127. {in: "en.", lang: "und", invalid: true},
  128. {in: "en.-latn", lang: "und", invalid: true},
  129. {in: "en.-en", lang: "en", invalid: true},
  130. {in: "x-a-tooManyChars-c-d", ext: "x-a-c-d", invalid: true, changed: true},
  131. {in: "a-tooManyChars-c-d", lang: "und", invalid: true},
  132. // TODO: check key-value validity
  133. // { in: "en-u-cu-xd", lang: "en", ext: "u-cu-xd", invalid: true },
  134. {in: "en-t-abcd", lang: "en", invalid: true},
  135. {in: "en-Latn-US-en", lang: "en", script: "Latn", region: "US", invalid: true},
  136. // rewrites (more tests in TestGrandfathered)
  137. {in: "zh-min-nan", lang: "nan"},
  138. {in: "zh-yue", lang: "yue"},
  139. {in: "zh-xiang", lang: "hsn", rewrite: true},
  140. {in: "zh-guoyu", lang: "cmn", rewrite: true},
  141. {in: "iw", lang: "iw"},
  142. {in: "sgn-BE-FR", lang: "sfb", rewrite: true},
  143. {in: "i-klingon", lang: "tlh", rewrite: true},
  144. }
  145. for i, tt := range tests {
  146. tests[i].i = i
  147. if tt.extList != nil {
  148. tests[i].ext = strings.Join(tt.extList, "-")
  149. }
  150. if tt.ext != "" && tt.extList == nil {
  151. tests[i].extList = []string{tt.ext}
  152. }
  153. }
  154. return tests
  155. }
  156. // partChecks runs checks for each part by calling the function returned by f.
  157. func partChecks(t *testing.T, f func(*parseTest) (Tag, bool)) {
  158. for i, tt := range parseTests() {
  159. tag, skip := f(&tt)
  160. if skip {
  161. continue
  162. }
  163. if l, _ := language.ParseBase(tt.lang); l != tag.lang() {
  164. t.Errorf("%d: lang was %q; want %q", i, tag.lang(), l)
  165. }
  166. if sc, _ := language.ParseScript(tt.script); sc != tag.script() {
  167. t.Errorf("%d: script was %q; want %q", i, tag.script(), sc)
  168. }
  169. if r, _ := language.ParseRegion(tt.region); r != tag.region() {
  170. t.Errorf("%d: region was %q; want %q", i, tag.region(), r)
  171. }
  172. v := tag.tag().Variants()
  173. if v != "" {
  174. v = v[1:]
  175. }
  176. if v != tt.variants {
  177. t.Errorf("%d: variants was %q; want %q", i, v, tt.variants)
  178. }
  179. if e := strings.Join(tag.tag().Extensions(), "-"); e != tt.ext {
  180. t.Errorf("%d: extensions were %q; want %q", i, e, tt.ext)
  181. }
  182. }
  183. }
  184. func TestParse(t *testing.T) {
  185. partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
  186. id, _ = Raw.Parse(tt.in)
  187. return id, false
  188. })
  189. }
  190. func TestErrors(t *testing.T) {
  191. mkInvalid := func(s string) error {
  192. return language.NewValueError([]byte(s))
  193. }
  194. tests := []struct {
  195. in string
  196. out error
  197. }{
  198. // invalid subtags.
  199. {"ac", mkInvalid("ac")},
  200. {"AC", mkInvalid("ac")},
  201. {"aa-Uuuu", mkInvalid("Uuuu")},
  202. {"aa-AB", mkInvalid("AB")},
  203. // ill-formed wins over invalid.
  204. {"ac-u", errSyntax},
  205. {"ac-u-ca", errSyntax},
  206. {"ac-u-ca-co-pinyin", errSyntax},
  207. {"noob", errSyntax},
  208. }
  209. for _, tt := range tests {
  210. _, err := Parse(tt.in)
  211. if err != tt.out {
  212. t.Errorf("%s: was %q; want %q", tt.in, err, tt.out)
  213. }
  214. }
  215. }
  216. func TestCompose1(t *testing.T) {
  217. partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
  218. l, _ := ParseBase(tt.lang)
  219. s, _ := ParseScript(tt.script)
  220. r, _ := ParseRegion(tt.region)
  221. v := []Variant{}
  222. for _, x := range strings.Split(tt.variants, "-") {
  223. p, _ := ParseVariant(x)
  224. v = append(v, p)
  225. }
  226. e := []Extension{}
  227. for _, x := range tt.extList {
  228. p, _ := ParseExtension(x)
  229. e = append(e, p)
  230. }
  231. id, _ = Raw.Compose(l, s, r, v, e)
  232. return id, false
  233. })
  234. }
  235. func TestCompose2(t *testing.T) {
  236. partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
  237. l, _ := ParseBase(tt.lang)
  238. s, _ := ParseScript(tt.script)
  239. r, _ := ParseRegion(tt.region)
  240. p := []interface{}{l, s, r, s, r, l}
  241. for _, x := range strings.Split(tt.variants, "-") {
  242. if x != "" {
  243. v, _ := ParseVariant(x)
  244. p = append(p, v)
  245. }
  246. }
  247. for _, x := range tt.extList {
  248. e, _ := ParseExtension(x)
  249. p = append(p, e)
  250. }
  251. id, _ = Raw.Compose(p...)
  252. return id, false
  253. })
  254. }
  255. func TestCompose3(t *testing.T) {
  256. partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
  257. id, _ = Raw.Parse(tt.in)
  258. id, _ = Raw.Compose(id)
  259. return id, false
  260. })
  261. }
  262. func mk(s string) Tag {
  263. return Raw.Make(s)
  264. }
  265. func TestParseAcceptLanguage(t *testing.T) {
  266. type res struct {
  267. t Tag
  268. q float32
  269. }
  270. en := []res{{mk("en"), 1.0}}
  271. tests := []struct {
  272. out []res
  273. in string
  274. ok bool
  275. }{
  276. {en, "en", true},
  277. {en, " en", true},
  278. {en, "en ", true},
  279. {en, " en ", true},
  280. {en, "en,", true},
  281. {en, ",en", true},
  282. {en, ",,,en,,,", true},
  283. {en, ",en;q=1", true},
  284. // We allow an empty input, contrary to spec.
  285. {nil, "", true},
  286. {[]res{{mk("aa"), 1}}, "aa;", true}, // allow unspecified weight
  287. // errors
  288. {nil, ";", false},
  289. {nil, "$", false},
  290. {nil, "e;", false},
  291. {nil, "x;", false},
  292. {nil, "x", false},
  293. {nil, "ac", false}, // non-existing language
  294. {nil, "aa;q", false},
  295. {nil, "aa;q=", false},
  296. {nil, "aa;q=.", false},
  297. // odd fallbacks
  298. {
  299. []res{{mk("en"), 0.1}},
  300. " english ;q=.1",
  301. true,
  302. },
  303. {
  304. []res{{mk("it"), 1.0}, {mk("de"), 1.0}, {mk("fr"), 1.0}},
  305. " italian, deutsch, french",
  306. true,
  307. },
  308. // lists
  309. {
  310. []res{{mk("en"), 0.1}},
  311. "en;q=.1",
  312. true,
  313. },
  314. {
  315. []res{{mk("mul"), 1.0}},
  316. "*",
  317. true,
  318. },
  319. {
  320. []res{{mk("en"), 1.0}, {mk("de"), 1.0}},
  321. "en,de",
  322. true,
  323. },
  324. {
  325. []res{{mk("en"), 1.0}, {mk("de"), .5}},
  326. "en,de;q=0.5",
  327. true,
  328. },
  329. {
  330. []res{{mk("de"), 0.8}, {mk("en"), 0.5}},
  331. " en ; q = 0.5 , , de;q=0.8",
  332. true,
  333. },
  334. {
  335. []res{{mk("en"), 1.0}, {mk("de"), 1.0}, {mk("fr"), 1.0}, {mk("tlh"), 1.0}},
  336. "en,de,fr,i-klingon",
  337. true,
  338. },
  339. // sorting
  340. {
  341. []res{{mk("tlh"), 0.4}, {mk("de"), 0.2}, {mk("fr"), 0.2}, {mk("en"), 0.1}},
  342. "en;q=0.1,de;q=0.2,fr;q=0.2,i-klingon;q=0.4",
  343. true,
  344. },
  345. // dropping
  346. {
  347. []res{{mk("fr"), 0.2}, {mk("en"), 0.1}},
  348. "en;q=0.1,de;q=0,fr;q=0.2,i-klingon;q=0.0",
  349. true,
  350. },
  351. }
  352. for i, tt := range tests {
  353. tags, qs, e := ParseAcceptLanguage(tt.in)
  354. if e == nil != tt.ok {
  355. t.Errorf("%d:%s:err: was %v; want %v", i, tt.in, e == nil, tt.ok)
  356. }
  357. for j, tag := range tags {
  358. if out := tt.out[j]; !tag.equalTags(out.t) || qs[j] != out.q {
  359. t.Errorf("%d:%s: was %s, %1f; want %s, %1f", i, tt.in, tag, qs[j], out.t, out.q)
  360. break
  361. }
  362. }
  363. }
  364. }