maketables.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. // Collation table generator.
  7. // Data read from the web.
  8. package main
  9. import (
  10. "archive/zip"
  11. "bufio"
  12. "bytes"
  13. "flag"
  14. "fmt"
  15. "io"
  16. "io/ioutil"
  17. "log"
  18. "os"
  19. "regexp"
  20. "sort"
  21. "strconv"
  22. "strings"
  23. "unicode/utf8"
  24. "golang.org/x/text/collate"
  25. "golang.org/x/text/collate/build"
  26. "golang.org/x/text/internal/colltab"
  27. "golang.org/x/text/internal/gen"
  28. "golang.org/x/text/language"
  29. "golang.org/x/text/unicode/cldr"
  30. )
  31. var (
  32. test = flag.Bool("test", false,
  33. "test existing tables; can be used to compare web data with package data.")
  34. short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
  35. draft = flag.Bool("draft", false, `Use draft versions, when available.`)
  36. tags = flag.String("tags", "", "build tags to be included after +build directive")
  37. pkg = flag.String("package", "collate",
  38. "the name of the package in which the generated file is to be included")
  39. tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
  40. "comma-spearated list of tables to generate.")
  41. exclude = flagStringSet("exclude", "zh2", "",
  42. "comma-separated list of languages to exclude.")
  43. include = flagStringSet("include", "", "",
  44. "comma-separated list of languages to include. Include trumps exclude.")
  45. // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
  46. // TODO: Not included: traditional (buggy for Bengali)
  47. types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
  48. "comma-separated list of types that should be included.")
  49. )
  50. // stringSet implements an ordered set based on a list. It implements flag.Value
  51. // to allow a set to be specified as a comma-separated list.
  52. type stringSet struct {
  53. s []string
  54. allowed *stringSet
  55. dirty bool // needs compaction if true
  56. all bool
  57. allowAll bool
  58. }
  59. func flagStringSet(name, def, allowed, usage string) *stringSet {
  60. ss := &stringSet{}
  61. if allowed != "" {
  62. usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
  63. ss.allowed = &stringSet{}
  64. failOnError(ss.allowed.Set(allowed))
  65. }
  66. ss.Set(def)
  67. flag.Var(ss, name, usage)
  68. return ss
  69. }
  70. func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
  71. ss := &stringSet{allowAll: true}
  72. if allowed == "" {
  73. flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
  74. } else {
  75. ss.allowed = &stringSet{}
  76. failOnError(ss.allowed.Set(allowed))
  77. flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
  78. }
  79. ss.Set(def)
  80. return ss
  81. }
  82. func (ss stringSet) Len() int {
  83. return len(ss.s)
  84. }
  85. func (ss stringSet) String() string {
  86. return strings.Join(ss.s, ",")
  87. }
  88. func (ss *stringSet) Set(s string) error {
  89. if ss.allowAll && s == "all" {
  90. ss.s = nil
  91. ss.all = true
  92. return nil
  93. }
  94. ss.s = ss.s[:0]
  95. for _, s := range strings.Split(s, ",") {
  96. if s := strings.TrimSpace(s); s != "" {
  97. if ss.allowed != nil && !ss.allowed.contains(s) {
  98. return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
  99. }
  100. ss.add(s)
  101. }
  102. }
  103. ss.compact()
  104. return nil
  105. }
  106. func (ss *stringSet) add(s string) {
  107. ss.s = append(ss.s, s)
  108. ss.dirty = true
  109. }
  110. func (ss *stringSet) values() []string {
  111. ss.compact()
  112. return ss.s
  113. }
  114. func (ss *stringSet) contains(s string) bool {
  115. if ss.all {
  116. return true
  117. }
  118. for _, v := range ss.s {
  119. if v == s {
  120. return true
  121. }
  122. }
  123. return false
  124. }
  125. func (ss *stringSet) compact() {
  126. if !ss.dirty {
  127. return
  128. }
  129. a := ss.s
  130. sort.Strings(a)
  131. k := 0
  132. for i := 1; i < len(a); i++ {
  133. if a[k] != a[i] {
  134. a[k+1] = a[i]
  135. k++
  136. }
  137. }
  138. ss.s = a[:k+1]
  139. ss.dirty = false
  140. }
  141. func skipLang(l string) bool {
  142. if include.Len() > 0 {
  143. return !include.contains(l)
  144. }
  145. return exclude.contains(l)
  146. }
  147. // altInclude returns a list of alternatives (for the LDML alt attribute)
  148. // in order of preference. An empty string in this list indicates the
  149. // default entry.
  150. func altInclude() []string {
  151. l := []string{}
  152. if *short {
  153. l = append(l, "short")
  154. }
  155. l = append(l, "")
  156. // TODO: handle draft using cldr.SetDraftLevel
  157. if *draft {
  158. l = append(l, "proposed")
  159. }
  160. return l
  161. }
  162. func failOnError(e error) {
  163. if e != nil {
  164. log.Panic(e)
  165. }
  166. }
  167. func openArchive() *zip.Reader {
  168. f := gen.OpenCLDRCoreZip()
  169. buffer, err := ioutil.ReadAll(f)
  170. f.Close()
  171. failOnError(err)
  172. archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
  173. failOnError(err)
  174. return archive
  175. }
  176. // parseUCA parses a Default Unicode Collation Element Table of the format
  177. // specified in https://www.unicode.org/reports/tr10/#File_Format.
  178. // It returns the variable top.
  179. func parseUCA(builder *build.Builder) {
  180. var r io.ReadCloser
  181. var err error
  182. for _, f := range openArchive().File {
  183. if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
  184. r, err = f.Open()
  185. }
  186. }
  187. if r == nil {
  188. log.Fatal("File allkeys_CLDR.txt not found in archive.")
  189. }
  190. failOnError(err)
  191. defer r.Close()
  192. scanner := bufio.NewScanner(r)
  193. colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
  194. for i := 1; scanner.Scan(); i++ {
  195. line := scanner.Text()
  196. if len(line) == 0 || line[0] == '#' {
  197. continue
  198. }
  199. if line[0] == '@' {
  200. // parse properties
  201. switch {
  202. case strings.HasPrefix(line[1:], "version "):
  203. a := strings.Split(line[1:], " ")
  204. if a[1] != gen.UnicodeVersion() {
  205. log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
  206. }
  207. case strings.HasPrefix(line[1:], "backwards "):
  208. log.Fatalf("%d: unsupported option backwards", i)
  209. default:
  210. log.Printf("%d: unknown option %s", i, line[1:])
  211. }
  212. } else {
  213. // parse entries
  214. part := strings.Split(line, " ; ")
  215. if len(part) != 2 {
  216. log.Fatalf("%d: production rule without ';': %v", i, line)
  217. }
  218. lhs := []rune{}
  219. for _, v := range strings.Split(part[0], " ") {
  220. if v == "" {
  221. continue
  222. }
  223. lhs = append(lhs, rune(convHex(i, v)))
  224. }
  225. var n int
  226. var vars []int
  227. rhs := [][]int{}
  228. for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
  229. n += len(m[0])
  230. elem := []int{}
  231. for _, h := range strings.Split(m[2], ".") {
  232. elem = append(elem, convHex(i, h))
  233. }
  234. if m[1] == "*" {
  235. vars = append(vars, i)
  236. }
  237. rhs = append(rhs, elem)
  238. }
  239. if len(part[1]) < n+3 || part[1][n+1] != '#' {
  240. log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
  241. }
  242. if *test {
  243. testInput.add(string(lhs))
  244. }
  245. failOnError(builder.Add(lhs, rhs, vars))
  246. }
  247. }
  248. if scanner.Err() != nil {
  249. log.Fatal(scanner.Err())
  250. }
  251. }
  252. func convHex(line int, s string) int {
  253. r, e := strconv.ParseInt(s, 16, 32)
  254. if e != nil {
  255. log.Fatalf("%d: %v", line, e)
  256. }
  257. return int(r)
  258. }
  259. var testInput = stringSet{}
  260. var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
  261. var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)
  262. var mainLocales = []string{}
  263. // charsets holds a list of exemplar characters per category.
  264. type charSets map[string][]string
  265. func (p charSets) fprint(w io.Writer) {
  266. fmt.Fprintln(w, "[exN]string{")
  267. for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} {
  268. if set := p[k]; len(set) != 0 {
  269. fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " "))
  270. }
  271. }
  272. fmt.Fprintln(w, "\t},")
  273. }
  274. var localeChars = make(map[string]charSets)
  275. const exemplarHeader = `
  276. type exemplarType int
  277. const (
  278. exCharacters exemplarType = iota
  279. exContractions
  280. exPunctuation
  281. exAuxiliary
  282. exCurrency
  283. exIndex
  284. exN
  285. )
  286. `
  287. func printExemplarCharacters(w io.Writer) {
  288. fmt.Fprintln(w, exemplarHeader)
  289. fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
  290. for _, loc := range mainLocales {
  291. fmt.Fprintf(w, "\t%q: ", loc)
  292. localeChars[loc].fprint(w)
  293. }
  294. fmt.Fprintln(w, "}")
  295. }
  296. func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
  297. r := gen.OpenCLDRCoreZip()
  298. data, err := d.DecodeZip(r)
  299. failOnError(err)
  300. return data
  301. }
  302. // parseMain parses XML files in the main directory of the CLDR core.zip file.
  303. func parseMain() {
  304. d := &cldr.Decoder{}
  305. d.SetDirFilter("main")
  306. d.SetSectionFilter("characters")
  307. data := decodeCLDR(d)
  308. for _, loc := range data.Locales() {
  309. x := data.RawLDML(loc)
  310. if skipLang(x.Identity.Language.Type) {
  311. continue
  312. }
  313. if x.Characters != nil {
  314. x, _ = data.LDML(loc)
  315. loc = language.Make(loc).String()
  316. for _, ec := range x.Characters.ExemplarCharacters {
  317. if ec.Draft != "" {
  318. continue
  319. }
  320. if _, ok := localeChars[loc]; !ok {
  321. mainLocales = append(mainLocales, loc)
  322. localeChars[loc] = make(charSets)
  323. }
  324. localeChars[loc][ec.Type] = parseCharacters(ec.Data())
  325. }
  326. }
  327. }
  328. }
  329. func parseCharacters(chars string) []string {
  330. parseSingle := func(s string) (r rune, tail string, escaped bool) {
  331. if s[0] == '\\' {
  332. return rune(s[1]), s[2:], true
  333. }
  334. r, sz := utf8.DecodeRuneInString(s)
  335. return r, s[sz:], false
  336. }
  337. chars = strings.TrimSpace(chars)
  338. if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' {
  339. chars = chars[1:n]
  340. }
  341. list := []string{}
  342. var r, last, end rune
  343. for len(chars) > 0 {
  344. if chars[0] == '{' { // character sequence
  345. buf := []rune{}
  346. for chars = chars[1:]; len(chars) > 0; {
  347. r, chars, _ = parseSingle(chars)
  348. if r == '}' {
  349. break
  350. }
  351. if r == ' ' {
  352. log.Fatalf("space not supported in sequence %q", chars)
  353. }
  354. buf = append(buf, r)
  355. }
  356. list = append(list, string(buf))
  357. last = 0
  358. } else { // single character
  359. escaped := false
  360. r, chars, escaped = parseSingle(chars)
  361. if r != ' ' {
  362. if r == '-' && !escaped {
  363. if last == 0 {
  364. log.Fatal("'-' should be preceded by a character")
  365. }
  366. end, chars, _ = parseSingle(chars)
  367. for ; last <= end; last++ {
  368. list = append(list, string(last))
  369. }
  370. last = 0
  371. } else {
  372. list = append(list, string(r))
  373. last = r
  374. }
  375. }
  376. }
  377. }
  378. return list
  379. }
  380. var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)
  381. // typeMap translates legacy type keys to their BCP47 equivalent.
  382. var typeMap = map[string]string{
  383. "phonebook": "phonebk",
  384. "traditional": "trad",
  385. }
  386. // parseCollation parses XML files in the collation directory of the CLDR core.zip file.
  387. func parseCollation(b *build.Builder) {
  388. d := &cldr.Decoder{}
  389. d.SetDirFilter("collation")
  390. data := decodeCLDR(d)
  391. for _, loc := range data.Locales() {
  392. x, err := data.LDML(loc)
  393. failOnError(err)
  394. if skipLang(x.Identity.Language.Type) {
  395. continue
  396. }
  397. cs := x.Collations.Collation
  398. sl := cldr.MakeSlice(&cs)
  399. if len(types.s) == 0 {
  400. sl.SelectAnyOf("type", x.Collations.Default())
  401. } else if !types.all {
  402. sl.SelectAnyOf("type", types.s...)
  403. }
  404. sl.SelectOnePerGroup("alt", altInclude())
  405. for _, c := range cs {
  406. id, err := language.Parse(loc)
  407. if err != nil {
  408. fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
  409. continue
  410. }
  411. // Support both old- and new-style defaults.
  412. d := c.Type
  413. if x.Collations.DefaultCollation == nil {
  414. d = x.Collations.Default()
  415. } else {
  416. d = x.Collations.DefaultCollation.Data()
  417. }
  418. // We assume tables are being built either for search or collation,
  419. // but not both. For search the default is always "search".
  420. if d != c.Type && c.Type != "search" {
  421. typ := c.Type
  422. if len(c.Type) > 8 {
  423. typ = typeMap[c.Type]
  424. }
  425. id, err = id.SetTypeForKey("co", typ)
  426. failOnError(err)
  427. }
  428. t := b.Tailoring(id)
  429. c.Process(processor{t})
  430. }
  431. }
  432. }
  433. type processor struct {
  434. t *build.Tailoring
  435. }
  436. func (p processor) Reset(anchor string, before int) (err error) {
  437. if before != 0 {
  438. err = p.t.SetAnchorBefore(anchor)
  439. } else {
  440. err = p.t.SetAnchor(anchor)
  441. }
  442. failOnError(err)
  443. return nil
  444. }
  445. func (p processor) Insert(level int, str, context, extend string) error {
  446. str = context + str
  447. if *test {
  448. testInput.add(str)
  449. }
  450. // TODO: mimic bug in old maketables: remove.
  451. err := p.t.Insert(colltab.Level(level-1), str, context+extend)
  452. failOnError(err)
  453. return nil
  454. }
  455. func (p processor) Index(id string) {
  456. }
  457. func testCollator(c *collate.Collator) {
  458. c0 := collate.New(language.Und)
  459. // iterator over all characters for all locales and check
  460. // whether Key is equal.
  461. buf := collate.Buffer{}
  462. // Add all common and not too uncommon runes to the test set.
  463. for i := rune(0); i < 0x30000; i++ {
  464. testInput.add(string(i))
  465. }
  466. for i := rune(0xE0000); i < 0xF0000; i++ {
  467. testInput.add(string(i))
  468. }
  469. for _, str := range testInput.values() {
  470. k0 := c0.KeyFromString(&buf, str)
  471. k := c.KeyFromString(&buf, str)
  472. if !bytes.Equal(k0, k) {
  473. failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
  474. }
  475. buf.Reset()
  476. }
  477. fmt.Println("PASS")
  478. }
  479. func main() {
  480. gen.Init()
  481. b := build.NewBuilder()
  482. parseUCA(b)
  483. if tables.contains("chars") {
  484. parseMain()
  485. }
  486. parseCollation(b)
  487. c, err := b.Build()
  488. failOnError(err)
  489. if *test {
  490. testCollator(collate.NewFromTable(c))
  491. } else {
  492. w := &bytes.Buffer{}
  493. gen.WriteUnicodeVersion(w)
  494. gen.WriteCLDRVersion(w)
  495. if tables.contains("collate") {
  496. _, err = b.Print(w)
  497. failOnError(err)
  498. }
  499. if tables.contains("chars") {
  500. printExemplarCharacters(w)
  501. }
  502. gen.WriteGoFile("tables.go", *pkg, w.Bytes())
  503. }
  504. }