|
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // +build ignore
- // Collation table generator.
- // Data read from the web.
- package main
- import (
- "archive/zip"
- "bufio"
- "bytes"
- "flag"
- "fmt"
- "io"
- "io/ioutil"
- "log"
- "os"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "unicode/utf8"
- "golang.org/x/text/collate"
- "golang.org/x/text/collate/build"
- "golang.org/x/text/internal/colltab"
- "golang.org/x/text/internal/gen"
- "golang.org/x/text/language"
- "golang.org/x/text/unicode/cldr"
- )
- var (
- test = flag.Bool("test", false,
- "test existing tables; can be used to compare web data with package data.")
- short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
- draft = flag.Bool("draft", false, `Use draft versions, when available.`)
- tags = flag.String("tags", "", "build tags to be included after +build directive")
- pkg = flag.String("package", "collate",
- "the name of the package in which the generated file is to be included")
- tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
- "comma-spearated list of tables to generate.")
- exclude = flagStringSet("exclude", "zh2", "",
- "comma-separated list of languages to exclude.")
- include = flagStringSet("include", "", "",
- "comma-separated list of languages to include. Include trumps exclude.")
- // TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
- // TODO: Not included: traditional (buggy for Bengali)
- types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
- "comma-separated list of types that should be included.")
- )
- // stringSet implements an ordered set based on a list. It implements flag.Value
- // to allow a set to be specified as a comma-separated list.
- type stringSet struct {
- s []string
- allowed *stringSet
- dirty bool // needs compaction if true
- all bool
- allowAll bool
- }
- func flagStringSet(name, def, allowed, usage string) *stringSet {
- ss := &stringSet{}
- if allowed != "" {
- usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
- ss.allowed = &stringSet{}
- failOnError(ss.allowed.Set(allowed))
- }
- ss.Set(def)
- flag.Var(ss, name, usage)
- return ss
- }
- func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
- ss := &stringSet{allowAll: true}
- if allowed == "" {
- flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
- } else {
- ss.allowed = &stringSet{}
- failOnError(ss.allowed.Set(allowed))
- flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
- }
- ss.Set(def)
- return ss
- }
- func (ss stringSet) Len() int {
- return len(ss.s)
- }
- func (ss stringSet) String() string {
- return strings.Join(ss.s, ",")
- }
- func (ss *stringSet) Set(s string) error {
- if ss.allowAll && s == "all" {
- ss.s = nil
- ss.all = true
- return nil
- }
- ss.s = ss.s[:0]
- for _, s := range strings.Split(s, ",") {
- if s := strings.TrimSpace(s); s != "" {
- if ss.allowed != nil && !ss.allowed.contains(s) {
- return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
- }
- ss.add(s)
- }
- }
- ss.compact()
- return nil
- }
- func (ss *stringSet) add(s string) {
- ss.s = append(ss.s, s)
- ss.dirty = true
- }
- func (ss *stringSet) values() []string {
- ss.compact()
- return ss.s
- }
- func (ss *stringSet) contains(s string) bool {
- if ss.all {
- return true
- }
- for _, v := range ss.s {
- if v == s {
- return true
- }
- }
- return false
- }
- func (ss *stringSet) compact() {
- if !ss.dirty {
- return
- }
- a := ss.s
- sort.Strings(a)
- k := 0
- for i := 1; i < len(a); i++ {
- if a[k] != a[i] {
- a[k+1] = a[i]
- k++
- }
- }
- ss.s = a[:k+1]
- ss.dirty = false
- }
- func skipLang(l string) bool {
- if include.Len() > 0 {
- return !include.contains(l)
- }
- return exclude.contains(l)
- }
- // altInclude returns a list of alternatives (for the LDML alt attribute)
- // in order of preference. An empty string in this list indicates the
- // default entry.
- func altInclude() []string {
- l := []string{}
- if *short {
- l = append(l, "short")
- }
- l = append(l, "")
- // TODO: handle draft using cldr.SetDraftLevel
- if *draft {
- l = append(l, "proposed")
- }
- return l
- }
// failOnError logs e and panics if e is not nil; it does nothing otherwise.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
- func openArchive() *zip.Reader {
- f := gen.OpenCLDRCoreZip()
- buffer, err := ioutil.ReadAll(f)
- f.Close()
- failOnError(err)
- archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
- failOnError(err)
- return archive
- }
- // parseUCA parses a Default Unicode Collation Element Table of the format
- // specified in https://www.unicode.org/reports/tr10/#File_Format.
- // It returns the variable top.
- func parseUCA(builder *build.Builder) {
- var r io.ReadCloser
- var err error
- for _, f := range openArchive().File {
- if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
- r, err = f.Open()
- }
- }
- if r == nil {
- log.Fatal("File allkeys_CLDR.txt not found in archive.")
- }
- failOnError(err)
- defer r.Close()
- scanner := bufio.NewScanner(r)
- colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
- for i := 1; scanner.Scan(); i++ {
- line := scanner.Text()
- if len(line) == 0 || line[0] == '#' {
- continue
- }
- if line[0] == '@' {
- // parse properties
- switch {
- case strings.HasPrefix(line[1:], "version "):
- a := strings.Split(line[1:], " ")
- if a[1] != gen.UnicodeVersion() {
- log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
- }
- case strings.HasPrefix(line[1:], "backwards "):
- log.Fatalf("%d: unsupported option backwards", i)
- default:
- log.Printf("%d: unknown option %s", i, line[1:])
- }
- } else {
- // parse entries
- part := strings.Split(line, " ; ")
- if len(part) != 2 {
- log.Fatalf("%d: production rule without ';': %v", i, line)
- }
- lhs := []rune{}
- for _, v := range strings.Split(part[0], " ") {
- if v == "" {
- continue
- }
- lhs = append(lhs, rune(convHex(i, v)))
- }
- var n int
- var vars []int
- rhs := [][]int{}
- for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
- n += len(m[0])
- elem := []int{}
- for _, h := range strings.Split(m[2], ".") {
- elem = append(elem, convHex(i, h))
- }
- if m[1] == "*" {
- vars = append(vars, i)
- }
- rhs = append(rhs, elem)
- }
- if len(part[1]) < n+3 || part[1][n+1] != '#' {
- log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
- }
- if *test {
- testInput.add(string(lhs))
- }
- failOnError(builder.Add(lhs, rhs, vars))
- }
- }
- if scanner.Err() != nil {
- log.Fatal(scanner.Err())
- }
- }
// convHex parses s as a hexadecimal number that fits in 32 bits and returns
// its value. On a parse error it terminates the program with a message that
// is prefixed with the given line number.
func convHex(line int, s string) int {
	value, err := strconv.ParseInt(s, 16, 32)
	if err != nil {
		log.Fatalf("%d: %v", line, err)
	}
	return int(value)
}
- var testInput = stringSet{}
- var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
- var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)
- var mainLocales = []string{}
// charSets holds a list of exemplar characters per category.
type charSets map[string][]string

// fprint writes p to w as the body of a Go [exN]string composite literal.
// Each non-empty category is written as an indexed element holding its
// characters joined by spaces; the index is the category's position in the
// fixed category list below.
func (p charSets) fprint(w io.Writer) {
	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}

	fmt.Fprintln(w, "[exN]string{")
	for idx := 0; idx < len(categories); idx++ {
		chars := p[categories[idx]]
		if len(chars) == 0 {
			continue
		}
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(chars, " "))
	}
	fmt.Fprintln(w, "\t},")
}

// localeChars maps a locale to its exemplar characters.
var localeChars = map[string]charSets{}
- const exemplarHeader = `
- type exemplarType int
- const (
- exCharacters exemplarType = iota
- exContractions
- exPunctuation
- exAuxiliary
- exCurrency
- exIndex
- exN
- )
- `
- func printExemplarCharacters(w io.Writer) {
- fmt.Fprintln(w, exemplarHeader)
- fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
- for _, loc := range mainLocales {
- fmt.Fprintf(w, "\t%q: ", loc)
- localeChars[loc].fprint(w)
- }
- fmt.Fprintln(w, "}")
- }
- func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
- r := gen.OpenCLDRCoreZip()
- data, err := d.DecodeZip(r)
- failOnError(err)
- return data
- }
- // parseMain parses XML files in the main directory of the CLDR core.zip file.
- func parseMain() {
- d := &cldr.Decoder{}
- d.SetDirFilter("main")
- d.SetSectionFilter("characters")
- data := decodeCLDR(d)
- for _, loc := range data.Locales() {
- x := data.RawLDML(loc)
- if skipLang(x.Identity.Language.Type) {
- continue
- }
- if x.Characters != nil {
- x, _ = data.LDML(loc)
- loc = language.Make(loc).String()
- for _, ec := range x.Characters.ExemplarCharacters {
- if ec.Draft != "" {
- continue
- }
- if _, ok := localeChars[loc]; !ok {
- mainLocales = append(mainLocales, loc)
- localeChars[loc] = make(charSets)
- }
- localeChars[loc][ec.Type] = parseCharacters(ec.Data())
- }
- }
- }
- }
// parseCharacters splits an exemplar character set such as "[a-c {ch} \-]"
// into its elements. Surrounding white space and one pair of enclosing
// brackets are dropped. Within the set:
//
//   - spaces separate elements and are not elements themselves,
//   - "{...}" denotes a character sequence that forms a single element,
//   - "x-y" denotes the range of characters from x through y, and
//   - a backslash makes the next character literal, so "\-" is a hyphen.
//
// An empty set yields an empty list. Malformed input (a space inside a
// sequence, a '-' that does not sit between two characters, or a trailing
// backslash) terminates the program.
func parseCharacters(chars string) []string {
	// parseSingle consumes one, possibly escaped, character from the
	// non-empty string s.
	parseSingle := func(s string) (r rune, tail string, escaped bool) {
		if s[0] == '\\' {
			if len(s) == 1 {
				log.Fatal("'\\' should be followed by a character")
			}
			s, escaped = s[1:], true
		}
		// Decode a whole rune so that escaped non-ASCII characters survive.
		r, sz := utf8.DecodeRuneInString(s)
		return r, s[sz:], escaped
	}

	chars = strings.TrimSpace(chars)
	if n := len(chars) - 1; n >= 1 && chars[0] == '[' && chars[n] == ']' {
		chars = chars[1:n]
	}

	list := []string{}
	// last is the most recent single character, which may start a range;
	// zero means there is none.
	var last rune
	for len(chars) > 0 {
		if chars[0] == '{' { // character sequence
			buf := []rune{}
			for chars = chars[1:]; len(chars) > 0; {
				var r rune
				r, chars, _ = parseSingle(chars)
				if r == '}' {
					break
				}
				if r == ' ' {
					log.Fatalf("space not supported in sequence %q", chars)
				}
				buf = append(buf, r)
			}
			list = append(list, string(buf))
			last = 0
			continue
		}

		// single character
		var r rune
		var escaped bool
		r, chars, escaped = parseSingle(chars)
		switch {
		case r == ' ':
			// Separator between elements.
		case r == '-' && !escaped:
			if last == 0 {
				log.Fatal("'-' should be preceded by a character")
			}
			if len(chars) == 0 {
				log.Fatal("'-' should be followed by a character")
			}
			var end rune
			end, chars, _ = parseSingle(chars)
			// The start of the range was already added when it was read.
			for c := last + 1; c <= end; c++ {
				list = append(list, string(c))
			}
			last = 0
		default:
			list = append(list, string(r))
			last = r
		}
	}
	return list
}
var (
	// fileRe matches the path of an XML file in a collation directory and
	// captures the file name without its extension.
	fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

	// typeMap translates legacy type keys to their BCP47 equivalent.
	typeMap = map[string]string{
		"traditional": "trad",
		"phonebook":   "phonebk",
	}
)
- // parseCollation parses XML files in the collation directory of the CLDR core.zip file.
- func parseCollation(b *build.Builder) {
- d := &cldr.Decoder{}
- d.SetDirFilter("collation")
- data := decodeCLDR(d)
- for _, loc := range data.Locales() {
- x, err := data.LDML(loc)
- failOnError(err)
- if skipLang(x.Identity.Language.Type) {
- continue
- }
- cs := x.Collations.Collation
- sl := cldr.MakeSlice(&cs)
- if len(types.s) == 0 {
- sl.SelectAnyOf("type", x.Collations.Default())
- } else if !types.all {
- sl.SelectAnyOf("type", types.s...)
- }
- sl.SelectOnePerGroup("alt", altInclude())
- for _, c := range cs {
- id, err := language.Parse(loc)
- if err != nil {
- fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
- continue
- }
- // Support both old- and new-style defaults.
- d := c.Type
- if x.Collations.DefaultCollation == nil {
- d = x.Collations.Default()
- } else {
- d = x.Collations.DefaultCollation.Data()
- }
- // We assume tables are being built either for search or collation,
- // but not both. For search the default is always "search".
- if d != c.Type && c.Type != "search" {
- typ := c.Type
- if len(c.Type) > 8 {
- typ = typeMap[c.Type]
- }
- id, err = id.SetTypeForKey("co", typ)
- failOnError(err)
- }
- t := b.Tailoring(id)
- c.Process(processor{t})
- }
- }
- }
- type processor struct {
- t *build.Tailoring
- }
- func (p processor) Reset(anchor string, before int) (err error) {
- if before != 0 {
- err = p.t.SetAnchorBefore(anchor)
- } else {
- err = p.t.SetAnchor(anchor)
- }
- failOnError(err)
- return nil
- }
- func (p processor) Insert(level int, str, context, extend string) error {
- str = context + str
- if *test {
- testInput.add(str)
- }
- // TODO: mimic bug in old maketables: remove.
- err := p.t.Insert(colltab.Level(level-1), str, context+extend)
- failOnError(err)
- return nil
- }
- func (p processor) Index(id string) {
- }
- func testCollator(c *collate.Collator) {
- c0 := collate.New(language.Und)
- // iterator over all characters for all locales and check
- // whether Key is equal.
- buf := collate.Buffer{}
- // Add all common and not too uncommon runes to the test set.
- for i := rune(0); i < 0x30000; i++ {
- testInput.add(string(i))
- }
- for i := rune(0xE0000); i < 0xF0000; i++ {
- testInput.add(string(i))
- }
- for _, str := range testInput.values() {
- k0 := c0.KeyFromString(&buf, str)
- k := c.KeyFromString(&buf, str)
- if !bytes.Equal(k0, k) {
- failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
- }
- buf.Reset()
- }
- fmt.Println("PASS")
- }
- func main() {
- gen.Init()
- b := build.NewBuilder()
- parseUCA(b)
- if tables.contains("chars") {
- parseMain()
- }
- parseCollation(b)
- c, err := b.Build()
- failOnError(err)
- if *test {
- testCollator(collate.NewFromTable(c))
- } else {
- w := &bytes.Buffer{}
- gen.WriteUnicodeVersion(w)
- gen.WriteCLDRVersion(w)
- if tables.contains("collate") {
- _, err = b.Print(w)
- failOnError(err)
- }
- if tables.contains("chars") {
- printExemplarCharacters(w)
- }
- gen.WriteGoFile("tables.go", *pkg, w.Bytes())
- }
- }
|