123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353 |
- // Copyright 2017 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // Package cldrtree builds and generates a CLDR index file, including all
- // inheritance.
- //
- package cldrtree
- //go:generate go test -gen
- // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR
- // data each branch in the tree is indicated by either an element name or an
- // attribute value. A Tree does not distinguish between these two cases, but
- // rather assumes that all branches can be accessed by an enum with a compact
- // range of positive integer values starting from 0.
- //
- // Each Tree consists of three parts:
- // - a slice mapping compact language identifiers to an offset into a set of
- // indices,
- // - a set of indices, stored as a large blob of uint16 values that encode
- // the actual tree structure of data, and
- // - a set of buckets that each holds a collection of strings.
- // each of which is explained in more detail below.
- //
- //
- // Tree lookup
- // A tree lookup is done by providing a locale and a "path", which is a
- // sequence of enum values. The search starts with getting the index for the
- // given locale and then incrementally jumping into the index using the path
- // values. If an element cannot be found in the index, the search starts anew
- // for the locale's parent locale. The path may change during lookup by means
- // of aliasing, described below.
- //
- // Buckets
- // Buckets hold the actual string data of the leaf values of the CLDR tree.
- // This data is stored in buckets, rather than one large string, for multiple
- // reasons:
- // - it allows representing leaf values more compactly, by storing all leaf
- // values in a single bucket and then needing only needing a uint16 to index
- // into this bucket for all leaf values,
- // - (TBD) allow multiple trees to share subsets of buckets, mostly to allow
- // linking in a smaller amount of data if only a subset of the buckets is
- // needed,
- // - to be nice to go fmt and the compiler.
- //
- // indices
- // An index is a slice of uint16 for which the values are interpreted in one of
- // two ways: as a node or a set of leaf values.
- // A set of leaf values has the following form:
- // <max_size>, <bucket>, <offset>...
- // max_size indicates the maximum enum value for which an offset is defined.
- // An offset value of 0xFFFF (missingValue) also indicates an undefined value.
- // If defined offset indicates the offset within the given bucket of the string.
- // A node value has the following form:
- // <max_size>, <offset_or_alias>...
- // max_size indicates the maximum value for which an offset is defined.
- // A missing offset may also be indicated with 0. If the high bit (0x8000, or
- // inheritMask) is not set, the offset points to the offset within the index
- // for the current locale.
- // An offset with high bit set is an alias. In this case the uint16 has the form
- // bits:
- // 15: 1
- // 14-12: negative offset into path relative to current position
- // 0-11: new enum value for path element.
- // On encountering an alias, the path is modified accordingly and the lookup is
- // restarted for the given locale.
- import (
- "fmt"
- "reflect"
- "regexp"
- "strings"
- "unicode/utf8"
- "golang.org/x/text/internal/gen"
- "golang.org/x/text/language"
- "golang.org/x/text/unicode/cldr"
- )
- // TODO:
- // - allow two Trees to share the same set of buckets.
- // A Builder allows storing CLDR data in compact form.
- type Builder struct {
- table []string
- rootMeta *metaData
- locales []locale
- strToBucket map[string]stringInfo
- buckets [][]byte
- enums []*enum
- err error
- // Stats
- size int
- sizeAll int
- bucketWaste int
- }
- const (
- maxBucketSize = 8 * 1024 // 8K
- maxStrlen = 254 // allow 0xFF sentinel
- )
- func (b *Builder) setError(err error) {
- if b.err == nil {
- b.err = err
- }
- }
- func (b *Builder) addString(data string) stringInfo {
- data = b.makeString(data)
- info, ok := b.strToBucket[data]
- if !ok {
- b.size += len(data)
- x := len(b.buckets) - 1
- bucket := b.buckets[x]
- if len(bucket)+len(data) < maxBucketSize {
- info.bucket = uint16(x)
- info.bucketPos = uint16(len(bucket))
- b.buckets[x] = append(bucket, data...)
- } else {
- info.bucket = uint16(len(b.buckets))
- info.bucketPos = 0
- b.buckets = append(b.buckets, []byte(data))
- }
- b.strToBucket[data] = info
- }
- return info
- }
- func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo {
- data = b.makeString(data)
- info, ok := b.strToBucket[data]
- if !ok || info.bucket != bucket {
- if ok {
- b.bucketWaste += len(data)
- }
- b.size += len(data)
- bk := b.buckets[bucket]
- info.bucket = bucket
- info.bucketPos = uint16(len(bk))
- b.buckets[bucket] = append(bk, data...)
- b.strToBucket[data] = info
- }
- return info
- }
- func (b *Builder) makeString(data string) string {
- if len(data) > maxStrlen {
- b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen))
- data = data[:maxStrlen]
- for i := len(data) - 1; i > len(data)-4; i-- {
- if utf8.RuneStart(data[i]) {
- data = data[:i]
- break
- }
- }
- }
- data = string([]byte{byte(len(data))}) + data
- b.sizeAll += len(data)
- return data
- }
- type stringInfo struct {
- bufferPos uint32
- bucket uint16
- bucketPos uint16
- }
- // New creates a new Builder.
- func New(tableName string) *Builder {
- b := &Builder{
- strToBucket: map[string]stringInfo{},
- buckets: [][]byte{nil}, // initialize with first bucket.
- }
- b.rootMeta = &metaData{
- b: b,
- typeInfo: &typeInfo{},
- }
- return b
- }
- // Gen writes all the tables and types for the collected data.
- func (b *Builder) Gen(w *gen.CodeWriter) error {
- t, err := build(b)
- if err != nil {
- return err
- }
- return generate(b, t, w)
- }
- // GenTestData generates tables useful for testing data generated with Gen.
- func (b *Builder) GenTestData(w *gen.CodeWriter) error {
- return generateTestData(b, w)
- }
- type locale struct {
- tag language.Tag
- root *Index
- }
- // Locale creates an index for the given locale.
- func (b *Builder) Locale(t language.Tag) *Index {
- index := &Index{
- meta: b.rootMeta,
- }
- b.locales = append(b.locales, locale{tag: t, root: index})
- return index
- }
- // An Index holds a map of either leaf values or other indices.
- type Index struct {
- meta *metaData
- subIndex []*Index
- values []keyValue
- }
- func (i *Index) setError(err error) { i.meta.b.setError(err) }
- type keyValue struct {
- key enumIndex
- value stringInfo
- }
- // Element is a CLDR XML element.
- type Element interface {
- GetCommon() *cldr.Common
- }
- // Index creates a subindex where the type and enum values are not shared
- // with siblings by default. The name is derived from the elem. If elem is
- // an alias reference, the alias will be resolved and linked. If elem is nil
- // Index returns nil.
- func (i *Index) Index(elem Element, opt ...Option) *Index {
- if elem == nil || reflect.ValueOf(elem).IsNil() {
- return nil
- }
- c := elem.GetCommon()
- o := &options{
- parent: i,
- name: c.GetCommon().Element(),
- }
- o.fill(opt)
- o.setAlias(elem)
- return i.subIndexForKey(o)
- }
- // IndexWithName is like Section but derives the name from the given name.
- func (i *Index) IndexWithName(name string, opt ...Option) *Index {
- o := &options{parent: i, name: name}
- o.fill(opt)
- return i.subIndexForKey(o)
- }
- // IndexFromType creates a subindex the value of tye type attribute as key. It
- // will also configure the Index to share the enumeration values with all
- // sibling values. If elem is an alias, it will be resolved and linked.
- func (i *Index) IndexFromType(elem Element, opts ...Option) *Index {
- o := &options{
- parent: i,
- name: elem.GetCommon().Type,
- }
- o.fill(opts)
- o.setAlias(elem)
- useSharedType()(o)
- return i.subIndexForKey(o)
- }
- // IndexFromAlt creates a subindex the value of tye alt attribute as key. It
- // will also configure the Index to share the enumeration values with all
- // sibling values. If elem is an alias, it will be resolved and linked.
- func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index {
- o := &options{
- parent: i,
- name: elem.GetCommon().Alt,
- }
- o.fill(opts)
- o.setAlias(elem)
- useSharedType()(o)
- return i.subIndexForKey(o)
- }
- func (i *Index) subIndexForKey(opts *options) *Index {
- key := opts.name
- if len(i.values) > 0 {
- panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key))
- }
- meta := i.meta.sub(key, opts)
- for _, x := range i.subIndex {
- if x.meta == meta {
- return x
- }
- }
- if alias := opts.alias; alias != nil {
- if a := alias.GetCommon().Alias; a != nil {
- if a.Source != "locale" {
- i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path))
- }
- if meta.inheritOffset < 0 {
- i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path))
- }
- path := a.Path
- for ; strings.HasPrefix(path, "../"); path = path[len("../"):] {
- meta.inheritOffset--
- }
- m := aliasRe.FindStringSubmatch(path)
- if m == nil {
- i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path))
- } else {
- key := m[4]
- if key == "" {
- key = m[1]
- }
- meta.inheritIndex = key
- }
- }
- }
- x := &Index{meta: meta}
- i.subIndex = append(i.subIndex, x)
- return x
- }
- var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
- // SetValue sets the value, the data from a CLDR XML element, for the given key.
- func (i *Index) SetValue(key string, value Element, opt ...Option) {
- if len(i.subIndex) > 0 {
- panic(fmt.Errorf("adding value for key %q when index already exists", key))
- }
- o := &options{parent: i}
- o.fill(opt)
- c := value.GetCommon()
- if c.Alias != nil {
- i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path))
- }
- i.setValue(key, c.Data(), o)
- }
- func (i *Index) setValue(key, data string, o *options) {
- index, _ := i.meta.typeInfo.lookupSubtype(key, o)
- kv := keyValue{key: index}
- if len(i.values) > 0 {
- // Add string to the same bucket as the other values.
- bucket := i.values[0].value.bucket
- kv.value = i.meta.b.addStringToBucket(data, bucket)
- } else {
- kv.value = i.meta.b.addString(data)
- }
- i.values = append(i.values, kv)
- }
|