cldrtree.go 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. // Copyright 2017 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package cldrtree builds and generates a CLDR index file, including all
  5. // inheritance.
  6. //
  7. package cldrtree
  8. //go:generate go test -gen
  9. // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR
  10. // data each branch in the tree is indicated by either an element name or an
  11. // attribute value. A Tree does not distinguish between these two cases, but
  12. // rather assumes that all branches can be accessed by an enum with a compact
  13. // range of positive integer values starting from 0.
  14. //
  15. // Each Tree consists of three parts:
  16. // - a slice mapping compact language identifiers to an offset into a set of
  17. // indices,
  18. // - a set of indices, stored as a large blob of uint16 values that encode
  19. // the actual tree structure of data, and
  20. // - a set of buckets that each holds a collection of strings.
  21. // each of which is explained in more detail below.
  22. //
  23. //
  24. // Tree lookup
  25. // A tree lookup is done by providing a locale and a "path", which is a
  26. // sequence of enum values. The search starts with getting the index for the
  27. // given locale and then incrementally jumping into the index using the path
  28. // values. If an element cannot be found in the index, the search starts anew
  29. // for the locale's parent locale. The path may change during lookup by means
  30. // of aliasing, described below.
  31. //
  32. // Buckets
  33. // Buckets hold the actual string data of the leaf values of the CLDR tree.
  34. // This data is stored in buckets, rather than one large string, for multiple
  35. // reasons:
  36. // - it allows representing leaf values more compactly, by storing all leaf
  37. // values in a single bucket and then needing only needing a uint16 to index
  38. // into this bucket for all leaf values,
  39. // - (TBD) allow multiple trees to share subsets of buckets, mostly to allow
  40. // linking in a smaller amount of data if only a subset of the buckets is
  41. // needed,
  42. // - to be nice to go fmt and the compiler.
  43. //
  44. // indices
  45. // An index is a slice of uint16 for which the values are interpreted in one of
  46. // two ways: as a node or a set of leaf values.
  47. // A set of leaf values has the following form:
  48. // <max_size>, <bucket>, <offset>...
  49. // max_size indicates the maximum enum value for which an offset is defined.
  50. // An offset value of 0xFFFF (missingValue) also indicates an undefined value.
  51. // If defined offset indicates the offset within the given bucket of the string.
  52. // A node value has the following form:
  53. // <max_size>, <offset_or_alias>...
  54. // max_size indicates the maximum value for which an offset is defined.
  55. // A missing offset may also be indicated with 0. If the high bit (0x8000, or
  56. // inheritMask) is not set, the offset points to the offset within the index
  57. // for the current locale.
  58. // An offset with high bit set is an alias. In this case the uint16 has the form
  59. // bits:
  60. // 15: 1
  61. // 14-12: negative offset into path relative to current position
  62. // 0-11: new enum value for path element.
  63. // On encountering an alias, the path is modified accordingly and the lookup is
  64. // restarted for the given locale.
  65. import (
  66. "fmt"
  67. "reflect"
  68. "regexp"
  69. "strings"
  70. "unicode/utf8"
  71. "golang.org/x/text/internal/gen"
  72. "golang.org/x/text/language"
  73. "golang.org/x/text/unicode/cldr"
  74. )
  75. // TODO:
  76. // - allow two Trees to share the same set of buckets.
  77. // A Builder allows storing CLDR data in compact form.
  78. type Builder struct {
  79. table []string
  80. rootMeta *metaData
  81. locales []locale
  82. strToBucket map[string]stringInfo
  83. buckets [][]byte
  84. enums []*enum
  85. err error
  86. // Stats
  87. size int
  88. sizeAll int
  89. bucketWaste int
  90. }
  91. const (
  92. maxBucketSize = 8 * 1024 // 8K
  93. maxStrlen = 254 // allow 0xFF sentinel
  94. )
  95. func (b *Builder) setError(err error) {
  96. if b.err == nil {
  97. b.err = err
  98. }
  99. }
  100. func (b *Builder) addString(data string) stringInfo {
  101. data = b.makeString(data)
  102. info, ok := b.strToBucket[data]
  103. if !ok {
  104. b.size += len(data)
  105. x := len(b.buckets) - 1
  106. bucket := b.buckets[x]
  107. if len(bucket)+len(data) < maxBucketSize {
  108. info.bucket = uint16(x)
  109. info.bucketPos = uint16(len(bucket))
  110. b.buckets[x] = append(bucket, data...)
  111. } else {
  112. info.bucket = uint16(len(b.buckets))
  113. info.bucketPos = 0
  114. b.buckets = append(b.buckets, []byte(data))
  115. }
  116. b.strToBucket[data] = info
  117. }
  118. return info
  119. }
  120. func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo {
  121. data = b.makeString(data)
  122. info, ok := b.strToBucket[data]
  123. if !ok || info.bucket != bucket {
  124. if ok {
  125. b.bucketWaste += len(data)
  126. }
  127. b.size += len(data)
  128. bk := b.buckets[bucket]
  129. info.bucket = bucket
  130. info.bucketPos = uint16(len(bk))
  131. b.buckets[bucket] = append(bk, data...)
  132. b.strToBucket[data] = info
  133. }
  134. return info
  135. }
  136. func (b *Builder) makeString(data string) string {
  137. if len(data) > maxStrlen {
  138. b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen))
  139. data = data[:maxStrlen]
  140. for i := len(data) - 1; i > len(data)-4; i-- {
  141. if utf8.RuneStart(data[i]) {
  142. data = data[:i]
  143. break
  144. }
  145. }
  146. }
  147. data = string([]byte{byte(len(data))}) + data
  148. b.sizeAll += len(data)
  149. return data
  150. }
  151. type stringInfo struct {
  152. bufferPos uint32
  153. bucket uint16
  154. bucketPos uint16
  155. }
  156. // New creates a new Builder.
  157. func New(tableName string) *Builder {
  158. b := &Builder{
  159. strToBucket: map[string]stringInfo{},
  160. buckets: [][]byte{nil}, // initialize with first bucket.
  161. }
  162. b.rootMeta = &metaData{
  163. b: b,
  164. typeInfo: &typeInfo{},
  165. }
  166. return b
  167. }
  168. // Gen writes all the tables and types for the collected data.
  169. func (b *Builder) Gen(w *gen.CodeWriter) error {
  170. t, err := build(b)
  171. if err != nil {
  172. return err
  173. }
  174. return generate(b, t, w)
  175. }
  176. // GenTestData generates tables useful for testing data generated with Gen.
  177. func (b *Builder) GenTestData(w *gen.CodeWriter) error {
  178. return generateTestData(b, w)
  179. }
  180. type locale struct {
  181. tag language.Tag
  182. root *Index
  183. }
  184. // Locale creates an index for the given locale.
  185. func (b *Builder) Locale(t language.Tag) *Index {
  186. index := &Index{
  187. meta: b.rootMeta,
  188. }
  189. b.locales = append(b.locales, locale{tag: t, root: index})
  190. return index
  191. }
  192. // An Index holds a map of either leaf values or other indices.
  193. type Index struct {
  194. meta *metaData
  195. subIndex []*Index
  196. values []keyValue
  197. }
  198. func (i *Index) setError(err error) { i.meta.b.setError(err) }
  199. type keyValue struct {
  200. key enumIndex
  201. value stringInfo
  202. }
  203. // Element is a CLDR XML element.
  204. type Element interface {
  205. GetCommon() *cldr.Common
  206. }
  207. // Index creates a subindex where the type and enum values are not shared
  208. // with siblings by default. The name is derived from the elem. If elem is
  209. // an alias reference, the alias will be resolved and linked. If elem is nil
  210. // Index returns nil.
  211. func (i *Index) Index(elem Element, opt ...Option) *Index {
  212. if elem == nil || reflect.ValueOf(elem).IsNil() {
  213. return nil
  214. }
  215. c := elem.GetCommon()
  216. o := &options{
  217. parent: i,
  218. name: c.GetCommon().Element(),
  219. }
  220. o.fill(opt)
  221. o.setAlias(elem)
  222. return i.subIndexForKey(o)
  223. }
  224. // IndexWithName is like Section but derives the name from the given name.
  225. func (i *Index) IndexWithName(name string, opt ...Option) *Index {
  226. o := &options{parent: i, name: name}
  227. o.fill(opt)
  228. return i.subIndexForKey(o)
  229. }
  230. // IndexFromType creates a subindex the value of tye type attribute as key. It
  231. // will also configure the Index to share the enumeration values with all
  232. // sibling values. If elem is an alias, it will be resolved and linked.
  233. func (i *Index) IndexFromType(elem Element, opts ...Option) *Index {
  234. o := &options{
  235. parent: i,
  236. name: elem.GetCommon().Type,
  237. }
  238. o.fill(opts)
  239. o.setAlias(elem)
  240. useSharedType()(o)
  241. return i.subIndexForKey(o)
  242. }
  243. // IndexFromAlt creates a subindex the value of tye alt attribute as key. It
  244. // will also configure the Index to share the enumeration values with all
  245. // sibling values. If elem is an alias, it will be resolved and linked.
  246. func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index {
  247. o := &options{
  248. parent: i,
  249. name: elem.GetCommon().Alt,
  250. }
  251. o.fill(opts)
  252. o.setAlias(elem)
  253. useSharedType()(o)
  254. return i.subIndexForKey(o)
  255. }
  256. func (i *Index) subIndexForKey(opts *options) *Index {
  257. key := opts.name
  258. if len(i.values) > 0 {
  259. panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key))
  260. }
  261. meta := i.meta.sub(key, opts)
  262. for _, x := range i.subIndex {
  263. if x.meta == meta {
  264. return x
  265. }
  266. }
  267. if alias := opts.alias; alias != nil {
  268. if a := alias.GetCommon().Alias; a != nil {
  269. if a.Source != "locale" {
  270. i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path))
  271. }
  272. if meta.inheritOffset < 0 {
  273. i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path))
  274. }
  275. path := a.Path
  276. for ; strings.HasPrefix(path, "../"); path = path[len("../"):] {
  277. meta.inheritOffset--
  278. }
  279. m := aliasRe.FindStringSubmatch(path)
  280. if m == nil {
  281. i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path))
  282. } else {
  283. key := m[4]
  284. if key == "" {
  285. key = m[1]
  286. }
  287. meta.inheritIndex = key
  288. }
  289. }
  290. }
  291. x := &Index{meta: meta}
  292. i.subIndex = append(i.subIndex, x)
  293. return x
  294. }
  295. var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
  296. // SetValue sets the value, the data from a CLDR XML element, for the given key.
  297. func (i *Index) SetValue(key string, value Element, opt ...Option) {
  298. if len(i.subIndex) > 0 {
  299. panic(fmt.Errorf("adding value for key %q when index already exists", key))
  300. }
  301. o := &options{parent: i}
  302. o.fill(opt)
  303. c := value.GetCommon()
  304. if c.Alias != nil {
  305. i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path))
  306. }
  307. i.setValue(key, c.Data(), o)
  308. }
  309. func (i *Index) setValue(key, data string, o *options) {
  310. index, _ := i.meta.typeInfo.lookupSubtype(key, o)
  311. kv := keyValue{key: index}
  312. if len(i.values) > 0 {
  313. // Add string to the same bucket as the other values.
  314. bucket := i.values[0].value.bucket
  315. kv.value = i.meta.b.addStringToBucket(data, bucket)
  316. } else {
  317. kv.value = i.meta.b.addString(data)
  318. }
  319. i.values = append(i.values, kv)
  320. }