gen.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. package main
  7. // This file generates data for the CLDR plural rules, as defined in
  8. // https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
  9. //
  10. // We assume a slightly simplified grammar:
  11. //
  12. // condition = and_condition ('or' and_condition)* samples
  13. // and_condition = relation ('and' relation)*
  14. // relation = expr ('=' | '!=') range_list
  15. // expr = operand ('%' '10' '0'* )?
  16. // operand = 'n' | 'i' | 'f' | 't' | 'v' | 'w'
  17. // range_list = (range | value) (',' range_list)*
  18. // range = value'..'value
  19. // value = digit+
  20. // digit = 0|1|2|3|4|5|6|7|8|9
  21. //
  22. // samples = ('@integer' sampleList)?
  23. // ('@decimal' sampleList)?
  24. // sampleList = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
  25. // sampleRange = decimalValue ('~' decimalValue)?
  26. // decimalValue = value ('.' value)?
  27. //
  28. // Symbol Value
  29. // n absolute value of the source number (integer and decimals).
  30. // i integer digits of n.
  31. // v number of visible fraction digits in n, with trailing zeros.
  32. // w number of visible fraction digits in n, without trailing zeros.
  33. // f visible fractional digits in n, with trailing zeros.
  34. // t visible fractional digits in n, without trailing zeros.
  35. //
  36. // The algorithm for which the data is generated is based on the following
  37. // observations
  38. //
  39. // - the number of different sets of numbers which the plural rules use to
  40. // test inclusion is limited,
  41. // - most numbers that are tested on are < 100
  42. //
  43. // This allows us to define a bitmap for each number < 100 where a bit i
  44. // indicates whether this number is included in some defined set i.
  45. // The function matchPlural in plural.go defines how we can subsequently use
  46. // this data to determine inclusion.
  47. //
  48. // There are a few languages for which this doesn't work: Italian and
  49. // Azerbaijani, which both test against numbers > 100 for ordinals, and Breton,
  50. // which considers whether numbers are multiples of hundreds. The model here
  51. // could be extended to handle Italian and Azerbaijan fairly easily (by
  52. // considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first
  53. // 100), but for now it seems easier to just hard-code these cases.
  54. import (
  55. "bufio"
  56. "bytes"
  57. "flag"
  58. "fmt"
  59. "log"
  60. "strconv"
  61. "strings"
  62. "golang.org/x/text/internal/gen"
  63. "golang.org/x/text/internal/language"
  64. "golang.org/x/text/internal/language/compact"
  65. "golang.org/x/text/unicode/cldr"
  66. )
  67. var (
  68. test = flag.Bool("test", false,
  69. "test existing tables; can be used to compare web data with package data.")
  70. outputFile = flag.String("output", "tables.go", "output file")
  71. outputTestFile = flag.String("testoutput", "data_test.go", "output file")
  72. draft = flag.String("draft",
  73. "contributed",
  74. `Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
  75. )
  76. func main() {
  77. gen.Init()
  78. const pkg = "plural"
  79. gen.Repackage("gen_common.go", "common.go", pkg)
  80. // Read the CLDR zip file.
  81. r := gen.OpenCLDRCoreZip()
  82. defer r.Close()
  83. d := &cldr.Decoder{}
  84. d.SetDirFilter("supplemental", "main")
  85. d.SetSectionFilter("numbers", "plurals")
  86. data, err := d.DecodeZip(r)
  87. if err != nil {
  88. log.Fatalf("DecodeZip: %v", err)
  89. }
  90. w := gen.NewCodeWriter()
  91. defer w.WriteGoFile(*outputFile, pkg)
  92. gen.WriteCLDRVersion(w)
  93. genPlurals(w, data)
  94. w = gen.NewCodeWriter()
  95. defer w.WriteGoFile(*outputTestFile, pkg)
  96. genPluralsTests(w, data)
  97. }
  98. type pluralTest struct {
  99. locales string // space-separated list of locales for this test
  100. form int // Use int instead of Form to simplify generation.
  101. integer []string // Entries of the form \d+ or \d+~\d+
  102. decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+
  103. }
  104. func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) {
  105. w.WriteType(pluralTest{})
  106. for _, plurals := range data.Supplemental().Plurals {
  107. if plurals.Type == "" {
  108. // The empty type is reserved for plural ranges.
  109. continue
  110. }
  111. tests := []pluralTest{}
  112. for _, pRules := range plurals.PluralRules {
  113. for _, rule := range pRules.PluralRule {
  114. test := pluralTest{
  115. locales: pRules.Locales,
  116. form: int(countMap[rule.Count]),
  117. }
  118. scan := bufio.NewScanner(strings.NewReader(rule.Data()))
  119. scan.Split(splitTokens)
  120. var p *[]string
  121. for scan.Scan() {
  122. switch t := scan.Text(); t {
  123. case "@integer":
  124. p = &test.integer
  125. case "@decimal":
  126. p = &test.decimal
  127. case ",", "…":
  128. default:
  129. if p != nil {
  130. *p = append(*p, t)
  131. }
  132. }
  133. }
  134. tests = append(tests, test)
  135. }
  136. }
  137. w.WriteVar(plurals.Type+"Tests", tests)
  138. }
  139. }
  140. func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) {
  141. for _, plurals := range data.Supplemental().Plurals {
  142. if plurals.Type == "" {
  143. continue
  144. }
  145. // Initialize setMap and inclusionMasks. They are already populated with
  146. // a few entries to serve as an example and to assign nice numbers to
  147. // common cases.
  148. // setMap contains sets of numbers represented by boolean arrays where
  149. // a true value for element i means that the number i is included.
  150. setMap := map[[numN]bool]int{
  151. // The above init func adds an entry for including all numbers.
  152. [numN]bool{1: true}: 1, // fix {1} to a nice value
  153. [numN]bool{2: true}: 2, // fix {2} to a nice value
  154. [numN]bool{0: true}: 3, // fix {0} to a nice value
  155. }
  156. // inclusionMasks contains bit masks for every number under numN to
  157. // indicate in which set the number is included. Bit 1 << x will be set
  158. // if it is included in set x.
  159. inclusionMasks := [numN]uint64{
  160. // Note: these entries are not complete: more bits will be set along the way.
  161. 0: 1 << 3,
  162. 1: 1 << 1,
  163. 2: 1 << 2,
  164. }
  165. // Create set {0..99}. We will assign this set the identifier 0.
  166. var all [numN]bool
  167. for i := range all {
  168. // Mark number i as being included in the set (which has identifier 0).
  169. inclusionMasks[i] |= 1 << 0
  170. // Mark number i as included in the set.
  171. all[i] = true
  172. }
  173. // Register the identifier for the set.
  174. setMap[all] = 0
  175. rules := []pluralCheck{}
  176. index := []byte{0}
  177. langMap := map[compact.ID]byte{0: 0}
  178. for _, pRules := range plurals.PluralRules {
  179. // Parse the rules.
  180. var conds []orCondition
  181. for _, rule := range pRules.PluralRule {
  182. form := countMap[rule.Count]
  183. conds = parsePluralCondition(conds, rule.Data(), form)
  184. }
  185. // Encode the rules.
  186. for _, c := range conds {
  187. // If an or condition only has filters, we create an entry for
  188. // this filter and the set that contains all values.
  189. empty := true
  190. for _, b := range c.used {
  191. empty = empty && !b
  192. }
  193. if empty {
  194. rules = append(rules, pluralCheck{
  195. cat: byte(opMod<<opShift) | byte(c.form),
  196. setID: 0, // all values
  197. })
  198. continue
  199. }
  200. // We have some entries with values.
  201. for i, set := range c.set {
  202. if !c.used[i] {
  203. continue
  204. }
  205. index, ok := setMap[set]
  206. if !ok {
  207. index = len(setMap)
  208. setMap[set] = index
  209. for i := range inclusionMasks {
  210. if set[i] {
  211. inclusionMasks[i] |= 1 << uint64(index)
  212. }
  213. }
  214. }
  215. rules = append(rules, pluralCheck{
  216. cat: byte(i<<opShift | andNext),
  217. setID: byte(index),
  218. })
  219. }
  220. // Now set the last entry to the plural form the rule matches.
  221. rules[len(rules)-1].cat &^= formMask
  222. rules[len(rules)-1].cat |= byte(c.form)
  223. }
  224. // Point the relevant locales to the created entries.
  225. for _, loc := range strings.Split(pRules.Locales, " ") {
  226. if strings.TrimSpace(loc) == "" {
  227. continue
  228. }
  229. lang, ok := compact.FromTag(language.MustParse(loc))
  230. if !ok {
  231. log.Printf("No compact index for locale %q", loc)
  232. }
  233. langMap[lang] = byte(len(index) - 1)
  234. }
  235. index = append(index, byte(len(rules)))
  236. }
  237. w.WriteVar(plurals.Type+"Rules", rules)
  238. w.WriteVar(plurals.Type+"Index", index)
  239. // Expand the values: first by using the parent relationship.
  240. langToIndex := make([]byte, compact.NumCompactTags)
  241. for i := range langToIndex {
  242. for p := compact.ID(i); ; p = p.Parent() {
  243. if x, ok := langMap[p]; ok {
  244. langToIndex[i] = x
  245. break
  246. }
  247. }
  248. }
  249. // Now expand by including entries with identical languages for which
  250. // one isn't set.
  251. for i, v := range langToIndex {
  252. if v == 0 {
  253. id, _ := compact.FromTag(language.Tag{
  254. LangID: compact.ID(i).Tag().LangID,
  255. })
  256. if p := langToIndex[id]; p != 0 {
  257. langToIndex[i] = p
  258. }
  259. }
  260. }
  261. w.WriteVar(plurals.Type+"LangToIndex", langToIndex)
  262. // Need to convert array to slice because of golang.org/issue/7651.
  263. // This will allow tables to be dropped when unused. This is especially
  264. // relevant for the ordinal data, which I suspect won't be used as much.
  265. w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:])
  266. if len(rules) > 0xFF {
  267. log.Fatalf("Too many entries for rules: %#x", len(rules))
  268. }
  269. if len(index) > 0xFF {
  270. log.Fatalf("Too many entries for index: %#x", len(index))
  271. }
  272. if len(setMap) > 64 { // maximum number of bits.
  273. log.Fatalf("Too many entries for setMap: %d", len(setMap))
  274. }
  275. w.WriteComment(
  276. "Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets",
  277. plurals.Type, len(rules), len(index), len(setMap))
  278. // Prevent comment from attaching to the next entry.
  279. fmt.Fprint(w, "\n\n")
  280. }
  281. }
  282. type orCondition struct {
  283. original string // for debugging
  284. form Form
  285. used [32]bool
  286. set [32][numN]bool
  287. }
  288. func (o *orCondition) add(op opID, mod int, v []int) (ok bool) {
  289. ok = true
  290. for _, x := range v {
  291. if x >= maxMod {
  292. ok = false
  293. break
  294. }
  295. }
  296. for i := 0; i < numN; i++ {
  297. m := i
  298. if mod != 0 {
  299. m = i % mod
  300. }
  301. if !intIn(m, v) {
  302. o.set[op][i] = false
  303. }
  304. }
  305. if ok {
  306. o.used[op] = true
  307. }
  308. return ok
  309. }
  310. func intIn(x int, a []int) bool {
  311. for _, y := range a {
  312. if x == y {
  313. return true
  314. }
  315. }
  316. return false
  317. }
  318. var operandIndex = map[string]opID{
  319. "i": opI,
  320. "n": opN,
  321. "f": opF,
  322. "v": opV,
  323. "w": opW,
  324. }
  325. // parsePluralCondition parses the condition of a single pluralRule and appends
  326. // the resulting or conditions to conds.
  327. //
  328. // Example rules:
  329. // // Category "one" in English: only allow 1 with no visible fraction
  330. // i = 1 and v = 0 @integer 1
  331. //
  332. // // Category "few" in Czech: all numbers with visible fractions
  333. // v != 0 @decimal ...
  334. //
  335. // // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or
  336. // // numbers with a fraction 11..19 and no trailing zeros.
  337. // n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ...
  338. //
  339. // @integer and @decimal are followed by examples and are not relevant for the
  340. // rule itself. The are used here to signal the termination of the rule.
  341. func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition {
  342. scan := bufio.NewScanner(strings.NewReader(s))
  343. scan.Split(splitTokens)
  344. for {
  345. cond := orCondition{original: s, form: f}
  346. // Set all numbers to be allowed for all number classes and restrict
  347. // from here on.
  348. for i := range cond.set {
  349. for j := range cond.set[i] {
  350. cond.set[i][j] = true
  351. }
  352. }
  353. andLoop:
  354. for {
  355. var token string
  356. scan.Scan() // Must exist.
  357. switch class := scan.Text(); class {
  358. case "t":
  359. class = "w" // equal to w for t == 0
  360. fallthrough
  361. case "n", "i", "f", "v", "w":
  362. op := scanToken(scan)
  363. opCode := operandIndex[class]
  364. mod := 0
  365. if op == "%" {
  366. opCode |= opMod
  367. switch v := scanUint(scan); v {
  368. case 10, 100:
  369. mod = v
  370. case 1000:
  371. // A more general solution would be to allow checking
  372. // against multiples of 100 and include entries for the
  373. // numbers 100..900 in the inclusion masks. At the
  374. // moment this would only help Azerbaijan and Italian.
  375. // Italian doesn't use '%', so this must be Azerbaijan.
  376. cond.used[opAzerbaijan00s] = true
  377. return append(conds, cond)
  378. case 1000000:
  379. cond.used[opBretonM] = true
  380. return append(conds, cond)
  381. default:
  382. log.Fatalf("Modulo value not supported %d", v)
  383. }
  384. op = scanToken(scan)
  385. }
  386. if op != "=" && op != "!=" {
  387. log.Fatalf("Unexpected op %q", op)
  388. }
  389. if op == "!=" {
  390. opCode |= opNotEqual
  391. }
  392. a := []int{}
  393. v := scanUint(scan)
  394. if class == "w" && v != 0 {
  395. log.Fatalf("Must compare against zero for operand type %q", class)
  396. }
  397. token = scanToken(scan)
  398. for {
  399. switch token {
  400. case "..":
  401. end := scanUint(scan)
  402. for ; v <= end; v++ {
  403. a = append(a, v)
  404. }
  405. token = scanToken(scan)
  406. default: // ",", "or", "and", "@..."
  407. a = append(a, v)
  408. }
  409. if token != "," {
  410. break
  411. }
  412. v = scanUint(scan)
  413. token = scanToken(scan)
  414. }
  415. if !cond.add(opCode, mod, a) {
  416. // Detected large numbers. As we ruled out Azerbaijan, this
  417. // must be the many rule for Italian ordinals.
  418. cond.set[opItalian800] = cond.set[opN]
  419. cond.used[opItalian800] = true
  420. }
  421. case "@integer", "@decimal": // "other" entry: tests only.
  422. return conds
  423. default:
  424. log.Fatalf("Unexpected operand class %q (%s)", class, s)
  425. }
  426. switch token {
  427. case "or":
  428. conds = append(conds, cond)
  429. break andLoop
  430. case "@integer", "@decimal": // examples
  431. // There is always an example in practice, so we always terminate here.
  432. if err := scan.Err(); err != nil {
  433. log.Fatal(err)
  434. }
  435. return append(conds, cond)
  436. case "and":
  437. // keep accumulating
  438. default:
  439. log.Fatalf("Unexpected token %q", token)
  440. }
  441. }
  442. }
  443. }
  444. func scanToken(scan *bufio.Scanner) string {
  445. scan.Scan()
  446. return scan.Text()
  447. }
  448. func scanUint(scan *bufio.Scanner) int {
  449. scan.Scan()
  450. val, err := strconv.ParseUint(scan.Text(), 10, 32)
  451. if err != nil {
  452. log.Fatal(err)
  453. }
  454. return int(val)
  455. }
  456. // splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules.
  457. func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) {
  458. condTokens := [][]byte{
  459. []byte(".."),
  460. []byte(","),
  461. []byte("!="),
  462. []byte("="),
  463. }
  464. advance, token, err = bufio.ScanWords(data, atEOF)
  465. for _, t := range condTokens {
  466. if len(t) >= len(token) {
  467. continue
  468. }
  469. switch p := bytes.Index(token, t); {
  470. case p == -1:
  471. case p == 0:
  472. advance = len(t)
  473. token = token[:len(t)]
  474. return advance - len(token) + len(t), token[:len(t)], err
  475. case p < advance:
  476. // Don't split when "=" overlaps "!=".
  477. if t[0] == '=' && token[p-1] == '!' {
  478. continue
  479. }
  480. advance = p
  481. token = token[:p]
  482. }
  483. }
  484. return advance, token, err
  485. }