123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310 |
- // Copyright 2015 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // Unicode table generator.
- // Data read from the web.
- // +build ignore
- package main
- import (
- "flag"
- "log"
- "unicode"
- "unicode/utf8"
- "golang.org/x/text/internal/gen"
- "golang.org/x/text/internal/triegen"
- "golang.org/x/text/internal/ucd"
- "golang.org/x/text/unicode/norm"
- "golang.org/x/text/unicode/rangetable"
- )
- var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")
- var assigned, disallowedRunes *unicode.RangeTable
- var runeCategory = map[rune]category{}
- var overrides = map[category]category{
- viramaModifier: viramaJoinT,
- greek: greekJoinT,
- hebrew: hebrewJoinT,
- }
- func setCategory(r rune, cat category) {
- if c, ok := runeCategory[r]; ok {
- if override, ok := overrides[c]; cat == joiningT && ok {
- cat = override
- } else {
- log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat)
- }
- }
- runeCategory[r] = cat
- }
- func init() {
- if numCategories > 1<<propShift {
- log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
- }
- }
- func main() {
- gen.Init()
- // Load data
- runes := []rune{}
- // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
- ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
- if p.String(1) == "Default_Ignorable_Code_Point" {
- runes = append(runes, p.Rune(0))
- }
- })
- ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
- switch p.String(1) {
- case "Noncharacter_Code_Point":
- runes = append(runes, p.Rune(0))
- }
- })
- // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
- ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
- switch p.String(1) {
- case "L", "V", "T":
- runes = append(runes, p.Rune(0))
- }
- })
- disallowedRunes = rangetable.New(runes...)
- assigned = rangetable.Assigned(unicode.Version)
- // Load category data.
- runeCategory['l'] = latinSmallL
- ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
- const cccVirama = 9
- if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
- setCategory(p.Rune(0), viramaModifier)
- }
- })
- ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
- switch p.String(1) {
- case "Greek":
- setCategory(p.Rune(0), greek)
- case "Hebrew":
- setCategory(p.Rune(0), hebrew)
- case "Hiragana", "Katakana", "Han":
- setCategory(p.Rune(0), japanese)
- }
- })
- // Set the rule categories associated with exceptions. This overrides any
- // previously set categories. The original categories are manually
- // reintroduced in the categoryTransitions table.
- for r, e := range exceptions {
- if e.cat != 0 {
- runeCategory[r] = e.cat
- }
- }
- cat := map[string]category{
- "L": joiningL,
- "D": joiningD,
- "T": joiningT,
- "R": joiningR,
- }
- ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
- switch v := p.String(1); v {
- case "L", "D", "T", "R":
- setCategory(p.Rune(0), cat[v])
- }
- })
- writeTables()
- gen.Repackage("gen_trieval.go", "trieval.go", "precis")
- }
- type exception struct {
- prop property
- cat category
- }
- func init() {
- // Programmatically add the Arabic and Indic digits to the exceptions map.
- // See comment in the exceptions map below why these are marked disallowed.
- for i := rune(0); i <= 9; i++ {
- exceptions[0x0660+i] = exception{
- prop: disallowed,
- cat: arabicIndicDigit,
- }
- exceptions[0x06F0+i] = exception{
- prop: disallowed,
- cat: extendedArabicIndicDigit,
- }
- }
- }
- // The Exceptions class as defined in RFC 5892
- // https://tools.ietf.org/html/rfc5892#section-2.6
- var exceptions = map[rune]exception{
- 0x00DF: {prop: pValid},
- 0x03C2: {prop: pValid},
- 0x06FD: {prop: pValid},
- 0x06FE: {prop: pValid},
- 0x0F0B: {prop: pValid},
- 0x3007: {prop: pValid},
- // ContextO|J rules are marked as disallowed, taking a "guilty until proven
- // innocent" approach. The main reason for this is that the check for
- // whether a context rule should be applied can be moved to the logic for
- // handing disallowed runes, taken it off the common path. The exception to
- // this rule is for katakanaMiddleDot, as the rule logic is handled without
- // using a rule function.
- // ContextJ (Join control)
- 0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
- 0x200D: {prop: disallowed, cat: zeroWidthJoiner},
- // ContextO
- 0x00B7: {prop: disallowed, cat: middleDot},
- 0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
- 0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
- 0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
- 0x30FB: {prop: pValid, cat: katakanaMiddleDot},
- // These are officially ContextO, but the implementation does not require
- // special treatment of these, so we simply mark them as valid.
- 0x0660: {prop: pValid},
- 0x0661: {prop: pValid},
- 0x0662: {prop: pValid},
- 0x0663: {prop: pValid},
- 0x0664: {prop: pValid},
- 0x0665: {prop: pValid},
- 0x0666: {prop: pValid},
- 0x0667: {prop: pValid},
- 0x0668: {prop: pValid},
- 0x0669: {prop: pValid},
- 0x06F0: {prop: pValid},
- 0x06F1: {prop: pValid},
- 0x06F2: {prop: pValid},
- 0x06F3: {prop: pValid},
- 0x06F4: {prop: pValid},
- 0x06F5: {prop: pValid},
- 0x06F6: {prop: pValid},
- 0x06F7: {prop: pValid},
- 0x06F8: {prop: pValid},
- 0x06F9: {prop: pValid},
- 0x0640: {prop: disallowed},
- 0x07FA: {prop: disallowed},
- 0x302E: {prop: disallowed},
- 0x302F: {prop: disallowed},
- 0x3031: {prop: disallowed},
- 0x3032: {prop: disallowed},
- 0x3033: {prop: disallowed},
- 0x3034: {prop: disallowed},
- 0x3035: {prop: disallowed},
- 0x303B: {prop: disallowed},
- }
// isLetterDigits reports whether r belongs to the LetterDigits class
// (https://tools.ietf.org/html/rfc5892#section-2.1), i.e. whether its general
// category is one of {Ll, Lu, Lo, Nd, Lm, Mn, Mc}.
func isLetterDigits(r rune) bool {
	letterDigits := [...]*unicode.RangeTable{
		// Letters
		unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo,
		// Nonspacing and spacing combining marks
		unicode.Mn, unicode.Mc,
		// Decimal digits
		unicode.Nd,
	}
	for _, tab := range letterDigits {
		if unicode.Is(tab, r) {
			return true
		}
	}
	return false
}
// isIdDisAndFreePVal reports whether r falls in one of the general-category
// classes listed below: OtherLetterDigits, Spaces, Symbols or Punctuation.
func isIdDisAndFreePVal(r rune) bool {
	classes := [...]*unicode.RangeTable{
		// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
		// r in {Lt, Nl, No, Me}
		unicode.Lt, unicode.Nl, unicode.No, unicode.Me,

		// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
		// r in {Zs}
		unicode.Zs,

		// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
		// r in {Sm, Sc, Sk, So}
		unicode.Sm, unicode.Sc, unicode.Sk, unicode.So,

		// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
		// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
		unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
		unicode.Pi, unicode.Pf, unicode.Po,
	}
	for _, tab := range classes {
		if unicode.Is(tab, r) {
			return true
		}
	}
	return false
}
- // HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17
- func hasCompat(r rune) bool {
- return !norm.NFKC.IsNormalString(string(r))
- }
- // From https://tools.ietf.org/html/rfc5892:
- //
- // If .cp. .in. Exceptions Then Exceptions(cp);
- // Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
- // Else If .cp. .in. Unassigned Then UNASSIGNED;
- // Else If .cp. .in. ASCII7 Then PVALID;
- // Else If .cp. .in. JoinControl Then CONTEXTJ;
- // Else If .cp. .in. OldHangulJamo Then DISALLOWED;
- // Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
- // Else If .cp. .in. Controls Then DISALLOWED;
- // Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
- // Else If .cp. .in. LetterDigits Then PVALID;
- // Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
- // Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
- // Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
- // Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
- // Else DISALLOWED;
- func writeTables() {
- propTrie := triegen.NewTrie("derivedProperties")
- w := gen.NewCodeWriter()
- defer w.WriteVersionedGoFile(*outputFile, "precis")
- gen.WriteUnicodeVersion(w)
- // Iterate over all the runes...
- for i := rune(0); i < unicode.MaxRune; i++ {
- r := rune(i)
- if !utf8.ValidRune(r) {
- continue
- }
- e, ok := exceptions[i]
- p := e.prop
- switch {
- case ok:
- case !unicode.In(r, assigned):
- p = unassigned
- case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
- p = pValid
- case unicode.In(r, disallowedRunes, unicode.Cc):
- p = disallowed
- case hasCompat(r):
- p = idDisOrFreePVal
- case isLetterDigits(r):
- p = pValid
- case isIdDisAndFreePVal(r):
- p = idDisOrFreePVal
- default:
- p = disallowed
- }
- cat := runeCategory[r]
- // Don't set category for runes that are disallowed.
- if p == disallowed {
- cat = exceptions[r].cat
- }
- propTrie.Insert(r, uint64(p)|uint64(cat))
- }
- sz, err := propTrie.Gen(w)
- if err != nil {
- log.Fatal(err)
- }
- w.Size += sz
- }
|