gen.go 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Unicode table generator.
  5. // Data read from the web.
  6. //go:build ignore
  7. // +build ignore
  8. package main
  9. import (
  10. "flag"
  11. "log"
  12. "unicode"
  13. "unicode/utf8"
  14. "golang.org/x/text/internal/gen"
  15. "golang.org/x/text/internal/triegen"
  16. "golang.org/x/text/internal/ucd"
  17. "golang.org/x/text/unicode/norm"
  18. "golang.org/x/text/unicode/rangetable"
  19. )
  20. var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")
  21. var assigned, disallowedRunes *unicode.RangeTable
  22. var runeCategory = map[rune]category{}
  23. var overrides = map[category]category{
  24. viramaModifier: viramaJoinT,
  25. greek: greekJoinT,
  26. hebrew: hebrewJoinT,
  27. }
  28. func setCategory(r rune, cat category) {
  29. if c, ok := runeCategory[r]; ok {
  30. if override, ok := overrides[c]; cat == joiningT && ok {
  31. cat = override
  32. } else {
  33. log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat)
  34. }
  35. }
  36. runeCategory[r] = cat
  37. }
  38. func init() {
  39. if numCategories > 1<<propShift {
  40. log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
  41. }
  42. }
  43. func main() {
  44. gen.Init()
  45. // Load data
  46. runes := []rune{}
  47. // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
  48. ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
  49. if p.String(1) == "Default_Ignorable_Code_Point" {
  50. runes = append(runes, p.Rune(0))
  51. }
  52. })
  53. ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
  54. switch p.String(1) {
  55. case "Noncharacter_Code_Point":
  56. runes = append(runes, p.Rune(0))
  57. }
  58. })
  59. // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
  60. ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
  61. switch p.String(1) {
  62. case "L", "V", "T":
  63. runes = append(runes, p.Rune(0))
  64. }
  65. })
  66. disallowedRunes = rangetable.New(runes...)
  67. assigned = rangetable.Assigned(unicode.Version)
  68. // Load category data.
  69. runeCategory['l'] = latinSmallL
  70. ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
  71. const cccVirama = 9
  72. if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
  73. setCategory(p.Rune(0), viramaModifier)
  74. }
  75. })
  76. ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
  77. switch p.String(1) {
  78. case "Greek":
  79. setCategory(p.Rune(0), greek)
  80. case "Hebrew":
  81. setCategory(p.Rune(0), hebrew)
  82. case "Hiragana", "Katakana", "Han":
  83. setCategory(p.Rune(0), japanese)
  84. }
  85. })
  86. // Set the rule categories associated with exceptions. This overrides any
  87. // previously set categories. The original categories are manually
  88. // reintroduced in the categoryTransitions table.
  89. for r, e := range exceptions {
  90. if e.cat != 0 {
  91. runeCategory[r] = e.cat
  92. }
  93. }
  94. cat := map[string]category{
  95. "L": joiningL,
  96. "D": joiningD,
  97. "T": joiningT,
  98. "R": joiningR,
  99. }
  100. ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
  101. switch v := p.String(1); v {
  102. case "L", "D", "T", "R":
  103. setCategory(p.Rune(0), cat[v])
  104. }
  105. })
  106. writeTables()
  107. gen.Repackage("gen_trieval.go", "trieval.go", "precis")
  108. }
  109. type exception struct {
  110. prop property
  111. cat category
  112. }
  113. func init() {
  114. // Programmatically add the Arabic and Indic digits to the exceptions map.
  115. // See comment in the exceptions map below why these are marked disallowed.
  116. for i := rune(0); i <= 9; i++ {
  117. exceptions[0x0660+i] = exception{
  118. prop: disallowed,
  119. cat: arabicIndicDigit,
  120. }
  121. exceptions[0x06F0+i] = exception{
  122. prop: disallowed,
  123. cat: extendedArabicIndicDigit,
  124. }
  125. }
  126. }
  127. // The Exceptions class as defined in RFC 5892
  128. // https://tools.ietf.org/html/rfc5892#section-2.6
  129. var exceptions = map[rune]exception{
  130. 0x00DF: {prop: pValid},
  131. 0x03C2: {prop: pValid},
  132. 0x06FD: {prop: pValid},
  133. 0x06FE: {prop: pValid},
  134. 0x0F0B: {prop: pValid},
  135. 0x3007: {prop: pValid},
  136. // ContextO|J rules are marked as disallowed, taking a "guilty until proven
  137. // innocent" approach. The main reason for this is that the check for
  138. // whether a context rule should be applied can be moved to the logic for
  139. // handing disallowed runes, taken it off the common path. The exception to
  140. // this rule is for katakanaMiddleDot, as the rule logic is handled without
  141. // using a rule function.
  142. // ContextJ (Join control)
  143. 0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
  144. 0x200D: {prop: disallowed, cat: zeroWidthJoiner},
  145. // ContextO
  146. 0x00B7: {prop: disallowed, cat: middleDot},
  147. 0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
  148. 0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
  149. 0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
  150. 0x30FB: {prop: pValid, cat: katakanaMiddleDot},
  151. // These are officially ContextO, but the implementation does not require
  152. // special treatment of these, so we simply mark them as valid.
  153. 0x0660: {prop: pValid},
  154. 0x0661: {prop: pValid},
  155. 0x0662: {prop: pValid},
  156. 0x0663: {prop: pValid},
  157. 0x0664: {prop: pValid},
  158. 0x0665: {prop: pValid},
  159. 0x0666: {prop: pValid},
  160. 0x0667: {prop: pValid},
  161. 0x0668: {prop: pValid},
  162. 0x0669: {prop: pValid},
  163. 0x06F0: {prop: pValid},
  164. 0x06F1: {prop: pValid},
  165. 0x06F2: {prop: pValid},
  166. 0x06F3: {prop: pValid},
  167. 0x06F4: {prop: pValid},
  168. 0x06F5: {prop: pValid},
  169. 0x06F6: {prop: pValid},
  170. 0x06F7: {prop: pValid},
  171. 0x06F8: {prop: pValid},
  172. 0x06F9: {prop: pValid},
  173. 0x0640: {prop: disallowed},
  174. 0x07FA: {prop: disallowed},
  175. 0x302E: {prop: disallowed},
  176. 0x302F: {prop: disallowed},
  177. 0x3031: {prop: disallowed},
  178. 0x3032: {prop: disallowed},
  179. 0x3033: {prop: disallowed},
  180. 0x3034: {prop: disallowed},
  181. 0x3035: {prop: disallowed},
  182. 0x303B: {prop: disallowed},
  183. }
  // isLetterDigits reports whether r belongs to the LetterDigits class of
  // https://tools.ietf.org/html/rfc5892#section-2.1, that is, whether its
  // general category is one of Ll, Lu, Lm, Lo, Mn, Mc or Nd.
  func isLetterDigits(r rune) bool {
  	classes := [...]*unicode.RangeTable{
  		// Letters
  		unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo,
  		// Modifiers
  		unicode.Mn, unicode.Mc,
  		// Digits
  		unicode.Nd,
  	}
  	for _, tab := range classes {
  		if unicode.Is(tab, r) {
  			return true
  		}
  	}
  	return false
  }
  // isIdDisAndFreePVal reports whether r belongs to one of the general-category
  // classes that writeTables maps to idDisOrFreePVal: OtherLetterDigits, Spaces,
  // Symbols or Punctuation.
  func isIdDisAndFreePVal(r rune) bool {
  	switch {
  	// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
  	// r in {Lt, Nl, No, Me}
  	case unicode.In(r, unicode.Lt, unicode.Nl, unicode.No, unicode.Me):
  		return true

  	// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
  	// r in {Zs}
  	case unicode.Is(unicode.Zs, r):
  		return true

  	// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
  	// r in {Sm, Sc, Sk, So}
  	case unicode.In(r, unicode.Sm, unicode.Sc, unicode.Sk, unicode.So):
  		return true

  	// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
  	// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
  	case unicode.In(r, unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe, unicode.Pi, unicode.Pf, unicode.Po):
  		return true
  	}
  	return false
  }
  211. // HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17
  212. func hasCompat(r rune) bool {
  213. return !norm.NFKC.IsNormalString(string(r))
  214. }
  215. // From https://tools.ietf.org/html/rfc5892:
  216. //
  217. // If .cp. .in. Exceptions Then Exceptions(cp);
  218. // Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
  219. // Else If .cp. .in. Unassigned Then UNASSIGNED;
  220. // Else If .cp. .in. ASCII7 Then PVALID;
  221. // Else If .cp. .in. JoinControl Then CONTEXTJ;
  222. // Else If .cp. .in. OldHangulJamo Then DISALLOWED;
  223. // Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
  224. // Else If .cp. .in. Controls Then DISALLOWED;
  225. // Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
  226. // Else If .cp. .in. LetterDigits Then PVALID;
  227. // Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
  228. // Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
  229. // Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
  230. // Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
  231. // Else DISALLOWED;
  232. func writeTables() {
  233. propTrie := triegen.NewTrie("derivedProperties")
  234. w := gen.NewCodeWriter()
  235. defer w.WriteVersionedGoFile(*outputFile, "precis")
  236. gen.WriteUnicodeVersion(w)
  237. // Iterate over all the runes...
  238. for i := rune(0); i < unicode.MaxRune; i++ {
  239. r := rune(i)
  240. if !utf8.ValidRune(r) {
  241. continue
  242. }
  243. e, ok := exceptions[i]
  244. p := e.prop
  245. switch {
  246. case ok:
  247. case !unicode.In(r, assigned):
  248. p = unassigned
  249. case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
  250. p = pValid
  251. case unicode.In(r, disallowedRunes, unicode.Cc):
  252. p = disallowed
  253. case hasCompat(r):
  254. p = idDisOrFreePVal
  255. case isLetterDigits(r):
  256. p = pValid
  257. case isIdDisAndFreePVal(r):
  258. p = idDisOrFreePVal
  259. default:
  260. p = disallowed
  261. }
  262. cat := runeCategory[r]
  263. // Don't set category for runes that are disallowed.
  264. if p == disallowed {
  265. cat = exceptions[r].cat
  266. }
  267. propTrie.Insert(r, uint64(p)|uint64(cat))
  268. }
  269. sz, err := propTrie.Gen(w)
  270. if err != nil {
  271. log.Fatal(err)
  272. }
  273. w.Size += sz
  274. }