gen.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. // This program generates the trie for idna operations. The Unicode casing
  7. // algorithm requires the lookup of various properties and mappings for each
  8. // rune. The table generated by this generator combines several of the most
  9. // frequently used of these into a single trie so that they can be accessed
  10. // with a single lookup.
  11. package main
  12. import (
  13. "fmt"
  14. "io"
  15. "log"
  16. "unicode"
  17. "unicode/utf8"
  18. "golang.org/x/text/internal/gen"
  19. "golang.org/x/text/internal/triegen"
  20. "golang.org/x/text/internal/ucd"
  21. "golang.org/x/text/unicode/bidi"
  22. )
  23. func main() {
  24. gen.Init()
  25. genTables()
  26. gen.Repackage("gen_trieval.go", "trieval.go", "idna")
  27. gen.Repackage("gen_common.go", "common_test.go", "idna")
  28. }
  29. var runes = map[rune]info{}
  30. func genTables() {
  31. t := triegen.NewTrie("idna")
  32. ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) {
  33. r := p.Rune(0)
  34. if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M"
  35. runes[r] = mayNeedNorm
  36. }
  37. })
  38. ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
  39. r := p.Rune(0)
  40. const cccVirama = 9
  41. if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
  42. runes[p.Rune(0)] = viramaModifier
  43. }
  44. switch {
  45. case unicode.In(r, unicode.Mark):
  46. runes[r] |= modifier | mayNeedNorm
  47. }
  48. // TODO: by using UnicodeData.txt we don't mark undefined codepoints
  49. // that are earmarked as RTL properly. However, an undefined cp will
  50. // always fail, so there is no need to store this info.
  51. switch p, _ := bidi.LookupRune(r); p.Class() {
  52. case bidi.R, bidi.AL, bidi.AN:
  53. if x := runes[r]; x != 0 && x != mayNeedNorm {
  54. log.Fatalf("%U: rune both modifier and RTL letter/number", r)
  55. }
  56. runes[r] = rtl
  57. }
  58. })
  59. ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
  60. switch v := p.String(1); v {
  61. case "L", "D", "T", "R":
  62. runes[p.Rune(0)] |= joinType[v] << joinShift
  63. }
  64. })
  65. ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
  66. r := p.Rune(0)
  67. // The mappings table explicitly defines surrogates as invalid.
  68. if !utf8.ValidRune(r) {
  69. return
  70. }
  71. cat := catFromEntry(p)
  72. isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
  73. if !isMapped {
  74. // Only include additional category information for non-mapped
  75. // runes. The additional information is only used after mapping and
  76. // the bits would clash with mapping information.
  77. // TODO: it would be possible to inline this data and avoid
  78. // additional lookups. This is quite tedious, though, so let's first
  79. // see if we need this.
  80. cat |= category(runes[r])
  81. }
  82. s := string(p.Runes(2))
  83. if s != "" && !isMapped {
  84. log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
  85. }
  86. t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
  87. })
  88. w := gen.NewCodeWriter()
  89. defer w.WriteVersionedGoFile("tables.go", "idna")
  90. gen.WriteUnicodeVersion(w)
  91. w.WriteVar("mappings", string(mappings))
  92. w.WriteVar("xorData", string(xorData))
  93. sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
  94. if err != nil {
  95. log.Fatal(err)
  96. }
  97. w.Size += sz
  98. }
  99. var (
  100. // mappings contains replacement strings for mapped runes, each prefixed
  101. // with a byte containing the length of the following string.
  102. mappings = []byte{}
  103. mapCache = map[string]int{}
  104. // xorData is like mappings, except that it contains XOR data.
  105. // We split these two tables so that we don't get an overflow.
  106. xorData = []byte{}
  107. xorCache = map[string]int{}
  108. )
  109. // makeEntry creates a trie entry.
  110. func makeEntry(r rune, mapped string) info {
  111. orig := string(r)
  112. if len(orig) != len(mapped) {
  113. // Store the mapped value as is in the mappings table.
  114. index := len(mappings)
  115. if x, ok := mapCache[mapped]; ok {
  116. index = x
  117. } else {
  118. mapCache[mapped] = index
  119. mappings = append(mappings, byte(len(mapped)))
  120. mappings = append(mappings, mapped...)
  121. }
  122. return info(index) << indexShift
  123. }
  124. // Create per-byte XOR mask.
  125. var b []byte
  126. for i := 0; i < len(orig); i++ {
  127. b = append(b, orig[i]^mapped[i])
  128. }
  129. // Remove leading 0 bytes, but keep at least one byte.
  130. for ; len(b) > 1 && b[0] == 0; b = b[1:] {
  131. }
  132. if len(b) == 1 {
  133. return xorBit | inlineXOR | info(b[0])<<indexShift
  134. }
  135. mapped = string(b)
  136. // Store the mapped value as is in the mappings table.
  137. index := len(xorData)
  138. if x, ok := xorCache[mapped]; ok {
  139. index = x
  140. } else {
  141. xorCache[mapped] = index
  142. xorData = append(xorData, byte(len(mapped)))
  143. xorData = append(xorData, mapped...)
  144. }
  145. return xorBit | info(index)<<indexShift
  146. }
  147. // The following code implements a triegen.Compacter that was originally
  148. // designed for normalization. The IDNA table has some similarities with the
  149. // norm table. Using this compacter, together with the XOR pattern approach,
  150. // reduces the table size by roughly 100K. It can probably be compressed further
  151. // by also including elements of the compacter used by cases, but for now it is
  152. // good enough.
  153. const maxSparseEntries = 16
  154. type normCompacter struct {
  155. sparseBlocks [][]uint64
  156. sparseOffset []uint16
  157. sparseCount int
  158. }
  159. func mostFrequentStride(a []uint64) int {
  160. counts := make(map[int]int)
  161. var v int
  162. for _, x := range a {
  163. if stride := int(x) - v; v != 0 && stride >= 0 {
  164. counts[stride]++
  165. }
  166. v = int(x)
  167. }
  168. var maxs, maxc int
  169. for stride, cnt := range counts {
  170. if cnt > maxc || (cnt == maxc && stride < maxs) {
  171. maxs, maxc = stride, cnt
  172. }
  173. }
  174. return maxs
  175. }
  176. func countSparseEntries(a []uint64) int {
  177. stride := mostFrequentStride(a)
  178. var v, count int
  179. for _, tv := range a {
  180. if int(tv)-v != stride {
  181. if tv != 0 {
  182. count++
  183. }
  184. }
  185. v = int(tv)
  186. }
  187. return count
  188. }
  189. func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
  190. if n := countSparseEntries(v); n <= maxSparseEntries {
  191. return (n+1)*4 + 2, true
  192. }
  193. return 0, false
  194. }
  195. func (c *normCompacter) Store(v []uint64) uint32 {
  196. h := uint32(len(c.sparseOffset))
  197. c.sparseBlocks = append(c.sparseBlocks, v)
  198. c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
  199. c.sparseCount += countSparseEntries(v) + 1
  200. return h
  201. }
  202. func (c *normCompacter) Handler() string {
  203. return "idnaSparse.lookup"
  204. }
  205. func (c *normCompacter) Print(w io.Writer) (retErr error) {
  206. p := func(f string, x ...interface{}) {
  207. if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
  208. retErr = err
  209. }
  210. }
  211. ls := len(c.sparseBlocks)
  212. p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
  213. p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)
  214. ns := c.sparseCount
  215. p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
  216. p("var idnaSparseValues = [%d]valueRange {", ns)
  217. for i, b := range c.sparseBlocks {
  218. p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
  219. var v int
  220. stride := mostFrequentStride(b)
  221. n := countSparseEntries(b)
  222. p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
  223. for i, nv := range b {
  224. if int(nv)-v != stride {
  225. if v != 0 {
  226. p(",hi:%#02x},", 0x80+i-1)
  227. }
  228. if nv != 0 {
  229. p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
  230. }
  231. }
  232. v = int(nv)
  233. }
  234. if v != 0 {
  235. p(",hi:%#02x},", 0x80+len(b)-1)
  236. }
  237. }
  238. p("\n}\n\n")
  239. return
  240. }