trie.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // The trie in this file is used to associate the first full character
  5. // in a UTF-8 string to a collation element.
  6. // All but the last byte in a UTF-8 byte sequence are
  7. // used to look up offsets in the index table to be used for the next byte.
  8. // The last byte is used to index into a table of collation elements.
  9. // This file contains the code for the generation of the trie.
  10. package build
  11. import (
  12. "fmt"
  13. "hash/fnv"
  14. "io"
  15. "reflect"
  16. )
  17. const (
  18. blockSize = 64
  19. blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes.
  20. )
  21. type trieHandle struct {
  22. lookupStart uint16 // offset in table for first byte
  23. valueStart uint16 // offset in table for first byte
  24. }
  25. type trie struct {
  26. index []uint16
  27. values []uint32
  28. }
  29. // trieNode is the intermediate trie structure used for generating a trie.
  30. type trieNode struct {
  31. index []*trieNode
  32. value []uint32
  33. b byte
  34. refValue uint16
  35. refIndex uint16
  36. }
  37. func newNode() *trieNode {
  38. return &trieNode{
  39. index: make([]*trieNode, 64),
  40. value: make([]uint32, 128), // root node size is 128 instead of 64
  41. }
  42. }
  43. func (n *trieNode) isInternal() bool {
  44. return n.value != nil
  45. }
  46. func (n *trieNode) insert(r rune, value uint32) {
  47. const maskx = 0x3F // mask out two most-significant bits
  48. str := string(r)
  49. if len(str) == 1 {
  50. n.value[str[0]] = value
  51. return
  52. }
  53. for i := 0; i < len(str)-1; i++ {
  54. b := str[i] & maskx
  55. if n.index == nil {
  56. n.index = make([]*trieNode, blockSize)
  57. }
  58. nn := n.index[b]
  59. if nn == nil {
  60. nn = &trieNode{}
  61. nn.b = b
  62. n.index[b] = nn
  63. }
  64. n = nn
  65. }
  66. if n.value == nil {
  67. n.value = make([]uint32, blockSize)
  68. }
  69. b := str[len(str)-1] & maskx
  70. n.value[b] = value
  71. }
  72. type trieBuilder struct {
  73. t *trie
  74. roots []*trieHandle
  75. lookupBlocks []*trieNode
  76. valueBlocks []*trieNode
  77. lookupBlockIdx map[uint32]*trieNode
  78. valueBlockIdx map[uint32]*trieNode
  79. }
  80. func newTrieBuilder() *trieBuilder {
  81. index := &trieBuilder{}
  82. index.lookupBlocks = make([]*trieNode, 0)
  83. index.valueBlocks = make([]*trieNode, 0)
  84. index.lookupBlockIdx = make(map[uint32]*trieNode)
  85. index.valueBlockIdx = make(map[uint32]*trieNode)
  86. // The third nil is the default null block. The other two blocks
  87. // are used to guarantee an offset of at least 3 for each block.
  88. index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil)
  89. index.t = &trie{}
  90. return index
  91. }
  92. func (b *trieBuilder) computeOffsets(n *trieNode) *trieNode {
  93. hasher := fnv.New32()
  94. if n.index != nil {
  95. for i, nn := range n.index {
  96. var vi, vv uint16
  97. if nn != nil {
  98. nn = b.computeOffsets(nn)
  99. n.index[i] = nn
  100. vi = nn.refIndex
  101. vv = nn.refValue
  102. }
  103. hasher.Write([]byte{byte(vi >> 8), byte(vi)})
  104. hasher.Write([]byte{byte(vv >> 8), byte(vv)})
  105. }
  106. h := hasher.Sum32()
  107. nn, ok := b.lookupBlockIdx[h]
  108. if !ok {
  109. n.refIndex = uint16(len(b.lookupBlocks)) - blockOffset
  110. b.lookupBlocks = append(b.lookupBlocks, n)
  111. b.lookupBlockIdx[h] = n
  112. } else {
  113. n = nn
  114. }
  115. } else {
  116. for _, v := range n.value {
  117. hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
  118. }
  119. h := hasher.Sum32()
  120. nn, ok := b.valueBlockIdx[h]
  121. if !ok {
  122. n.refValue = uint16(len(b.valueBlocks)) - blockOffset
  123. n.refIndex = n.refValue
  124. b.valueBlocks = append(b.valueBlocks, n)
  125. b.valueBlockIdx[h] = n
  126. } else {
  127. n = nn
  128. }
  129. }
  130. return n
  131. }
  132. func (b *trieBuilder) addStartValueBlock(n *trieNode) uint16 {
  133. hasher := fnv.New32()
  134. for _, v := range n.value[:2*blockSize] {
  135. hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
  136. }
  137. h := hasher.Sum32()
  138. nn, ok := b.valueBlockIdx[h]
  139. if !ok {
  140. n.refValue = uint16(len(b.valueBlocks))
  141. n.refIndex = n.refValue
  142. b.valueBlocks = append(b.valueBlocks, n)
  143. // Add a dummy block to accommodate the double block size.
  144. b.valueBlocks = append(b.valueBlocks, nil)
  145. b.valueBlockIdx[h] = n
  146. } else {
  147. n = nn
  148. }
  149. return n.refValue
  150. }
  151. func genValueBlock(t *trie, n *trieNode) {
  152. if n != nil {
  153. for _, v := range n.value {
  154. t.values = append(t.values, v)
  155. }
  156. }
  157. }
  158. func genLookupBlock(t *trie, n *trieNode) {
  159. for _, nn := range n.index {
  160. v := uint16(0)
  161. if nn != nil {
  162. if n.index != nil {
  163. v = nn.refIndex
  164. } else {
  165. v = nn.refValue
  166. }
  167. }
  168. t.index = append(t.index, v)
  169. }
  170. }
  171. func (b *trieBuilder) addTrie(n *trieNode) *trieHandle {
  172. h := &trieHandle{}
  173. b.roots = append(b.roots, h)
  174. h.valueStart = b.addStartValueBlock(n)
  175. if len(b.roots) == 1 {
  176. // We insert a null block after the first start value block.
  177. // This ensures that continuation bytes UTF-8 sequences of length
  178. // greater than 2 will automatically hit a null block if there
  179. // was an undefined entry.
  180. b.valueBlocks = append(b.valueBlocks, nil)
  181. }
  182. n = b.computeOffsets(n)
  183. // Offset by one extra block as the first byte starts at 0xC0 instead of 0x80.
  184. h.lookupStart = n.refIndex - 1
  185. return h
  186. }
  187. // generate generates and returns the trie for n.
  188. func (b *trieBuilder) generate() (t *trie, err error) {
  189. t = b.t
  190. if len(b.valueBlocks) >= 1<<16 {
  191. return nil, fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(b.valueBlocks), 1<<16)
  192. }
  193. if len(b.lookupBlocks) >= 1<<16 {
  194. return nil, fmt.Errorf("maximum number of lookup blocks exceeded (%d > %d)", len(b.lookupBlocks), 1<<16)
  195. }
  196. genValueBlock(t, b.valueBlocks[0])
  197. genValueBlock(t, &trieNode{value: make([]uint32, 64)})
  198. for i := 2; i < len(b.valueBlocks); i++ {
  199. genValueBlock(t, b.valueBlocks[i])
  200. }
  201. n := &trieNode{index: make([]*trieNode, 64)}
  202. genLookupBlock(t, n)
  203. genLookupBlock(t, n)
  204. genLookupBlock(t, n)
  205. for i := 3; i < len(b.lookupBlocks); i++ {
  206. genLookupBlock(t, b.lookupBlocks[i])
  207. }
  208. return b.t, nil
  209. }
  210. func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) {
  211. p := func(f string, a ...interface{}) {
  212. nn, e := fmt.Fprintf(w, f, a...)
  213. n += nn
  214. if err == nil {
  215. err = e
  216. }
  217. }
  218. nv := len(t.values)
  219. p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4)
  220. p("// Block 2 is the null block.\n")
  221. p("var %sValues = [%d]uint32 {", name, nv)
  222. var printnewline bool
  223. for i, v := range t.values {
  224. if i%blockSize == 0 {
  225. p("\n\t// Block %#x, offset %#x", i/blockSize, i)
  226. }
  227. if i%4 == 0 {
  228. printnewline = true
  229. }
  230. if v != 0 {
  231. if printnewline {
  232. p("\n\t")
  233. printnewline = false
  234. }
  235. p("%#04x:%#08x, ", i, v)
  236. }
  237. }
  238. p("\n}\n\n")
  239. ni := len(t.index)
  240. p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2)
  241. p("// Block 0 is the null block.\n")
  242. p("var %sLookup = [%d]uint16 {", name, ni)
  243. printnewline = false
  244. for i, v := range t.index {
  245. if i%blockSize == 0 {
  246. p("\n\t// Block %#x, offset %#x", i/blockSize, i)
  247. }
  248. if i%8 == 0 {
  249. printnewline = true
  250. }
  251. if v != 0 {
  252. if printnewline {
  253. p("\n\t")
  254. printnewline = false
  255. }
  256. p("%#03x:%#02x, ", i, v)
  257. }
  258. }
  259. p("\n}\n\n")
  260. return n, nv*4 + ni*2, err
  261. }
  262. func (t *trie) printStruct(w io.Writer, handle *trieHandle, name string) (n, sz int, err error) {
  263. const msg = "trie{ %sLookup[%d:], %sValues[%d:], %sLookup[:], %sValues[:]}"
  264. n, err = fmt.Fprintf(w, msg, name, handle.lookupStart*blockSize, name, handle.valueStart*blockSize, name, name)
  265. sz += int(reflect.TypeOf(trie{}).Size())
  266. return
  267. }