123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251 |
- // Copyright 2014 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package triegen
- import (
- "bytes"
- "fmt"
- "io"
- "strings"
- "text/template"
- )
- // print writes all the data structures as well as the code necessary to use the
- // trie to w.
- func (b *builder) print(w io.Writer) error {
- b.Stats.NValueEntries = len(b.ValueBlocks) * blockSize
- b.Stats.NValueBytes = len(b.ValueBlocks) * blockSize * b.ValueSize
- b.Stats.NIndexEntries = len(b.IndexBlocks) * blockSize
- b.Stats.NIndexBytes = len(b.IndexBlocks) * blockSize * b.IndexSize
- b.Stats.NHandleBytes = len(b.Trie) * 2 * b.IndexSize
- // If we only have one root trie, all starter blocks are at position 0 and
- // we can access the arrays directly.
- if len(b.Trie) == 1 {
- // At this point we cannot refer to the generated tables directly.
- b.ASCIIBlock = b.Name + "Values"
- b.StarterBlock = b.Name + "Index"
- } else {
- // Otherwise we need to have explicit starter indexes in the trie
- // structure.
- b.ASCIIBlock = "t.ascii"
- b.StarterBlock = "t.utf8Start"
- }
- b.SourceType = "[]byte"
- if err := lookupGen.Execute(w, b); err != nil {
- return err
- }
- b.SourceType = "string"
- if err := lookupGen.Execute(w, b); err != nil {
- return err
- }
- if err := trieGen.Execute(w, b); err != nil {
- return err
- }
- for _, c := range b.Compactions {
- if err := c.c.Print(w); err != nil {
- return err
- }
- }
- return nil
- }
- func printValues(n int, values []uint64) string {
- w := &bytes.Buffer{}
- boff := n * blockSize
- fmt.Fprintf(w, "\t// Block %#x, offset %#x", n, boff)
- var newline bool
- for i, v := range values {
- if i%6 == 0 {
- newline = true
- }
- if v != 0 {
- if newline {
- fmt.Fprintf(w, "\n")
- newline = false
- }
- fmt.Fprintf(w, "\t%#02x:%#04x, ", boff+i, v)
- }
- }
- return w.String()
- }
- func printIndex(b *builder, nr int, n *node) string {
- w := &bytes.Buffer{}
- boff := nr * blockSize
- fmt.Fprintf(w, "\t// Block %#x, offset %#x", nr, boff)
- var newline bool
- for i, c := range n.children {
- if i%8 == 0 {
- newline = true
- }
- if c != nil {
- v := b.Compactions[c.index.compaction].Offset + uint32(c.index.index)
- if v != 0 {
- if newline {
- fmt.Fprintf(w, "\n")
- newline = false
- }
- fmt.Fprintf(w, "\t%#02x:%#02x, ", boff+i, v)
- }
- }
- }
- return w.String()
- }
- var (
- trieGen = template.Must(template.New("trie").Funcs(template.FuncMap{
- "printValues": printValues,
- "printIndex": printIndex,
- "title": strings.Title,
- "dec": func(x int) int { return x - 1 },
- "psize": func(n int) string {
- return fmt.Sprintf("%d bytes (%.2f KiB)", n, float64(n)/1024)
- },
- }).Parse(trieTemplate))
- lookupGen = template.Must(template.New("lookup").Parse(lookupTemplate))
- )
- // TODO: consider the return type of lookup. It could be uint64, even if the
- // internal value type is smaller. We will have to verify this with the
- // performance of unicode/norm, which is very sensitive to such changes.
- const trieTemplate = `{{$b := .}}{{$multi := gt (len .Trie) 1}}
- // {{.Name}}Trie. Total size: {{psize .Size}}. Checksum: {{printf "%08x" .Checksum}}.
- type {{.Name}}Trie struct { {{if $multi}}
- ascii []{{.ValueType}} // index for ASCII bytes
- utf8Start []{{.IndexType}} // index for UTF-8 bytes >= 0xC0
- {{end}}}
- func new{{title .Name}}Trie(i int) *{{.Name}}Trie { {{if $multi}}
- h := {{.Name}}TrieHandles[i]
- return &{{.Name}}Trie{ {{.Name}}Values[uint32(h.ascii)<<6:], {{.Name}}Index[uint32(h.multi)<<6:] }
- }
- type {{.Name}}TrieHandle struct {
- ascii, multi {{.IndexType}}
- }
- // {{.Name}}TrieHandles: {{len .Trie}} handles, {{.Stats.NHandleBytes}} bytes
- var {{.Name}}TrieHandles = [{{len .Trie}}]{{.Name}}TrieHandle{
- {{range .Trie}} { {{.ASCIIIndex}}, {{.StarterIndex}} }, // {{printf "%08x" .Checksum}}: {{.Name}}
- {{end}}}{{else}}
- return &{{.Name}}Trie{}
- }
- {{end}}
- // lookupValue determines the type of block n and looks up the value for b.
- func (t *{{.Name}}Trie) lookupValue(n uint32, b byte) {{.ValueType}}{{$last := dec (len .Compactions)}} {
- switch { {{range $i, $c := .Compactions}}
- {{if eq $i $last}}default{{else}}case n < {{$c.Cutoff}}{{end}}:{{if ne $i 0}}
- n -= {{$c.Offset}}{{end}}
- return {{print $b.ValueType}}({{$c.Handler}}){{end}}
- }
- }
- // {{.Name}}Values: {{len .ValueBlocks}} blocks, {{.Stats.NValueEntries}} entries, {{.Stats.NValueBytes}} bytes
- // The third block is the zero block.
- var {{.Name}}Values = [{{.Stats.NValueEntries}}]{{.ValueType}} {
- {{range $i, $v := .ValueBlocks}}{{printValues $i $v}}
- {{end}}}
- // {{.Name}}Index: {{len .IndexBlocks}} blocks, {{.Stats.NIndexEntries}} entries, {{.Stats.NIndexBytes}} bytes
- // Block 0 is the zero block.
- var {{.Name}}Index = [{{.Stats.NIndexEntries}}]{{.IndexType}} {
- {{range $i, $v := .IndexBlocks}}{{printIndex $b $i $v}}
- {{end}}}
- `
- // TODO: consider allowing zero-length strings after evaluating performance with
- // unicode/norm.
- const lookupTemplate = `
- // lookup{{if eq .SourceType "string"}}String{{end}} returns the trie value for the first UTF-8 encoding in s and
- // the width in bytes of this encoding. The size will be 0 if s does not
- // hold enough bytes to complete the encoding. len(s) must be greater than 0.
- func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}(s {{.SourceType}}) (v {{.ValueType}}, sz int) {
- c0 := s[0]
- switch {
- case c0 < 0x80: // is ASCII
- return {{.ASCIIBlock}}[c0], 1
- case c0 < 0xC2:
- return 0, 1 // Illegal UTF-8: not a starter, not ASCII.
- case c0 < 0xE0: // 2-byte UTF-8
- if len(s) < 2 {
- return 0, 0
- }
- i := {{.StarterBlock}}[c0]
- c1 := s[1]
- if c1 < 0x80 || 0xC0 <= c1 {
- return 0, 1 // Illegal UTF-8: not a continuation byte.
- }
- return t.lookupValue(uint32(i), c1), 2
- case c0 < 0xF0: // 3-byte UTF-8
- if len(s) < 3 {
- return 0, 0
- }
- i := {{.StarterBlock}}[c0]
- c1 := s[1]
- if c1 < 0x80 || 0xC0 <= c1 {
- return 0, 1 // Illegal UTF-8: not a continuation byte.
- }
- o := uint32(i)<<6 + uint32(c1)
- i = {{.Name}}Index[o]
- c2 := s[2]
- if c2 < 0x80 || 0xC0 <= c2 {
- return 0, 2 // Illegal UTF-8: not a continuation byte.
- }
- return t.lookupValue(uint32(i), c2), 3
- case c0 < 0xF8: // 4-byte UTF-8
- if len(s) < 4 {
- return 0, 0
- }
- i := {{.StarterBlock}}[c0]
- c1 := s[1]
- if c1 < 0x80 || 0xC0 <= c1 {
- return 0, 1 // Illegal UTF-8: not a continuation byte.
- }
- o := uint32(i)<<6 + uint32(c1)
- i = {{.Name}}Index[o]
- c2 := s[2]
- if c2 < 0x80 || 0xC0 <= c2 {
- return 0, 2 // Illegal UTF-8: not a continuation byte.
- }
- o = uint32(i)<<6 + uint32(c2)
- i = {{.Name}}Index[o]
- c3 := s[3]
- if c3 < 0x80 || 0xC0 <= c3 {
- return 0, 3 // Illegal UTF-8: not a continuation byte.
- }
- return t.lookupValue(uint32(i), c3), 4
- }
- // Illegal rune
- return 0, 1
- }
- // lookup{{if eq .SourceType "string"}}String{{end}}Unsafe returns the trie value for the first UTF-8 encoding in s.
- // s must start with a full and valid UTF-8 encoded rune.
- func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}Unsafe(s {{.SourceType}}) {{.ValueType}} {
- c0 := s[0]
- if c0 < 0x80 { // is ASCII
- return {{.ASCIIBlock}}[c0]
- }
- i := {{.StarterBlock}}[c0]
- if c0 < 0xE0 { // 2-byte UTF-8
- return t.lookupValue(uint32(i), s[1])
- }
- i = {{.Name}}Index[uint32(i)<<6+uint32(s[1])]
- if c0 < 0xF0 { // 3-byte UTF-8
- return t.lookupValue(uint32(i), s[2])
- }
- i = {{.Name}}Index[uint32(i)<<6+uint32(s[2])]
- if c0 < 0xF8 { // 4-byte UTF-8
- return t.lookupValue(uint32(i), s[3])
- }
- return 0
- }
- `
|