123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- // Copyright 2015 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // +build ignore
- // This program generates the trie for width operations. The generated table
- // includes width category information as well as the normalization mappings.
- package main
- import (
- "bytes"
- "fmt"
- "io"
- "log"
- "math"
- "unicode/utf8"
- "golang.org/x/text/internal/gen"
- "golang.org/x/text/internal/triegen"
- )
- // See gen_common.go for flags.
- func main() {
- gen.Init()
- genTables()
- genTests()
- gen.Repackage("gen_trieval.go", "trieval.go", "width")
- gen.Repackage("gen_common.go", "common_test.go", "width")
- }
- func genTables() {
- t := triegen.NewTrie("width")
- // fold and inverse mappings. See mapComment for a description of the format
- // of each entry. Add dummy value to make an index of 0 mean no mapping.
- inverse := [][4]byte{{}}
- mapping := map[[4]byte]int{[4]byte{}: 0}
- getWidthData(func(r rune, tag elem, alt rune) {
- idx := 0
- if alt != 0 {
- var buf [4]byte
- buf[0] = byte(utf8.EncodeRune(buf[1:], alt))
- s := string(r)
- buf[buf[0]] ^= s[len(s)-1]
- var ok bool
- if idx, ok = mapping[buf]; !ok {
- idx = len(mapping)
- if idx > math.MaxUint8 {
- log.Fatalf("Index %d does not fit in a byte.", idx)
- }
- mapping[buf] = idx
- inverse = append(inverse, buf)
- }
- }
- t.Insert(r, uint64(tag|elem(idx)))
- })
- w := &bytes.Buffer{}
- gen.WriteUnicodeVersion(w)
- sz, err := t.Gen(w)
- if err != nil {
- log.Fatal(err)
- }
- sz += writeMappings(w, inverse)
- fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024)
- gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes())
- }
// inverseDataComment is the explanatory comment that writeMappings emits
// directly above the generated inverseData table. It documents the 4-byte
// entry layout and the xor trick used to share entries between mappings.
//
// The example mappings use the fullwidth forms U+FF21 and U+FF22 on the
// left-hand side, matching the code points quoted next to them.
const inverseDataComment = `
// inverseData contains 4-byte entries of the following format:
// <length> <modified UTF-8-encoded rune> <0 padding>
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
// Ａ -> A (U+FF21 -> U+0041)
// Ｂ -> B (U+FF22 -> U+0042)
// ...
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
// { 0x01, 0xE0, 0x00, 0x00 }
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
// E0 ^ A1 = 41.
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
// E0 ^ A2 = 42.
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.`
- func writeMappings(w io.Writer, data [][4]byte) int {
- fmt.Fprintln(w, inverseDataComment)
- fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
- for _, x := range data {
- fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3])
- }
- fmt.Fprintln(w, "}")
- return len(data) * 4
- }
- func genTests() {
- w := &bytes.Buffer{}
- fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n")
- getWidthData(func(r rune, tag elem, alt rune) {
- if alt != 0 {
- fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag)
- }
- })
- fmt.Fprintln(w, "}")
- gen.WriteGoFile("runes_test.go", "width", w.Bytes())
- }
|