gen.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. // This program generates the trie for width operations. The generated table
  7. // includes width category information as well as the normalization mappings.
  8. package main
  9. import (
  10. "bytes"
  11. "fmt"
  12. "io"
  13. "log"
  14. "math"
  15. "unicode/utf8"
  16. "golang.org/x/text/internal/gen"
  17. "golang.org/x/text/internal/triegen"
  18. )
  19. // See gen_common.go for flags.
  20. func main() {
  21. gen.Init()
  22. genTables()
  23. genTests()
  24. gen.Repackage("gen_trieval.go", "trieval.go", "width")
  25. gen.Repackage("gen_common.go", "common_test.go", "width")
  26. }
  27. func genTables() {
  28. t := triegen.NewTrie("width")
  29. // fold and inverse mappings. See mapComment for a description of the format
  30. // of each entry. Add dummy value to make an index of 0 mean no mapping.
  31. inverse := [][4]byte{{}}
  32. mapping := map[[4]byte]int{[4]byte{}: 0}
  33. getWidthData(func(r rune, tag elem, alt rune) {
  34. idx := 0
  35. if alt != 0 {
  36. var buf [4]byte
  37. buf[0] = byte(utf8.EncodeRune(buf[1:], alt))
  38. s := string(r)
  39. buf[buf[0]] ^= s[len(s)-1]
  40. var ok bool
  41. if idx, ok = mapping[buf]; !ok {
  42. idx = len(mapping)
  43. if idx > math.MaxUint8 {
  44. log.Fatalf("Index %d does not fit in a byte.", idx)
  45. }
  46. mapping[buf] = idx
  47. inverse = append(inverse, buf)
  48. }
  49. }
  50. t.Insert(r, uint64(tag|elem(idx)))
  51. })
  52. w := &bytes.Buffer{}
  53. gen.WriteUnicodeVersion(w)
  54. sz, err := t.Gen(w)
  55. if err != nil {
  56. log.Fatal(err)
  57. }
  58. sz += writeMappings(w, inverse)
  59. fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024)
  60. gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes())
  61. }
  62. const inverseDataComment = `
  63. // inverseData contains 4-byte entries of the following format:
  64. // <length> <modified UTF-8-encoded rune> <0 padding>
  65. // The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
  66. // UTF-8 encoding of the original rune. Mappings often have the following
  67. // pattern:
  68. // A -> A (U+FF21 -> U+0041)
  69. // B -> B (U+FF22 -> U+0042)
  70. // ...
  71. // By xor-ing the last byte the same entry can be shared by many mappings. This
  72. // reduces the total number of distinct entries by about two thirds.
  73. // The resulting entry for the aforementioned mappings is
  74. // { 0x01, 0xE0, 0x00, 0x00 }
  75. // Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
  76. // E0 ^ A1 = 41.
  77. // Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
  78. // E0 ^ A2 = 42.
  79. // Note that because of the xor-ing, the byte sequence stored in the entry is
  80. // not valid UTF-8.`
  81. func writeMappings(w io.Writer, data [][4]byte) int {
  82. fmt.Fprintln(w, inverseDataComment)
  83. fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
  84. for _, x := range data {
  85. fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3])
  86. }
  87. fmt.Fprintln(w, "}")
  88. return len(data) * 4
  89. }
  90. func genTests() {
  91. w := &bytes.Buffer{}
  92. fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n")
  93. getWidthData(func(r rune, tag elem, alt rune) {
  94. if alt != 0 {
  95. fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag)
  96. }
  97. })
  98. fmt.Fprintln(w, "}")
  99. gen.WriteGoFile("runes_test.go", "width", w.Bytes())
  100. }