gen.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package main
  5. import (
  6. "math"
  7. "math/rand"
  8. "strings"
  9. "unicode"
  10. "unicode/utf16"
  11. "unicode/utf8"
  12. "golang.org/x/text/language"
  13. "golang.org/x/text/unicode/norm"
  14. )
  15. // TODO: replace with functionality in language package.
  16. // parent computes the parent language for the given language.
  17. // It returns false if the parent is already root.
  18. func parent(locale string) (parent string, ok bool) {
  19. if locale == "und" {
  20. return "", false
  21. }
  22. if i := strings.LastIndex(locale, "-"); i != -1 {
  23. return locale[:i], true
  24. }
  25. return "und", true
  26. }
  27. // rewriter is used to both unique strings and create variants of strings
  28. // to add to the test set.
  29. type rewriter struct {
  30. seen map[string]bool
  31. addCases bool
  32. }
  33. func newRewriter() *rewriter {
  34. return &rewriter{
  35. seen: make(map[string]bool),
  36. }
  37. }
  38. func (r *rewriter) insert(a []string, s string) []string {
  39. if !r.seen[s] {
  40. r.seen[s] = true
  41. a = append(a, s)
  42. }
  43. return a
  44. }
  45. // rewrite takes a sequence of strings in, adds variants of the these strings
  46. // based on options and removes duplicates.
  47. func (r *rewriter) rewrite(ss []string) []string {
  48. ns := []string{}
  49. for _, s := range ss {
  50. ns = r.insert(ns, s)
  51. if r.addCases {
  52. rs := []rune(s)
  53. rn := rs[0]
  54. for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
  55. rs[0] = c
  56. ns = r.insert(ns, string(rs))
  57. }
  58. }
  59. }
  60. return ns
  61. }
  62. // exemplarySet holds a parsed set of characters from the exemplarCharacters table.
  63. type exemplarySet struct {
  64. typ exemplarType
  65. set []string
  66. charIndex int // cumulative total of phrases, including this set
  67. }
  68. type phraseGenerator struct {
  69. sets [exN]exemplarySet
  70. n int
  71. }
  72. func (g *phraseGenerator) init(id string) {
  73. ec := exemplarCharacters
  74. loc := language.Make(id).String()
  75. // get sets for locale or parent locale if the set is not defined.
  76. for i := range g.sets {
  77. for p, ok := loc, true; ok; p, ok = parent(p) {
  78. if set, ok := ec[p]; ok && set[i] != "" {
  79. g.sets[i].set = strings.Split(set[i], " ")
  80. break
  81. }
  82. }
  83. }
  84. r := newRewriter()
  85. r.addCases = *cases
  86. for i := range g.sets {
  87. g.sets[i].set = r.rewrite(g.sets[i].set)
  88. }
  89. // compute indexes
  90. for i, set := range g.sets {
  91. g.n += len(set.set)
  92. g.sets[i].charIndex = g.n
  93. }
  94. }
  95. // phrase returns the ith phrase, where i < g.n.
  96. func (g *phraseGenerator) phrase(i int) string {
  97. for _, set := range g.sets {
  98. if i < set.charIndex {
  99. return set.set[i-(set.charIndex-len(set.set))]
  100. }
  101. }
  102. panic("index out of range")
  103. }
  104. // generate generates inputs by combining all pairs of examplar strings.
  105. // If doNorm is true, all input strings are normalized to NFC.
  106. // TODO: allow other variations, statistical models, and random
  107. // trailing sequences.
  108. func (g *phraseGenerator) generate(doNorm bool) []Input {
  109. const (
  110. M = 1024 * 1024
  111. buf8Size = 30 * M
  112. buf16Size = 10 * M
  113. )
  114. // TODO: use a better way to limit the input size.
  115. if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
  116. g.n = sq
  117. }
  118. size := g.n * g.n
  119. a := make([]Input, 0, size)
  120. buf8 := make([]byte, 0, buf8Size)
  121. buf16 := make([]uint16, 0, buf16Size)
  122. addInput := func(str string) {
  123. buf8 = buf8[len(buf8):]
  124. buf16 = buf16[len(buf16):]
  125. if len(str) > cap(buf8) {
  126. buf8 = make([]byte, 0, buf8Size)
  127. }
  128. if len(str) > cap(buf16) {
  129. buf16 = make([]uint16, 0, buf16Size)
  130. }
  131. if doNorm {
  132. buf8 = norm.NFD.AppendString(buf8, str)
  133. } else {
  134. buf8 = append(buf8, str...)
  135. }
  136. buf16 = appendUTF16(buf16, buf8)
  137. a = append(a, makeInput(buf8, buf16))
  138. }
  139. for i := 0; i < g.n; i++ {
  140. p1 := g.phrase(i)
  141. addInput(p1)
  142. for j := 0; j < g.n; j++ {
  143. p2 := g.phrase(j)
  144. addInput(p1 + p2)
  145. }
  146. }
  147. // permutate
  148. rnd := rand.New(rand.NewSource(int64(rand.Int())))
  149. for i := range a {
  150. j := i + rnd.Intn(len(a)-i)
  151. a[i], a[j] = a[j], a[i]
  152. a[i].index = i // allow restoring this order if input is used multiple times.
  153. }
  154. return a
  155. }
  156. func appendUTF16(buf []uint16, s []byte) []uint16 {
  157. for len(s) > 0 {
  158. r, sz := utf8.DecodeRune(s)
  159. s = s[sz:]
  160. r1, r2 := utf16.EncodeRune(r)
  161. if r1 != 0xFFFD {
  162. buf = append(buf, uint16(r1), uint16(r2))
  163. } else {
  164. buf = append(buf, uint16(r))
  165. }
  166. }
  167. return buf
  168. }