123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183 |
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package main
- import (
- "math"
- "math/rand"
- "strings"
- "unicode"
- "unicode/utf16"
- "unicode/utf8"
- "golang.org/x/text/language"
- "golang.org/x/text/unicode/norm"
- )
// TODO: replace with functionality in language package.

// parent computes the parent of the given locale identifier by dropping its
// last "-"-separated subtag. A locale without any "-" has "und" as its
// parent. For "und" itself there is no parent and ok is false.
func parent(locale string) (parent string, ok bool) {
	if locale == "und" {
		return "", false
	}
	cut := strings.LastIndex(locale, "-")
	if cut == -1 {
		return "und", true
	}
	return locale[:cut], true
}
// rewriter is used both to deduplicate strings and to create variants of
// strings to add to the test set.
type rewriter struct {
	seen     map[string]bool // strings already emitted through insert
	addCases bool            // if set, rewrite also emits case variants
}

// newRewriter returns a rewriter with an empty seen set and addCases unset.
func newRewriter() *rewriter {
	return &rewriter{
		seen: make(map[string]bool),
	}
}

// insert appends s to a unless s has already been inserted through r.
// It returns the (possibly extended) slice.
func (r *rewriter) insert(a []string, s string) []string {
	if !r.seen[s] {
		r.seen[s] = true
		a = append(a, s)
	}
	return a
}

// rewrite takes a sequence of strings, adds variants of these strings based
// on the rewriter's options, and removes duplicates. Duplicates are tracked
// across calls, so a string returned by an earlier call is not returned
// again. The result is never nil.
//
// If addCases is set, each string is followed by the variants obtained by
// replacing its first rune with every other member of that rune's simple
// case-folding orbit. Empty strings have no first rune and get no variants.
func (r *rewriter) rewrite(ss []string) []string {
	ns := make([]string, 0, len(ss))
	for _, s := range ss {
		ns = r.insert(ns, s)
		if !r.addCases || s == "" {
			// Nothing to fold; indexing the first rune of "" would panic.
			continue
		}
		rs := []rune(s)
		rn := rs[0]
		// SimpleFold cycles through the orbit and eventually returns to rn.
		for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
			rs[0] = c
			ns = r.insert(ns, string(rs))
		}
	}
	return ns
}
- // exemplarySet holds a parsed set of characters from the exemplarCharacters table.
- type exemplarySet struct {
- typ exemplarType
- set []string
- charIndex int // cumulative total of phrases, including this set
- }
- type phraseGenerator struct {
- sets [exN]exemplarySet
- n int
- }
- func (g *phraseGenerator) init(id string) {
- ec := exemplarCharacters
- loc := language.Make(id).String()
- // get sets for locale or parent locale if the set is not defined.
- for i := range g.sets {
- for p, ok := loc, true; ok; p, ok = parent(p) {
- if set, ok := ec[p]; ok && set[i] != "" {
- g.sets[i].set = strings.Split(set[i], " ")
- break
- }
- }
- }
- r := newRewriter()
- r.addCases = *cases
- for i := range g.sets {
- g.sets[i].set = r.rewrite(g.sets[i].set)
- }
- // compute indexes
- for i, set := range g.sets {
- g.n += len(set.set)
- g.sets[i].charIndex = g.n
- }
- }
- // phrase returns the ith phrase, where i < g.n.
- func (g *phraseGenerator) phrase(i int) string {
- for _, set := range g.sets {
- if i < set.charIndex {
- return set.set[i-(set.charIndex-len(set.set))]
- }
- }
- panic("index out of range")
- }
- // generate generates inputs by combining all pairs of examplar strings.
- // If doNorm is true, all input strings are normalized to NFC.
- // TODO: allow other variations, statistical models, and random
- // trailing sequences.
- func (g *phraseGenerator) generate(doNorm bool) []Input {
- const (
- M = 1024 * 1024
- buf8Size = 30 * M
- buf16Size = 10 * M
- )
- // TODO: use a better way to limit the input size.
- if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
- g.n = sq
- }
- size := g.n * g.n
- a := make([]Input, 0, size)
- buf8 := make([]byte, 0, buf8Size)
- buf16 := make([]uint16, 0, buf16Size)
- addInput := func(str string) {
- buf8 = buf8[len(buf8):]
- buf16 = buf16[len(buf16):]
- if len(str) > cap(buf8) {
- buf8 = make([]byte, 0, buf8Size)
- }
- if len(str) > cap(buf16) {
- buf16 = make([]uint16, 0, buf16Size)
- }
- if doNorm {
- buf8 = norm.NFD.AppendString(buf8, str)
- } else {
- buf8 = append(buf8, str...)
- }
- buf16 = appendUTF16(buf16, buf8)
- a = append(a, makeInput(buf8, buf16))
- }
- for i := 0; i < g.n; i++ {
- p1 := g.phrase(i)
- addInput(p1)
- for j := 0; j < g.n; j++ {
- p2 := g.phrase(j)
- addInput(p1 + p2)
- }
- }
- // permutate
- rnd := rand.New(rand.NewSource(int64(rand.Int())))
- for i := range a {
- j := i + rnd.Intn(len(a)-i)
- a[i], a[j] = a[j], a[i]
- a[i].index = i // allow restoring this order if input is used multiple times.
- }
- return a
- }
// appendUTF16 decodes the UTF-8 bytes in s and appends their UTF-16 encoding
// to buf, returning the extended slice. Runes that need a surrogate pair are
// appended as two code units; all other runes, including the replacement
// character produced for invalid UTF-8, are appended as a single unit.
func appendUTF16(buf []uint16, s []byte) []uint16 {
	for pos := 0; pos < len(s); {
		r, width := utf8.DecodeRune(s[pos:])
		pos += width
		hi, lo := utf16.EncodeRune(r)
		if hi == 0xFFFD {
			// EncodeRune reports U+FFFD when r has no surrogate pair.
			buf = append(buf, uint16(r))
			continue
		}
		buf = append(buf, uint16(hi), uint16(lo))
	}
	return buf
}
|