colcmp.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package main // import "golang.org/x/text/collate/tools/colcmp"
  5. import (
  6. "bytes"
  7. "flag"
  8. "fmt"
  9. "io"
  10. "log"
  11. "os"
  12. "runtime/pprof"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "text/template"
  17. "time"
  18. "golang.org/x/text/unicode/norm"
  19. )
  20. var (
  21. doNorm = flag.Bool("norm", false, "normalize input strings")
  22. cases = flag.Bool("case", false, "generate case variants")
  23. verbose = flag.Bool("verbose", false, "print results")
  24. debug = flag.Bool("debug", false, "output debug information")
  25. locales = flag.String("locale", "en_US", "the locale to use. May be a comma-separated list for some commands.")
  26. col = flag.String("col", "go", "collator to test")
  27. gold = flag.String("gold", "go", "collator used as the gold standard")
  28. usecmp = flag.Bool("usecmp", false,
  29. `use comparison instead of sort keys when sorting. Must be "test", "gold" or "both"`)
  30. cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
  31. exclude = flag.String("exclude", "", "exclude errors that contain any of the characters")
  32. limit = flag.Int("limit", 5000000, "maximum number of samples to generate for one run")
  33. )
  34. func failOnError(err error) {
  35. if err != nil {
  36. log.Panic(err)
  37. }
  38. }
  39. // Test holds test data for testing a locale-collator pair.
  40. // Test also provides functionality that is commonly used by the various commands.
  41. type Test struct {
  42. ctxt *Context
  43. Name string
  44. Locale string
  45. ColName string
  46. Col Collator
  47. UseCompare bool
  48. Input []Input
  49. Duration time.Duration
  50. start time.Time
  51. msg string
  52. count int
  53. }
  54. func (t *Test) clear() {
  55. t.Col = nil
  56. t.Input = nil
  57. }
  58. const (
  59. msgGeneratingInput = "generating input"
  60. msgGeneratingKeys = "generating keys"
  61. msgSorting = "sorting"
  62. )
  63. var lastLen = 0
  64. func (t *Test) SetStatus(msg string) {
  65. if *debug || *verbose {
  66. fmt.Printf("%s: %s...\n", t.Name, msg)
  67. } else if t.ctxt.out != nil {
  68. fmt.Fprint(t.ctxt.out, strings.Repeat(" ", lastLen))
  69. fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
  70. fmt.Fprint(t.ctxt.out, msg, "...")
  71. lastLen = len(msg) + 3
  72. fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
  73. }
  74. }
  75. // Start is used by commands to signal the start of an operation.
  76. func (t *Test) Start(msg string) {
  77. t.SetStatus(msg)
  78. t.count = 0
  79. t.msg = msg
  80. t.start = time.Now()
  81. }
  82. // Stop is used by commands to signal the end of an operation.
  83. func (t *Test) Stop() (time.Duration, int) {
  84. d := time.Now().Sub(t.start)
  85. t.Duration += d
  86. if *debug || *verbose {
  87. fmt.Printf("%s: %s done. (%.3fs /%dK ops)\n", t.Name, t.msg, d.Seconds(), t.count/1000)
  88. }
  89. return d, t.count
  90. }
  91. // generateKeys generates sort keys for all the inputs.
  92. func (t *Test) generateKeys() {
  93. for i, s := range t.Input {
  94. b := t.Col.Key(s)
  95. t.Input[i].key = b
  96. if *debug {
  97. fmt.Printf("%s (%X): %X\n", string(s.UTF8), s.UTF16, b)
  98. }
  99. }
  100. }
  101. // Sort sorts the inputs. It generates sort keys if this is required by the
  102. // chosen sort method.
  103. func (t *Test) Sort() (tkey, tsort time.Duration, nkey, nsort int) {
  104. if *cpuprofile != "" {
  105. f, err := os.Create(*cpuprofile)
  106. failOnError(err)
  107. pprof.StartCPUProfile(f)
  108. defer pprof.StopCPUProfile()
  109. }
  110. if t.UseCompare || t.Col.Key(t.Input[0]) == nil {
  111. t.Start(msgSorting)
  112. sort.Sort(&testCompare{*t})
  113. tsort, nsort = t.Stop()
  114. } else {
  115. t.Start(msgGeneratingKeys)
  116. t.generateKeys()
  117. t.count = len(t.Input)
  118. tkey, nkey = t.Stop()
  119. t.Start(msgSorting)
  120. sort.Sort(t)
  121. tsort, nsort = t.Stop()
  122. }
  123. return
  124. }
  125. func (t *Test) Swap(a, b int) {
  126. t.Input[a], t.Input[b] = t.Input[b], t.Input[a]
  127. }
  128. func (t *Test) Less(a, b int) bool {
  129. t.count++
  130. return bytes.Compare(t.Input[a].key, t.Input[b].key) == -1
  131. }
  132. func (t Test) Len() int {
  133. return len(t.Input)
  134. }
  135. type testCompare struct {
  136. Test
  137. }
  138. func (t *testCompare) Less(a, b int) bool {
  139. t.count++
  140. return t.Col.Compare(t.Input[a], t.Input[b]) == -1
  141. }
  142. type testRestore struct {
  143. Test
  144. }
  145. func (t *testRestore) Less(a, b int) bool {
  146. return t.Input[a].index < t.Input[b].index
  147. }
  148. // GenerateInput generates input phrases for the locale tested by t.
  149. func (t *Test) GenerateInput() {
  150. t.Input = nil
  151. if t.ctxt.lastLocale != t.Locale {
  152. gen := phraseGenerator{}
  153. gen.init(t.Locale)
  154. t.SetStatus(msgGeneratingInput)
  155. t.ctxt.lastInput = nil // allow the previous value to be garbage collected.
  156. t.Input = gen.generate(*doNorm)
  157. t.ctxt.lastInput = t.Input
  158. t.ctxt.lastLocale = t.Locale
  159. } else {
  160. t.Input = t.ctxt.lastInput
  161. for i := range t.Input {
  162. t.Input[i].key = nil
  163. }
  164. sort.Sort(&testRestore{*t})
  165. }
  166. }
  167. // Context holds all tests and settings translated from command line options.
  168. type Context struct {
  169. test []*Test
  170. last *Test
  171. lastLocale string
  172. lastInput []Input
  173. out io.Writer
  174. }
  175. func (ts *Context) Printf(format string, a ...interface{}) {
  176. ts.assertBuf()
  177. fmt.Fprintf(ts.out, format, a...)
  178. }
  179. func (ts *Context) Print(a ...interface{}) {
  180. ts.assertBuf()
  181. fmt.Fprint(ts.out, a...)
  182. }
  183. // assertBuf sets up an io.Writer for output, if it doesn't already exist.
  184. // In debug and verbose mode, output is buffered so that the regular output
  185. // will not interfere with the additional output. Otherwise, output is
  186. // written directly to stdout for a more responsive feel.
  187. func (ts *Context) assertBuf() {
  188. if ts.out != nil {
  189. return
  190. }
  191. if *debug || *verbose {
  192. ts.out = &bytes.Buffer{}
  193. } else {
  194. ts.out = os.Stdout
  195. }
  196. }
  197. // flush flushes the contents of ts.out to stdout, if it is not stdout already.
  198. func (ts *Context) flush() {
  199. if ts.out != nil {
  200. if _, ok := ts.out.(io.ReadCloser); !ok {
  201. io.Copy(os.Stdout, ts.out.(io.Reader))
  202. }
  203. }
  204. }
  205. // parseTests creates all tests from command lines and returns
  206. // a Context to hold them.
  207. func parseTests() *Context {
  208. ctxt := &Context{}
  209. colls := strings.Split(*col, ",")
  210. for _, loc := range strings.Split(*locales, ",") {
  211. loc = strings.TrimSpace(loc)
  212. for _, name := range colls {
  213. name = strings.TrimSpace(name)
  214. col := getCollator(name, loc)
  215. ctxt.test = append(ctxt.test, &Test{
  216. ctxt: ctxt,
  217. Locale: loc,
  218. ColName: name,
  219. UseCompare: *usecmp,
  220. Col: col,
  221. })
  222. }
  223. }
  224. return ctxt
  225. }
  226. func (c *Context) Len() int {
  227. return len(c.test)
  228. }
  229. func (c *Context) Test(i int) *Test {
  230. if c.last != nil {
  231. c.last.clear()
  232. }
  233. c.last = c.test[i]
  234. return c.last
  235. }
  236. func parseInput(args []string) []Input {
  237. input := []Input{}
  238. for _, s := range args {
  239. rs := []rune{}
  240. for len(s) > 0 {
  241. var r rune
  242. r, _, s, _ = strconv.UnquoteChar(s, '\'')
  243. rs = append(rs, r)
  244. }
  245. s = string(rs)
  246. if *doNorm {
  247. s = norm.NFD.String(s)
  248. }
  249. input = append(input, makeInputString(s))
  250. }
  251. return input
  252. }
  253. // A Command is an implementation of a colcmp command.
  254. type Command struct {
  255. Run func(cmd *Context, args []string)
  256. Usage string
  257. Short string
  258. Long string
  259. }
  260. func (cmd Command) Name() string {
  261. return strings.SplitN(cmd.Usage, " ", 2)[0]
  262. }
  263. var commands = []*Command{
  264. cmdSort,
  265. cmdBench,
  266. cmdRegress,
  267. }
  268. const sortHelp = `
  269. Sort sorts a given list of strings. Strings are separated by whitespace.
  270. `
  271. var cmdSort = &Command{
  272. Run: runSort,
  273. Usage: "sort <string>*",
  274. Short: "sort a given list of strings",
  275. Long: sortHelp,
  276. }
  277. func runSort(ctxt *Context, args []string) {
  278. input := parseInput(args)
  279. if len(input) == 0 {
  280. log.Fatalf("Nothing to sort.")
  281. }
  282. if ctxt.Len() > 1 {
  283. ctxt.Print("COLL LOCALE RESULT\n")
  284. }
  285. for i := 0; i < ctxt.Len(); i++ {
  286. t := ctxt.Test(i)
  287. t.Input = append(t.Input, input...)
  288. t.Sort()
  289. if ctxt.Len() > 1 {
  290. ctxt.Printf("%-5s %-5s ", t.ColName, t.Locale)
  291. }
  292. for _, s := range t.Input {
  293. ctxt.Print(string(s.UTF8), " ")
  294. }
  295. ctxt.Print("\n")
  296. }
  297. }
  298. const benchHelp = `
  299. Bench runs a benchmark for the given list of collator implementations.
  300. If no collator implementations are given, the go collator will be used.
  301. `
  302. var cmdBench = &Command{
  303. Run: runBench,
  304. Usage: "bench",
  305. Short: "benchmark a given list of collator implementations",
  306. Long: benchHelp,
  307. }
  308. func runBench(ctxt *Context, args []string) {
  309. ctxt.Printf("%-7s %-5s %-6s %-24s %-24s %-5s %s\n", "LOCALE", "COLL", "N", "KEYS", "SORT", "AVGLN", "TOTAL")
  310. for i := 0; i < ctxt.Len(); i++ {
  311. t := ctxt.Test(i)
  312. ctxt.Printf("%-7s %-5s ", t.Locale, t.ColName)
  313. t.GenerateInput()
  314. ctxt.Printf("%-6s ", fmt.Sprintf("%dK", t.Len()/1000))
  315. tkey, tsort, nkey, nsort := t.Sort()
  316. p := func(dur time.Duration, n int) {
  317. s := ""
  318. if dur > 0 {
  319. s = fmt.Sprintf("%6.3fs ", dur.Seconds())
  320. if n > 0 {
  321. s += fmt.Sprintf("%15s", fmt.Sprintf("(%4.2f ns/op)", float64(dur)/float64(n)))
  322. }
  323. }
  324. ctxt.Printf("%-24s ", s)
  325. }
  326. p(tkey, nkey)
  327. p(tsort, nsort)
  328. total := 0
  329. for _, s := range t.Input {
  330. total += len(s.key)
  331. }
  332. ctxt.Printf("%-5d ", total/t.Len())
  333. ctxt.Printf("%6.3fs\n", t.Duration.Seconds())
  334. if *debug {
  335. for _, s := range t.Input {
  336. fmt.Print(string(s.UTF8), " ")
  337. }
  338. fmt.Println()
  339. }
  340. }
  341. }
  342. const regressHelp = `
  343. Regress runs a monkey test by comparing the results of randomly generated tests
  344. between two implementations of a collator. The user may optionally pass a list
  345. of strings to regress against instead of the default test set.
  346. `
  347. var cmdRegress = &Command{
  348. Run: runRegress,
  349. Usage: "regress -gold=<col> -test=<col> [string]*",
  350. Short: "run a monkey test between two collators",
  351. Long: regressHelp,
  352. }
  353. const failedKeyCompare = `
  354. %s:%d: incorrect comparison result for input:
  355. a: %q (%.4X)
  356. key: %s
  357. b: %q (%.4X)
  358. key: %s
  359. Compare(a, b) = %d; want %d.
  360. gold keys:
  361. a: %s
  362. b: %s
  363. `
  364. const failedCompare = `
  365. %s:%d: incorrect comparison result for input:
  366. a: %q (%.4X)
  367. b: %q (%.4X)
  368. Compare(a, b) = %d; want %d.
  369. `
  370. func keyStr(b []byte) string {
  371. buf := &bytes.Buffer{}
  372. for _, v := range b {
  373. fmt.Fprintf(buf, "%.2X ", v)
  374. }
  375. return buf.String()
  376. }
  377. func runRegress(ctxt *Context, args []string) {
  378. input := parseInput(args)
  379. for i := 0; i < ctxt.Len(); i++ {
  380. t := ctxt.Test(i)
  381. if len(input) > 0 {
  382. t.Input = append(t.Input, input...)
  383. } else {
  384. t.GenerateInput()
  385. }
  386. t.Sort()
  387. count := 0
  388. gold := getCollator(*gold, t.Locale)
  389. for i := 1; i < len(t.Input); i++ {
  390. ia := t.Input[i-1]
  391. ib := t.Input[i]
  392. if bytes.IndexAny(ib.UTF8, *exclude) != -1 {
  393. i++
  394. continue
  395. }
  396. if bytes.IndexAny(ia.UTF8, *exclude) != -1 {
  397. continue
  398. }
  399. goldCmp := gold.Compare(ia, ib)
  400. if cmp := bytes.Compare(ia.key, ib.key); cmp != goldCmp {
  401. count++
  402. a := string(ia.UTF8)
  403. b := string(ib.UTF8)
  404. fmt.Printf(failedKeyCompare, t.Locale, i-1, a, []rune(a), keyStr(ia.key), b, []rune(b), keyStr(ib.key), cmp, goldCmp, keyStr(gold.Key(ia)), keyStr(gold.Key(ib)))
  405. } else if cmp := t.Col.Compare(ia, ib); cmp != goldCmp {
  406. count++
  407. a := string(ia.UTF8)
  408. b := string(ib.UTF8)
  409. fmt.Printf(failedCompare, t.Locale, i-1, a, []rune(a), b, []rune(b), cmp, goldCmp)
  410. }
  411. }
  412. if count > 0 {
  413. ctxt.Printf("Found %d inconsistencies in %d entries.\n", count, t.Len()-1)
  414. }
  415. }
  416. }
  417. const helpTemplate = `
  418. colcmp is a tool for testing and benchmarking collation
  419. Usage: colcmp command [arguments]
  420. The commands are:
  421. {{range .}}
  422. {{.Name | printf "%-11s"}} {{.Short}}{{end}}
  423. Use "col help [topic]" for more information about that topic.
  424. `
  425. const detailedHelpTemplate = `
  426. Usage: colcmp {{.Usage}}
  427. {{.Long | trim}}
  428. `
  429. func runHelp(args []string) {
  430. t := template.New("help")
  431. t.Funcs(template.FuncMap{"trim": strings.TrimSpace})
  432. if len(args) < 1 {
  433. template.Must(t.Parse(helpTemplate))
  434. failOnError(t.Execute(os.Stderr, &commands))
  435. } else {
  436. for _, cmd := range commands {
  437. if cmd.Name() == args[0] {
  438. template.Must(t.Parse(detailedHelpTemplate))
  439. failOnError(t.Execute(os.Stderr, cmd))
  440. os.Exit(0)
  441. }
  442. }
  443. log.Fatalf("Unknown command %q. Run 'colcmp help'.", args[0])
  444. }
  445. os.Exit(0)
  446. }
  447. func main() {
  448. flag.Parse()
  449. log.SetFlags(0)
  450. ctxt := parseTests()
  451. if flag.NArg() < 1 {
  452. runHelp(nil)
  453. }
  454. args := flag.Args()[1:]
  455. if flag.Arg(0) == "help" {
  456. runHelp(args)
  457. }
  458. for _, cmd := range commands {
  459. if cmd.Name() == flag.Arg(0) {
  460. cmd.Run(ctxt, args)
  461. ctxt.flush()
  462. return
  463. }
  464. }
  465. runHelp(flag.Args())
  466. }