libgo/go/exp/locale/collate/tools/colcmp/colcmp.go

   1 // Copyright 2012 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package main
   6
   7 import (
   8         "bytes"
   9         "exp/norm"
  10         "flag"
  11         "fmt"
  12         "io"
  13         "log"
  14         "os"
  15         "runtime/pprof"
  16         "sort"
  17         "strconv"
  18         "strings"
  19         "text/template"
  20         "time"
  21 )
  22
  23 var (
  24         doNorm  = flag.Bool("norm", false, "normalize input strings")
  25         cases   = flag.Bool("case", false, "generate case variants")
  26         verbose = flag.Bool("verbose", false, "print results")
  27         debug   = flag.Bool("debug", false, "output debug information")
  28         locale  = flag.String("locale", "en_US", "the locale to use. May be a comma-separated list for some commands.")
  29         col     = flag.String("col", "go", "collator to test")
  30         gold    = flag.String("gold", "go", "collator used as the gold standard")
  31         usecmp  = flag.Bool("usecmp", false,
  32                 `use comparison instead of sort keys when sorting.  Must be "test", "gold" or "both"`)
  33         cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
  34         exclude    = flag.String("exclude", "", "exclude errors that contain any of the characters")
  35         limit      = flag.Int("limit", 5000000, "maximum number of samples to generate for one run")
  36 )
  37
  38 func failOnError(err error) {
  39         if err != nil {
  40                 log.Panic(err)
  41         }
  42 }
  43
// Test holds test data for testing a locale-collator pair.
// Test also provides functionality that is commonly used by the various commands.
type Test struct {
	ctxt    *Context // owning context; provides the status output writer and the input cache
	Name    string   // label printed in debug/verbose status lines
	Locale  string   // locale this test runs against
	ColName string   // name of the collator under test

	Col        Collator // collator used to generate keys and compare inputs
	UseCompare bool     // sort with Col.Compare instead of sort keys

	Input    []Input       // strings to sort
	Duration time.Duration // total time accumulated by Stop over all operations

	start time.Time // time of the most recent Start
	msg   string    // description of the operation passed to the most recent Start
	count int       // operation counter for the current Start/Stop interval
}
  62
  63 func (t *Test) clear() {
  64         t.Col = nil
  65         t.Input = nil
  66 }
  67
// Status messages passed to SetStatus and Start.
const (
	msgGeneratingInput = "generating input"
	msgGeneratingKeys  = "generating keys"
	msgSorting         = "sorting"
)

// lastLen is the length of the status text most recently written by
// SetStatus in non-verbose mode; it is used to blank out that text before
// the next status is printed.
var lastLen = 0
  75
  76 func (t *Test) SetStatus(msg string) {
  77         if *debug || *verbose {
  78                 fmt.Printf("%s: %s...\n", t.Name, msg)
  79         } else if t.ctxt.out != nil {
  80                 fmt.Fprint(t.ctxt.out, strings.Repeat(" ", lastLen))
  81                 fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
  82                 fmt.Fprint(t.ctxt.out, msg, "...")
  83                 lastLen = len(msg) + 3
  84                 fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
  85         }
  86 }
  87
  88 // Start is used by commands to signal the start of an operation.
  89 func (t *Test) Start(msg string) {
  90         t.SetStatus(msg)
  91         t.count = 0
  92         t.msg = msg
  93         t.start = time.Now()
  94 }
  95
  96 // Stop is used by commands to signal the end of an operation.
  97 func (t *Test) Stop() (time.Duration, int) {
  98         d := time.Now().Sub(t.start)
  99         t.Duration += d
 100         if *debug || *verbose {
 101                 fmt.Printf("%s: %s done. (%.3fs /%dK ops)\n", t.Name, t.msg, d.Seconds(), t.count/1000)
 102         }
 103         return d, t.count
 104 }
 105
 106 // generateKeys generates sort keys for all the inputs.
 107 func (t *Test) generateKeys() {
 108         for i, s := range t.Input {
 109                 b := t.Col.Key(s)
 110                 t.Input[i].key = b
 111                 if *debug {
 112                         fmt.Printf("%s (%X): %X\n", string(s.UTF8), s.UTF16, b)
 113                 }
 114         }
 115 }
 116
 117 // Sort sorts the inputs. It generates sort keys if this is required by the
 118 // chosen sort method.
 119 func (t *Test) Sort() (tkey, tsort time.Duration, nkey, nsort int) {
 120         if *cpuprofile != "" {
 121                 f, err := os.Create(*cpuprofile)
 122                 failOnError(err)
 123                 pprof.StartCPUProfile(f)
 124                 defer pprof.StopCPUProfile()
 125         }
 126         if t.UseCompare || t.Col.Key(t.Input[0]) == nil {
 127                 t.Start(msgSorting)
 128                 sort.Sort(&testCompare{*t})
 129                 tsort, nsort = t.Stop()
 130         } else {
 131                 t.Start(msgGeneratingKeys)
 132                 t.generateKeys()
 133                 t.count = len(t.Input)
 134                 tkey, nkey = t.Stop()
 135                 t.Start(msgSorting)
 136                 sort.Sort(t)
 137                 tsort, nsort = t.Stop()
 138         }
 139         return
 140 }
 141
// Swap exchanges the inputs at positions a and b.
func (t *Test) Swap(a, b int) {
	t.Input[a], t.Input[b] = t.Input[b], t.Input[a]
}
 145
 146 func (t *Test) Less(a, b int) bool {
 147         t.count++
 148         return bytes.Compare(t.Input[a].key, t.Input[b].key) == -1
 149 }
 150
// Len returns the number of inputs held by t.
func (t Test) Len() int {
	return len(t.Input)
}
 154
// testCompare wraps a copy of a Test and overrides Less so that sorting uses
// the collator's Compare method instead of sort keys.
type testCompare struct {
	Test
}

// Less reports whether Col.Compare orders input a before input b (a result
// of -1). Every call increments the operation counter of the embedded Test.
func (t *testCompare) Less(a, b int) bool {
	t.count++
	return t.Col.Compare(t.Input[a], t.Input[b]) == -1
}
 163
// testRestore wraps a copy of a Test and overrides Less so that sorting
// orders the inputs by their index field.
// NOTE(review): index is presumably the position assigned when the input was
// generated, which would make this restore the original order — confirm
// against the Input definition.
type testRestore struct {
	Test
}

// Less reports whether input a has a smaller index than input b.
func (t *testRestore) Less(a, b int) bool {
	return t.Input[a].index < t.Input[b].index
}
 171
 172 // GenerateInput generates input phrases for the locale tested by t.
 173 func (t *Test) GenerateInput() {
 174         t.Input = nil
 175         if t.ctxt.lastLocale != t.Locale {
 176                 gen := phraseGenerator{}
 177                 gen.init(t.Locale)
 178                 t.SetStatus(msgGeneratingInput)
 179                 t.ctxt.lastInput = nil // allow the previous value to be garbage collected.
 180                 t.Input = gen.generate(*doNorm)
 181                 t.ctxt.lastInput = t.Input
 182                 t.ctxt.lastLocale = t.Locale
 183         } else {
 184                 t.Input = t.ctxt.lastInput
 185                 for i := range t.Input {
 186                         t.Input[i].key = nil
 187                 }
 188                 sort.Sort(&testRestore{*t})
 189         }
 190 }
 191
// Context holds all tests and settings translated from command line options.
type Context struct {
	test []*Test // all tests, one per locale/collator pair
	last *Test   // test most recently handed out by Test; cleared on the next call

	lastLocale string  // locale for which lastInput was generated
	lastInput  []Input // input cached by the most recent GenerateInput

	out io.Writer // destination for Print/Printf; set lazily by assertBuf
}
 202
// Printf formats according to format and writes to the context's output
// writer, creating the writer first if necessary.
func (ts *Context) Printf(format string, a ...interface{}) {
	ts.assertBuf()
	fmt.Fprintf(ts.out, format, a...)
}
 207
// Print writes its operands to the context's output writer using fmt.Fprint,
// creating the writer first if necessary.
func (ts *Context) Print(a ...interface{}) {
	ts.assertBuf()
	fmt.Fprint(ts.out, a...)
}
 212
 213 // assertBuf sets up an io.Writer for ouput, if it doesn't already exist.
 214 // In debug and verbose mode, output is buffered so that the regular output
 215 // will not interfere with the additional output.  Otherwise, output is
 216 // written directly to stdout for a more responsive feel.
 217 func (ts *Context) assertBuf() {
 218         if ts.out != nil {
 219                 return
 220         }
 221         if *debug || *verbose {
 222                 ts.out = &bytes.Buffer{}
 223         } else {
 224                 ts.out = os.Stdout
 225         }
 226 }
 227
 228 // flush flushes the contents of ts.out to stdout, if it is not stdout already.
 229 func (ts *Context) flush() {
 230         if ts.out != nil {
 231                 if _, ok := ts.out.(io.ReadCloser); !ok {
 232                         io.Copy(os.Stdout, ts.out.(io.Reader))
 233                 }
 234         }
 235 }
 236
 237 // parseTests creates all tests from command lines and returns
 238 // a Context to hold them.
 239 func parseTests() *Context {
 240         ctxt := &Context{}
 241         colls := strings.Split(*col, ",")
 242         for _, loc := range strings.Split(*locale, ",") {
 243                 loc = strings.TrimSpace(loc)
 244                 for _, name := range colls {
 245                         name = strings.TrimSpace(name)
 246                         col := getCollator(name, loc)
 247                         ctxt.test = append(ctxt.test, &Test{
 248                                 ctxt:       ctxt,
 249                                 Locale:     loc,
 250                                 ColName:    name,
 251                                 UseCompare: *usecmp,
 252                                 Col:        col,
 253                         })
 254                 }
 255         }
 256         return ctxt
 257 }
 258
// Len returns the number of tests held by c.
func (c *Context) Len() int {
	return len(c.test)
}
 262
 263 func (c *Context) Test(i int) *Test {
 264         if c.last != nil {
 265                 c.last.clear()
 266         }
 267         c.last = c.test[i]
 268         return c.last
 269 }
 270
 271 func parseInput(args []string) []Input {
 272         input := []Input{}
 273         for _, s := range args {
 274                 rs := []rune{}
 275                 for len(s) > 0 {
 276                         var r rune
 277                         r, _, s, _ = strconv.UnquoteChar(s, '\'')
 278                         rs = append(rs, r)
 279                 }
 280                 s = string(rs)
 281                 if *doNorm {
 282                         s = norm.NFC.String(s)
 283                 }
 284                 input = append(input, makeInputString(s))
 285         }
 286         return input
 287 }
 288
// A Command is an implementation of a colcmp command.
type Command struct {
	Run   func(cmd *Context, args []string) // executes the command with the parsed tests and remaining arguments
	Usage string                            // usage line; its first word is the command name
	Short string                            // one-line description shown in the command list
	Long  string                            // detailed help shown by "help <command>"
}
 296
 297 func (cmd Command) Name() string {
 298         return strings.SplitN(cmd.Usage, " ", 2)[0]
 299 }
 300
// commands lists every available command; main and runHelp look commands up
// in it by name.
var commands = []*Command{
	cmdSort,
	cmdBench,
	cmdRegress,
}
 306
// sortHelp is the detailed help text of the sort command.
const sortHelp = `
Sort sorts a given list of strings.  Strings are separated by whitespace.
`

// cmdSort is the "sort" command, implemented by runSort.
var cmdSort = &Command{
	Run:   runSort,
	Usage: "sort <string>*",
	Short: "sort a given list of strings",
	Long:  sortHelp,
}
 317
 318 func runSort(ctxt *Context, args []string) {
 319         input := parseInput(args)
 320         if len(input) == 0 {
 321                 log.Fatalf("Nothing to sort.")
 322         }
 323         if ctxt.Len() > 1 {
 324                 ctxt.Print("COLL  LOCALE RESULT\n")
 325         }
 326         for i := 0; i < ctxt.Len(); i++ {
 327                 t := ctxt.Test(i)
 328                 t.Input = append(t.Input, input...)
 329                 t.Sort()
 330                 if ctxt.Len() > 1 {
 331                         ctxt.Printf("%-5s %-5s  ", t.ColName, t.Locale)
 332                 }
 333                 for _, s := range t.Input {
 334                         ctxt.Print(string(s.UTF8), " ")
 335                 }
 336                 ctxt.Print("\n")
 337         }
 338 }
 339
// benchHelp is the detailed help text of the bench command.
const benchHelp = `
Bench runs a benchmark for the given list of collator implementations.
If no collator implementations are given, the go collator will be used.
`

// cmdBench is the "bench" command, implemented by runBench.
var cmdBench = &Command{
	Run:   runBench,
	Usage: "bench",
	Short: "benchmark a given list of collator implementations",
	Long:  benchHelp,
}
 351
 352 func runBench(ctxt *Context, args []string) {
 353         ctxt.Printf("%-7s %-5s %-6s %-24s %-24s %-5s %s\n", "LOCALE", "COLL", "N", "KEYS", "SORT", "AVGLN", "TOTAL")
 354         for i := 0; i < ctxt.Len(); i++ {
 355                 t := ctxt.Test(i)
 356                 ctxt.Printf("%-7s %-5s ", t.Locale, t.ColName)
 357                 t.GenerateInput()
 358                 ctxt.Printf("%-6s ", fmt.Sprintf("%dK", t.Len()/1000))
 359                 tkey, tsort, nkey, nsort := t.Sort()
 360                 p := func(dur time.Duration, n int) {
 361                         s := ""
 362                         if dur > 0 {
 363                                 s = fmt.Sprintf("%6.3fs ", dur.Seconds())
 364                                 if n > 0 {
 365                                         s += fmt.Sprintf("%15s", fmt.Sprintf("(%4.2f ns/op)", float64(dur)/float64(n)))
 366                                 }
 367                         }
 368                         ctxt.Printf("%-24s ", s)
 369                 }
 370                 p(tkey, nkey)
 371                 p(tsort, nsort)
 372
 373                 total := 0
 374                 for _, s := range t.Input {
 375                         total += len(s.key)
 376                 }
 377                 ctxt.Printf("%-5d ", total/t.Len())
 378                 ctxt.Printf("%6.3fs\n", t.Duration.Seconds())
 379                 if *debug {
 380                         for _, s := range t.Input {
 381                                 fmt.Print(string(s.UTF8), " ")
 382                         }
 383                         fmt.Println()
 384                 }
 385         }
 386 }
 387
 388 const regressHelp = `
 389 Regress runs a monkey test by comparing the results of randomly generated tests
 390 between two implementations of a collator. The user may optionally pass a list
 391 of strings to regress against instead of the default test set.
 392 `
 393
 394 var cmdRegress = &Command{
 395         Run:   runRegress,
 396         Usage: "regress -gold=<col> -test=<col> [string]*",
 397         Short: "run a monkey test between two collators",
 398         Long:  regressHelp,
 399 }
 400
// failedKeyCompare is the report format used by runRegress when the bytewise
// comparison of two sort keys disagrees with the gold collator. Its verbs
// take, in order: locale, position, input a and its runes, a's key, input b
// and its runes, b's key, the result obtained, the result wanted, and the
// gold keys of a and b.
const failedKeyCompare = `
%s:%d: incorrect comparison result for input:
    a:   %q (%.4X)
    key: %s
    b:   %q (%.4X)
    key: %s
    Compare(a, b) = %d; want %d.

  gold keys:
        a:   %s
        b:   %s
`

// failedCompare is the report format used by runRegress when the tested
// collator's Compare disagrees with the gold collator. Its verbs take, in
// order: locale, position, input a and its runes, input b and its runes, the
// result obtained and the result wanted.
const failedCompare = `
%s:%d: incorrect comparison result for input:
    a:   %q (%.4X)
    b:   %q (%.4X)
    Compare(a, b) = %d; want %d.
`
 420
 421 func keyStr(b []byte) string {
 422         buf := &bytes.Buffer{}
 423         for _, v := range b {
 424                 fmt.Fprintf(buf, "%.2X ", v)
 425         }
 426         return buf.String()
 427 }
 428
// runRegress checks, for every test in ctxt, that the order produced by the
// tested collator agrees with the gold collator. The inputs are the strings
// in args, or generated input when args is empty. After sorting, each pair
// of neighbouring inputs is compared both by sort key and by Col.Compare,
// and every disagreement with the gold collator's Compare is printed to
// stdout. A summary line is written when any inconsistency was found.
func runRegress(ctxt *Context, args []string) {
	input := parseInput(args)
	for i := 0; i < ctxt.Len(); i++ {
		t := ctxt.Test(i)
		if len(input) > 0 {
			t.Input = append(t.Input, input...)
		} else {
			t.GenerateInput()
		}
		t.Sort()
		count := 0
		// The local gold shadows the flag of the same name from here on;
		// the flag is read on the right-hand side before the shadowing
		// takes effect.
		gold := getCollator(*gold, t.Locale)
		// This inner i shadows the outer test index.
		for i := 1; i < len(t.Input); i++ {
			ia := t.Input[i-1]
			ib := t.Input[i]
			if bytes.IndexAny(ib.UTF8, *exclude) != -1 {
				// ib is excluded. It would be the left-hand element of
				// the next pair as well, so skip that pair too.
				i++
				continue
			}
			if bytes.IndexAny(ia.UTF8, *exclude) != -1 {
				continue
			}
			goldCmp := gold.Compare(ia, ib)
			// NOTE(review): Sort only generates keys on its key-based
			// path; on the compare path the keys here are presumably
			// still nil, so this check would compare empty keys — confirm.
			if cmp := bytes.Compare(ia.key, ib.key); cmp != goldCmp {
				count++
				a := string(ia.UTF8)
				b := string(ib.UTF8)
				fmt.Printf(failedKeyCompare, t.Locale, i-1, a, []rune(a), keyStr(ia.key), b, []rune(b), keyStr(ib.key), cmp, goldCmp, keyStr(gold.Key(ia)), keyStr(gold.Key(ib)))
			} else if cmp := t.Col.Compare(ia, ib); cmp != goldCmp {
				count++
				a := string(ia.UTF8)
				b := string(ib.UTF8)
				fmt.Printf(failedCompare, t.Locale, i-1, a, []rune(a), b, []rune(b), cmp, goldCmp)
			}
		}
		if count > 0 {
			ctxt.Printf("Found %d inconsistencies in %d entries.\n", count, t.Len()-1)
		}
	}
}
 469
 470 const helpTemplate = `
 471 colcmp is a tool for testing and benchmarking collation
 472
 473 Usage: colcmp command [arguments]
 474
 475 The commands are:
 476 {{range .}}
 477     {{.Name | printf "%-11s"}} {{.Short}}{{end}}
 478
 479 Use "col help [topic]" for more information about that topic.
 480 `
 481
// detailedHelpTemplate is the text/template source of the help message for a
// single command. It is executed with a *Command and relies on a "trim"
// template function, which runHelp registers.
const detailedHelpTemplate = `
Usage: colcmp {{.Usage}}

{{.Long | trim}}
`
 487
 488 func runHelp(args []string) {
 489         t := template.New("help")
 490         t.Funcs(template.FuncMap{"trim": strings.TrimSpace})
 491         if len(args) < 1 {
 492                 template.Must(t.Parse(helpTemplate))
 493                 failOnError(t.Execute(os.Stderr, &commands))
 494         } else {
 495                 for _, cmd := range commands {
 496                         if cmd.Name() == args[0] {
 497                                 template.Must(t.Parse(detailedHelpTemplate))
 498                                 failOnError(t.Execute(os.Stderr, cmd))
 499                                 os.Exit(0)
 500                         }
 501                 }
 502                 log.Fatalf("Unknown command %q. Run 'colcmp help'.", args[0])
 503         }
 504         os.Exit(0)
 505 }
 506
 507 func main() {
 508         flag.Parse()
 509         log.SetFlags(0)
 510
 511         ctxt := parseTests()
 512
 513         if flag.NArg() < 1 {
 514                 runHelp(nil)
 515         }
 516         args := flag.Args()[1:]
 517         if flag.Arg(0) == "help" {
 518                 runHelp(args)
 519         }
 520         for _, cmd := range commands {
 521                 if cmd.Name() == flag.Arg(0) {
 522                         cmd.Run(ctxt, args)
 523                         ctxt.flush()
 524                         return
 525                 }
 526         }
 527         runHelp(flag.Args())
 528 }