submodule/skia/tools/calmbench/ab.py

   1 #!/usr/bin/python
   2 # encoding: utf-8
   3
   4 # Copyright 2017 Google Inc.
   5 #
   6 # Use of this source code is governed by a BSD-style license that can be found
   7 # in the LICENSE file.
   8 #
   9 # This is an A/B test utility script used by calmbench.py
  10 #
  11 # For each bench, we get a distribution of min_ms measurements from nanobench.
  12 # From that, we try to recover the 1/3 and 2/3 quantiles of the distribution.
  13 # If range (1/3 quantile, 2/3 quantile) is completely disjoint between A and B,
  14 # we report that as a regression.
  15 #
  16 # The more measurements we have for a bench, the more accurate our quantiles
  17 # are. However, taking more measurements is time consuming. Hence we'll prune
  18 # out benches and only take more measurements for benches whose current quantile
  19 # ranges are disjoint.
  20 #
  21 # P.S. The current script is brute forcely translated from a ruby script. So it
  22 # may be ugly...
  23
  24
  25 from __future__ import print_function
  26 import re
  27 import os
  28 import sys
  29 import time
  30 import json
  31 import subprocess
  32 import shlex
  33 import multiprocessing
  34 import traceback
  35 from argparse import ArgumentParser
  36 from multiprocessing import Process
  37 from threading import Thread
  38 from threading import Lock
  39 from pdb import set_trace
  40
  41
  42 HELP = """
  43 \033[31mPlease call calmbench.py to drive this script if you're not doing so.
  44 This script is not supposed to be used by itself. (At least, it's not easy to
  45 use by itself. The calmbench bots may use this script directly.)
  46 \033[0m
  47 """
  48
  49 FACTOR  = 3     # lower/upper quantile factor
  50 DIFF_T  = 0.99  # different enough threshold
  51 TERM    = 10    # terminate after this no. of iterations without suspect changes
  52 MAXTRY  = 30    # max number of nanobench tries to narrow down suspects
  53
  54 UNITS   = "ns µs ms s".split()
  55
  56
  57 timesLock = Lock()
  58 timesA  = {}
  59 timesB  = {}
  60
  61
  62 def parse_args():
  63   parser = ArgumentParser(description=HELP)
  64
  65   parser.add_argument('outdir', type=str, help="output directory")
  66   parser.add_argument('a', type=str, help="name of A")
  67   parser.add_argument('b', type=str, help="name of B")
  68   parser.add_argument('nano_a', type=str, help="path to A's nanobench binary")
  69   parser.add_argument('nano_b', type=str, help="path to B's nanobench binary")
  70   parser.add_argument('arg_a', type=str, help="args for A's nanobench run")
  71   parser.add_argument('arg_b', type=str, help="args for B's nanobench run")
  72   parser.add_argument('repeat', type=int, help="number of initial runs")
  73   parser.add_argument('skip_b', type=str, help=("whether to skip running B"
  74                                                 " ('true' or 'false')"))
  75   parser.add_argument('config', type=str, help="nanobenh config")
  76   parser.add_argument('threads', type=int, help="number of threads to run")
  77   parser.add_argument('noinit', type=str, help=("whether to skip running B"
  78                                                 " ('true' or 'false')"))
  79
  80   parser.add_argument('--concise', dest='concise', action="store_true",
  81       help="If set, no verbose thread info will be printed.")
  82   parser.set_defaults(concise=False)
  83
  84   # Additional args for bots
  85   BHELP = "bot specific options"
  86   parser.add_argument('--githash', type=str, default="", help=BHELP)
  87   parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP)
  88
  89   args = parser.parse_args()
  90   args.skip_b = args.skip_b == "true"
  91   args.noinit = args.noinit == "true"
  92
  93   if args.threads == -1:
  94     args.threads = 1
  95     if args.config in ["8888", "565"]: # multi-thread for CPU only
  96         args.threads = max(1, multiprocessing.cpu_count() / 2)
  97
  98   return args
  99
 100 def append_dict_sorted_array(dict_array, key, value):
 101   if key not in dict_array:
 102     dict_array[key] = []
 103   dict_array[key].append(value)
 104   dict_array[key].sort()
 105
 106
 107 def add_time(args, name, bench, t, unit):
 108   normalized_t = t * 1000 ** UNITS.index(unit);
 109   if name.startswith(args.a):
 110     append_dict_sorted_array(timesA, bench, normalized_t)
 111   else:
 112     append_dict_sorted_array(timesB, bench, normalized_t)
 113
 114
 115 def append_times_from_file(args, name, filename):
 116   with open(filename) as f:
 117     lines = f.readlines()
 118   for line in lines:
 119     items = line.split()
 120     if len(items) > 10:
 121       bench = items[10]
 122       matches = re.search("([+-]?\d*.?\d+)(s|ms|µs|ns)", items[3])
 123       if (not matches or items[9] != args.config):
 124         continue
 125       time_num = matches.group(1)
 126       time_unit = matches.group(2)
 127       add_time(args, name, bench, float(time_num), time_unit)
 128
 129
 130 class ThreadWithException(Thread):
 131   def __init__(self, target):
 132     super(ThreadWithException, self).__init__(target = target)
 133     self.exception = None
 134
 135   def run(self):
 136     try:
 137       self._Thread__target(*self._Thread__args, **self._Thread__kwargs)
 138     except BaseException as e:
 139       self.exception = e
 140
 141   def join(self, timeout=None):
 142     super(ThreadWithException, self).join(timeout)
 143
 144
 145 class ThreadRunner:
 146   """Simplest and stupidiest threaded executer."""
 147   def __init__(self, args):
 148     self.concise = args.concise
 149     self.threads = []
 150
 151   def add(self, args, fn):
 152     if len(self.threads) >= args.threads:
 153       self.wait()
 154     t = ThreadWithException(target = fn)
 155     t.daemon = True
 156     self.threads.append(t)
 157     t.start()
 158
 159   def wait(self):
 160     def spin():
 161       i = 0
 162       spinners = [".  ", ".. ", "..."]
 163       while len(self.threads) > 0:
 164         timesLock.acquire()
 165         sys.stderr.write(
 166             "\r" + spinners[i % len(spinners)] +
 167             " (%d threads running)" % len(self.threads) +
 168             "           \r" # spaces for erasing characters
 169         )
 170         timesLock.release()
 171         time.sleep(0.5)
 172         i += 1
 173
 174     if not self.concise:
 175       ts = Thread(target = spin);
 176       ts.start()
 177
 178     for t in self.threads:
 179       t.join()
 180
 181     exceptions = []
 182     for t in self.threads:
 183       if t.exception:
 184         exceptions.append(t.exception)
 185
 186     self.threads = []
 187
 188     if not self.concise:
 189       ts.join()
 190
 191     if len(exceptions):
 192       for exc in exceptions:
 193         print(exc)
 194       raise exceptions[0]
 195
 196
 197 def split_arg(arg):
 198   raw = shlex.split(arg)
 199   result = []
 200   for r in raw:
 201     if '~' in r:
 202       result.append(os.path.expanduser(r))
 203     else:
 204       result.append(r)
 205   return result
 206
 207
 208 def run(args, threadRunner, name, nano, arg, i):
 209   def task():
 210     file_i = "%s/%s.out%d" % (args.outdir, name, i)
 211
 212     should_run = not args.noinit and not (name == args.b and args.skip_b)
 213     if i <= 0:
 214       should_run = True # always run for suspects
 215
 216     if should_run:
 217       if i > 0:
 218         timesLock.acquire()
 219         print("Init run %d for %s..." % (i, name))
 220         timesLock.release()
 221       subprocess.check_call(["touch", file_i])
 222       with open(file_i, 'w') as f:
 223         subprocess.check_call([nano] + split_arg(arg) +
 224                               ["--config", args.config], stderr=f, stdout=f)
 225
 226     timesLock.acquire()
 227     append_times_from_file(args, name, file_i)
 228     timesLock.release()
 229
 230   threadRunner.add(args, task)
 231
 232
 233 def init_run(args):
 234   threadRunner = ThreadRunner(args)
 235   for i in range(1, max(args.repeat, args.threads / 2) + 1):
 236     run(args, threadRunner, args.a, args.nano_a, args.arg_a, i)
 237     run(args, threadRunner, args.b, args.nano_b, args.arg_b, i)
 238   threadRunner.wait()
 239
 240
 241 def get_lower_upper(values):
 242   i = max(0, (len(values) - 1) / FACTOR)
 243   return values[i], values[-i - 1]
 244
 245
 246 def different_enough(lower1, upper2):
 247   return upper2 < DIFF_T * lower1
 248
 249
 250 # TODO(liyuqian): we used this hacky criteria mainly because that I didn't have
 251 # time to study more rigorous statistical tests. We should adopt a more rigorous
 252 # test in the future.
 253 def get_suspects():
 254   suspects = []
 255   for bench in timesA.keys():
 256     if bench not in timesB:
 257       continue
 258     lowerA, upperA = get_lower_upper(timesA[bench])
 259     lowerB, upperB = get_lower_upper(timesB[bench])
 260     if different_enough(lowerA, upperB) or different_enough(lowerB, upperA):
 261       suspects.append(bench)
 262   return suspects
 263
 264
 265 def process_bench_pattern(s):
 266   if ".skp" in s: # skp bench won't match their exact names...
 267     return "^\"" + s[0:(s.index(".skp") + 3)] + "\""
 268   else:
 269     return "^\"" + s + "\"$"
 270
 271
 272 def suspects_arg(suspects):
 273   patterns = map(process_bench_pattern, suspects)
 274   return " --match " + (" ".join(patterns))
 275
 276
 277 def median(array):
 278   return array[len(array) / 2]
 279
 280
 281 def regression(bench):
 282   a = median(timesA[bench])
 283   b = median(timesB[bench])
 284   if (a == 0): # bad bench, just return no regression
 285     return 1
 286   return b / a
 287
 288
 289 def percentage(x):
 290   return (x - 1) * 100
 291
 292
 293 def format_r(r):
 294   return ('%6.2f' % percentage(r)) + "%"
 295
 296
 297 def normalize_r(r):
 298   if r > 1.0:
 299     return r - 1.0
 300   else:
 301     return 1.0 - 1/r
 302
 303
 304 def test():
 305   args = parse_args()
 306
 307   init_run(args)
 308   last_unchanged_iter = 0
 309   last_suspect_number = -1
 310   tryCnt = 0
 311   it = 0
 312   while tryCnt < MAXTRY:
 313     it += 1
 314     suspects = get_suspects()
 315     if len(suspects) != last_suspect_number:
 316       last_suspect_number = len(suspects)
 317       last_unchanged_iter = it
 318     if (len(suspects) == 0 or it - last_unchanged_iter >= TERM):
 319       break
 320
 321     print("Number of suspects at iteration %d: %d" % (it, len(suspects)))
 322     threadRunner = ThreadRunner(args)
 323     for j in range(1, max(1, args.threads / 2) + 1):
 324       run(args, threadRunner, args.a, args.nano_a,
 325           args.arg_a + suspects_arg(suspects), -j)
 326       run(args, threadRunner, args.b, args.nano_b,
 327           args.arg_b + suspects_arg(suspects), -j)
 328       tryCnt += 1
 329     threadRunner.wait()
 330
 331   suspects = get_suspects()
 332   if len(suspects) == 0:
 333     print(("%s and %s does not seem to have significant " + \
 334            "performance differences.") % (args.a, args.b))
 335   else:
 336     suspects.sort(key = regression)
 337     print("%s (compared to %s) is likely" % (args.a, args.b))
 338     for suspect in suspects:
 339       r = regression(suspect)
 340       if r < 1:
 341         print("\033[31m  %s slower in %s\033[0m" % (format_r(1/r), suspect))
 342       else:
 343         print("\033[32m  %s faster in %s\033[0m" % (format_r(r), suspect))
 344
 345   with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f:
 346     results = {}
 347     for bench in timesA:
 348       r = regression(bench) if bench in suspects else 1.0
 349       results[bench] = {
 350         args.config: {
 351           "signed_regression": normalize_r(r),
 352           "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6,
 353           "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6,
 354           "options": {
 355             # TODO(liyuqian): let ab.py call nanobench with --outResultsFile so
 356             # nanobench could generate the json for us that's exactly the same
 357             # as that being used by perf bots. Currently, we cannot guarantee
 358             # that bench is the name (e.g., bench may have additional resolution
 359             # information appended after name).
 360             "name": bench
 361           }
 362         }
 363       }
 364
 365     output = {"results": results}
 366     if args.githash:
 367       output["gitHash"] = args.githash
 368     if args.keys:
 369       keys = {}
 370       for i in range(len(args.keys) / 2):
 371         keys[args.keys[i * 2]] = args.keys[i * 2 + 1]
 372       output["key"] = keys
 373     f.write(json.dumps(output, indent=4))
 374     print(("\033[36mJSON results available in %s\033[0m" % f.name))
 375
 376   with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out:
 377     out.write(("bench, significant?, raw regresion, " +
 378                    "%(A)s quantile (ns), %(B)s quantile (ns), " +
 379                    "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b})
 380     for bench in suspects + timesA.keys():
 381       if (bench not in timesA or bench not in timesB):
 382         continue
 383       ta = timesA[bench]
 384       tb = timesB[bench]
 385       out.write(
 386           "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) +
 387           ' '.join(map(str, get_lower_upper(ta))) + ", " +
 388           ' '.join(map(str, get_lower_upper(tb))) + ", " +
 389           ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb))))
 390       )
 391     print(("\033[36m" +
 392            "Compared %d benches. " +
 393            "%d of them seem to be significantly differrent." +
 394            "\033[0m") %
 395            (len([x for x in timesA if x in timesB]), len(suspects)))
 396     print("\033[36mPlease see detailed bench results in %s\033[0m" % out.name)
 397
 398
 399 if __name__ == "__main__":
 400   try:
 401     test()
 402   except Exception as e:
 403     print(e)
 404     print(HELP)
 405     traceback.print_exc()
 406     raise e