4 # Copyright 2017 Google Inc.
6 # Use of this source code is governed by a BSD-style license that can be found
9 # This is an A/B test utility script used by calmbench.py
11 # For each bench, we get a distribution of min_ms measurements from nanobench.
12 # From that, we try to recover the 1/3 and 2/3 quantiles of the distribution.
13 # If range (1/3 quantile, 2/3 quantile) is completely disjoint between A and B,
14 # we report that as a regression.
16 # The more measurements we have for a bench, the more accurate our quantiles
17 # are. However, taking more measurements is time consuming. Hence we'll prune
18 # out benches and only take more measurements for benches whose current quantile
19 # ranges are disjoint.
# P.S. This script was translated by brute force from a Ruby script, so it
25 from __future__ import print_function
import json
import multiprocessing
import os
import re
import shlex
import subprocess
import traceback

from argparse import ArgumentParser
from multiprocessing import Process
from pdb import set_trace
from threading import Lock
from threading import Thread
43 \033[31mPlease call calmbench.py to drive this script if you're not doing so.
44 This script is not supposed to be used by itself. (At least, it's not easy to
45 use by itself. The calmbench bots may use this script directly.)
FACTOR = 3 # lower/upper quantile factor
DIFF_T = 0.99 # different enough threshold
TERM = 10 # terminate after this no. of iterations without suspect changes
MAXTRY = 30 # max number of nanobench tries to narrow down suspects

# Units emitted by nanobench; the position in this list is the power of 1000
# used to normalize a measurement down to nanoseconds (see add_time()).
UNITS = "ns µs ms s".split()
# Positional arguments mirror the order calmbench.py passes them in.
parser = ArgumentParser(description=HELP)

parser.add_argument('outdir', type=str, help="output directory")
parser.add_argument('a', type=str, help="name of A")
parser.add_argument('b', type=str, help="name of B")
parser.add_argument('nano_a', type=str, help="path to A's nanobench binary")
parser.add_argument('nano_b', type=str, help="path to B's nanobench binary")
parser.add_argument('arg_a', type=str, help="args for A's nanobench run")
parser.add_argument('arg_b', type=str, help="args for B's nanobench run")
parser.add_argument('repeat', type=int, help="number of initial runs")
parser.add_argument('skip_b', type=str, help=("whether to skip running B"
                                              " ('true' or 'false')"))
# Fixed typo: "nanobenh" -> "nanobench".
parser.add_argument('config', type=str, help="nanobench config")
parser.add_argument('threads', type=int, help="number of threads to run")
# Fixed copy-paste help text: this flag skips the *initial* runs, not B
# (see the `should_run = not args.noinit and ...` logic below).
parser.add_argument('noinit', type=str, help=("whether to skip the initial"
                                              " runs ('true' or 'false')"))

parser.add_argument('--concise', dest='concise', action="store_true",
    help="If set, no verbose thread info will be printed.")
parser.set_defaults(concise=False)
# Additional args for bots
BHELP = "bot specific options"
parser.add_argument('--githash', type=str, default="", help=BHELP)
parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP)

args = parser.parse_args()
# Positional flags arrive as the strings 'true'/'false'; convert to bool.
args.skip_b = args.skip_b == "true"
args.noinit = args.noinit == "true"

if args.threads == -1:
  # Default to a single thread; without this fallback a GPU config would
  # leave threads at -1 and disable the thread pool entirely.
  args.threads = 1
  if args.config in ["8888", "565"]: # multi-thread for CPU only
    args.threads = max(1, multiprocessing.cpu_count() // 2)
def append_dict_sorted_array(dict_array, key, value):
  """Append value to the list stored at dict_array[key], keeping the list
  sorted; creates the list on first use of a key."""
  if key not in dict_array:
    dict_array[key] = []  # first measurement for this key
  dict_array[key].append(value)
  # Keep sorted so quantile lookups (get_lower_upper) can index directly.
  dict_array[key].sort()
def add_time(args, name, bench, t, unit):
  """Record one measurement for bench into timesA or timesB (module-level
  dicts initialized elsewhere in the file), normalized to nanoseconds.

  UNITS is ordered ns, µs, ms, s, so its index is the power of 1000 needed
  to scale t to ns.
  """
  normalized_t = t * 1000 ** UNITS.index(unit)
  if name.startswith(args.a):
    append_dict_sorted_array(timesA, bench, normalized_t)
  else:
    # The `else:` was elided in this chunk; without it, every A measurement
    # would also be (incorrectly) recorded under B.
    append_dict_sorted_array(timesB, bench, normalized_t)
def append_times_from_file(args, name, filename):
  """Parse a nanobench output file and record each bench's measurement via
  add_time().

  NOTE(review): the loop header and the items/bench extraction were elided
  from this chunk and are reconstructed conservatively — confirm the column
  indices (3 = time, 9 = config, 10 = bench name) against nanobench output.
  """
  with open(filename) as f:
    lines = f.readlines()
  for line in lines:
    items = line.split()
    if len(items) > 10:
      bench = items[10]
      # Capture the number and its unit, e.g. "1.23ms".
      matches = re.search("([+-]?\d*.?\d+)(s|ms|µs|ns)", items[3])
      if (not matches or items[9] != args.config):
        continue  # unparsable line or a different config's row
      time_num = matches.group(1)
      time_unit = matches.group(2)
      add_time(args, name, bench, float(time_num), time_unit)
class ThreadWithException(Thread):
  """A Thread that records any exception raised by its target in
  self.exception (instead of letting it die silently on the worker thread),
  so the parent can re-raise it after join().

  NOTE(review): this reaches Thread's private attributes via Python 2 name
  mangling (_Thread__target etc.); it will not work unchanged on Python 3.
  """

  def __init__(self, target):
    super(ThreadWithException, self).__init__(target = target)
    self.exception = None  # set by run() if the target raises

  def run(self):
    # The run()/try: lines were elided from this chunk; reconstructed so the
    # except handler below has its matching try.
    try:
      self._Thread__target(*self._Thread__args, **self._Thread__kwargs)
    except BaseException as e:
      self.exception = e

  def join(self, timeout=None):
    super(ThreadWithException, self).join(timeout)
  """Simplest and stupidiest threaded executer."""
  # NOTE(review): the enclosing class header and part of __init__ (e.g. the
  # self.threads list initialization) are elided from this chunk.
  def __init__(self, args):
    # Cache the --concise flag; per its help text it suppresses verbose
    # thread info output (presumably the spinner in wait() — confirm).
    self.concise = args.concise
  def add(self, args, fn):
    # Queue fn as a worker thread; concurrency is capped at args.threads.
    if len(self.threads) >= args.threads:
      # NOTE(review): the statement(s) forming this `if` body are elided in
      # this chunk — presumably the pool is drained (self.wait()) before
      # more work is added; confirm against the full file.
    t = ThreadWithException(target = fn)
    # NOTE(review): a t.start() call appears to be elided here.
    self.threads.append(t)
  # NOTE(review): these lines are the interior of ThreadRunner.wait(); the
  # method header, the spin() helper's def, the thread start/join calls,
  # and the exceptions-list setup are elided from this chunk. Surviving
  # fragments are kept verbatim below.
  spinners = [". ", ".. ", "..."]
  while len(self.threads) > 0:
    # Spinner frame: carriage returns keep output on one line; the trailing
    # spaces erase leftovers from the previous, longer frame.
    "\r" + spinners[i % len(spinners)] +
    " (%d threads running)" % len(self.threads) +
    " \r" # spaces for erasing characters
  ts = Thread(target = spin);
  for t in self.threads:
  for t in self.threads:
    # Collect exceptions recorded by ThreadWithException workers ...
    exceptions.append(t.exception)
  # ... and re-raise them on the main thread after all joins complete.
  for exc in exceptions:
def split_arg(arg):
  """Split a nanobench argument string shell-style, expanding '~' in any
  path-like token.

  NOTE(review): the def header and the loop around these lines were elided
  from this chunk; reconstructed conservatively.
  """
  raw = shlex.split(arg)
  result = []
  for r in raw:
    if '~' in r:
      result.append(os.path.expanduser(r))
    else:
      result.append(r)
  return result
def run(args, threadRunner, name, nano, arg, i):
  """Schedule one nanobench invocation (run i for `name`) on threadRunner
  and parse its timings when done.

  i >= 1 are the initial runs; i <= 0 mark suspect-narrowing reruns.
  NOTE(review): the inner task() wrapper and the should_run/i guards were
  elided from this chunk and are reconstructed conservatively.
  """
  def task():
    file_i = "%s/%s.out%d" % (args.outdir, name, i)

    # Skip B's init runs when requested (skip_b) or when all init runs are
    # disabled (noinit) ...
    should_run = not args.noinit and not (name == args.b and args.skip_b)
    if i <= 0:
      should_run = True # always run for suspects

    if should_run:
      print("Init run %d for %s..." % (i, name))
      # touch first so the file exists even if nanobench dies early.
      subprocess.check_call(["touch", file_i])
      with open(file_i, 'w') as f:
        subprocess.check_call([nano] + split_arg(arg) +
                              ["--config", args.config], stderr=f, stdout=f)

    # Parse measurements out of the (possibly pre-existing) output file.
    append_times_from_file(args, name, file_i)

  threadRunner.add(args, task)
# NOTE(review): the enclosing `def` header for these statements is elided
# from this chunk; they schedule the initial nanobench runs for A and B.
threadRunner = ThreadRunner(args)
# At least threads/2 iterations keep the pool busy even for small `repeat`
# values; run indices start at 1 (positive index == init run, see run()).
for i in range(1, max(args.repeat, args.threads / 2) + 1):
  run(args, threadRunner, args.a, args.nano_a, args.arg_a, i)
  run(args, threadRunner, args.b, args.nano_b, args.arg_b, i)
def get_lower_upper(values):
  """Return the (lower, upper) quantile pair of a sorted list, where the
  quantile positions are 1/FACTOR from each end (FACTOR = 3 -> 1/3, 2/3)."""
  # `//` keeps the index integral: identical to `/` under Python 2 int
  # division, and also correct under Python 3.
  i = max(0, (len(values) - 1) // FACTOR)
  return values[i], values[-i - 1]
def different_enough(lower1, upper2):
  """Report whether upper2 falls clearly below lower1, i.e. the two quantile
  ranges are disjoint with a DIFF_T margin."""
  threshold = DIFF_T * lower1
  return upper2 < threshold
# TODO(liyuqian): we used this hacky criteria mainly because that I didn't have
# time to study more rigorous statistical tests. We should adopt a more rigorous
# test in the future.
def get_suspects():
  """Return the benches whose (lower, upper) quantile ranges are disjoint
  between A and B — the candidates for a real performance difference.

  NOTE(review): the def header, `suspects = []`, `continue`, and `return`
  lines were elided from this chunk; reconstructed conservatively.
  """
  suspects = []
  for bench in timesA.keys():
    if bench not in timesB:
      continue  # cannot compare a bench only one side measured
    lowerA, upperA = get_lower_upper(timesA[bench])
    lowerB, upperB = get_lower_upper(timesB[bench])
    # Disjoint in either direction counts (A faster or B faster).
    if different_enough(lowerA, upperB) or different_enough(lowerB, upperA):
      suspects.append(bench)
  return suspects
def process_bench_pattern(s):
  """Turn a bench name into a quoted regex for nanobench's --match flag."""
  if ".skp" not in s:
    # Exact, fully-anchored match for ordinary bench names.
    return "^\"" + s + "\"$"
  # skp bench won't match their exact names...
  truncated = s[0:(s.index(".skp") + 3)]
  return "^\"" + truncated + "\""
def suspects_arg(suspects):
  """Build the extra nanobench arguments restricting a rerun to the given
  suspect benches."""
  patterns = [process_bench_pattern(name) for name in suspects]
  joined = " ".join(patterns)
  return " --match " + joined
278 return array[len(array) / 2]
def regression(bench):
  """Return the ratio median(B) / median(A) for bench; 1 means no change.

  NOTE(review): the two return statements and the percentage/format_r
  headers below were elided from this chunk; reconstructed conservatively.
  """
  a = median(timesA[bench])
  b = median(timesB[bench])
  if (a == 0): # bad bench, just return no regression
    return 1
  return b / a


def percentage(r):
  # Convert a ratio into a percent delta, e.g. 1.25 -> 25.0.
  return (r - 1) * 100


def format_r(r):
  # Fixed-width percent string, e.g. " 25.00%".
  return ('%6.2f' % percentage(r)) + "%"
# ---- Suspect-narrowing loop -------------------------------------------------
# NOTE(review): the enclosing context and several statements (the tryCnt/it
# initializations and increments, the loop's break, the threadRunner.wait()
# call) are elided from this chunk; surviving lines are kept verbatim.
last_unchanged_iter = 0
last_suspect_number = -1
while tryCnt < MAXTRY:
  suspects = get_suspects()
  # Track how long the suspect set has been stable; TERM stable iterations
  # (or an empty set) ends the narrowing (see the termination test below).
  if len(suspects) != last_suspect_number:
    last_suspect_number = len(suspects)
    last_unchanged_iter = it
  if (len(suspects) == 0 or it - last_unchanged_iter >= TERM):
  print("Number of suspects at iteration %d: %d" % (it, len(suspects)))
  # Rerun nanobench for the suspects only; negative run indices (-j) mark
  # these as suspect runs in run().
  threadRunner = ThreadRunner(args)
  for j in range(1, max(1, args.threads / 2) + 1):
    run(args, threadRunner, args.a, args.nano_a,
        args.arg_a + suspects_arg(suspects), -j)
    run(args, threadRunner, args.b, args.nano_b,
        args.arg_b + suspects_arg(suspects), -j)

# ---- Final report -----------------------------------------------------------
suspects = get_suspects()
if len(suspects) == 0:
  print(("%s and %s does not seem to have significant " + \
      "performance differences.") % (args.a, args.b))
suspects.sort(key = regression)
print("%s (compared to %s) is likely" % (args.a, args.b))
for suspect in suspects:
  r = regression(suspect)
  # NOTE(review): the branch choosing between the "slower"/"faster" lines
  # (presumably on r < 1) is elided; both prints are kept verbatim.
  print("\033[31m %s slower in %s\033[0m" % (format_r(1/r), suspect))
  print("\033[32m %s faster in %s\033[0m" % (format_r(r), suspect))
# ---- JSON output ------------------------------------------------------------
# NOTE(review): the loop over benches building `results` and the surrounding
# dict literal are partly elided from this chunk; lines kept verbatim.
with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f:
    # Non-suspects get the neutral ratio 1.0.
    r = regression(bench) if bench in suspects else 1.0
      "signed_regression": normalize_r(r),
      # Times are stored internally in ns; * 1e-6 converts to ms.
      "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6,
      "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6,

  # TODO(liyuqian): let ab.py call nanobench with --outResultsFile so
  # nanobench could generate the json for us that's exactly the same
  # as that being used by perf bots. Currently, we cannot guarantee
  # that bench is the name (e.g., bench may have additional resolution
  # information appended after name).

  output = {"results": results}
  output["gitHash"] = args.githash
  # --keys arrive as flat key/value pairs; fold them into a dict.
  for i in range(len(args.keys) / 2):
    keys[args.keys[i * 2]] = args.keys[i * 2 + 1]
  f.write(json.dumps(output, indent=4))
  print(("\033[36mJSON results available in %s\033[0m" % f.name))
# ---- CSV output (human-readable details) ------------------------------------
with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out:
  # Header row. NOTE(review): "regresion" is a typo in the emitted file
  # format; left untouched since it is runtime output consumers may parse.
  out.write(("bench, significant?, raw regresion, " +
             "%(A)s quantile (ns), %(B)s quantile (ns), " +
             "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b})
  for bench in suspects + timesA.keys():
    if (bench not in timesA or bench not in timesB):
    # NOTE(review): the `continue`, the ta/tb assignments, and the
    # out.write(...) wrapper around these pieces are elided in this chunk.
      "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) +
      ' '.join(map(str, get_lower_upper(ta))) + ", " +
      ' '.join(map(str, get_lower_upper(tb))) + ", " +
      ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb))))

# Summary line. NOTE(review): the print(...) wrapper and the % operator
# joining these pieces are elided; strings kept verbatim (including the
# user-visible "differrent" typo, which is runtime output).
  "Compared %d benches. " +
  "%d of them seem to be significantly differrent." +
  (len([x for x in timesA if x in timesB]), len(suspects)))
print("\033[36mPlease see detailed bench results in %s\033[0m" % out.name)
if __name__ == "__main__":
  # NOTE(review): the `try:` and the guarded top-level driver call are
  # elided from this chunk; only the exception handler survives.
  except Exception as e:
    # Dump the full traceback so calmbench.py users can see what failed;
    # the script still exits normally afterwards.
    traceback.print_exc()