# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
def ConfidenceScore(good_results_lists, bad_results_lists):
  """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
  # If there's only one item in either list, this means only one revision was
  # classified good or bad; this isn't good enough evidence to make a decision.
  # If an empty list was passed, that also implies zero confidence.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0

  # Flatten the lists of results lists.
  sample1 = sum(good_results_lists, [])
  sample2 = sum(bad_results_lists, [])

  # If there were only empty lists in either of the lists (this is unexpected
  # and normally shouldn't happen), then we also want to return 0.
  if not sample1 or not sample2:
    return 0.0

  # The p-value is approximately the probability of obtaining the given set
  # of good and bad values just by chance.
  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
  return 100.0 * (1.0 - p_value)
49 class BisectResults(object):
51 def __init__(self, depot_registry, source_control):
52 self._depot_registry = depot_registry
53 self.revision_data = {}
55 self._source_control = source_control
58 def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
59 """Compiles a list of other possible regressions from the revision data.
62 revision_data_sorted: Sorted list of (revision, revision data) pairs.
63 bad_greater_than_good: Whether the result value at the "bad" revision is
64 numerically greater than the result value at the "good" revision.
67 A list of [current_rev, previous_rev, confidence] for other places where
68 there may have been a regression.
70 other_regressions = []
73 for current_id, current_data in revision_data_sorted:
74 current_values = current_data['value']
76 current_values = current_values['values']
78 confidence = ConfidenceScore(previous_values, [current_values])
79 mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
80 mean_of_current_runs = math_utils.Mean(current_values)
82 # Check that the potential regression is in the same direction as
83 # the overall regression. If the mean of the previous runs < the
84 # mean of the current runs, this local regression is in same
86 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
87 is_same_direction = (prev_less_than_current if
88 bad_greater_than_good else not prev_less_than_current)
90 # Only report potential regressions with high confidence.
91 if is_same_direction and confidence > 50:
92 other_regressions.append([current_id, previous_id, confidence])
93 previous_values.append(current_values)
94 previous_id = current_id
95 return other_regressions
97 def GetResultsDict(self):
98 """Prepares and returns information about the final resulsts as a dict.
101 A dictionary with the following fields
103 'first_working_revision': First good revision.
104 'last_broken_revision': Last bad revision.
105 'culprit_revisions': A list of revisions, which contain the bad change
106 introducing the failure.
107 'other_regressions': A list of tuples representing other regressions,
108 which may have occured.
109 'regression_size': For performance bisects, this is a relative change of
110 the mean metric value. For other bisects this field always contains
112 'regression_std_err': For performance bisects, it is a pooled standard
113 error for groups of good and bad runs. Not used for other bisects.
114 'confidence': For performance bisects, it is a confidence that the good
115 and bad runs are distinct groups. Not used for non-performance
117 'revision_data_sorted': dict mapping revision ids to data about that
118 revision. Each piece of revision data consists of a dict with the
121 'passed': Represents whether the performance test was successful at
122 that revision. Possible values include: 1 (passed), 0 (failed),
123 '?' (skipped), 'F' (build failed).
124 'depot': The depot that this revision is from (i.e. WebKit)
125 'external': If the revision is a 'src' revision, 'external' contains
126 the revisions of each of the external libraries.
127 'sort': A sort value for sorting the dict in order of commits.
140 revision_data_sorted = sorted(self.revision_data.iteritems(),
141 key = lambda x: x[1]['sort'])
143 # Find range where it possibly broke.
144 first_working_revision = None
145 first_working_revision_index = -1
146 last_broken_revision = None
147 last_broken_revision_index = -1
149 culprit_revisions = []
150 other_regressions = []
151 regression_size = 0.0
152 regression_std_err = 0.0
155 for i in xrange(len(revision_data_sorted)):
156 k, v = revision_data_sorted[i]
158 if not first_working_revision:
159 first_working_revision = k
160 first_working_revision_index = i
163 last_broken_revision = k
164 last_broken_revision_index = i
166 if last_broken_revision != None and first_working_revision != None:
168 for i in xrange(0, last_broken_revision_index + 1):
169 if revision_data_sorted[i][1]['value']:
170 broken_means.append(revision_data_sorted[i][1]['value']['values'])
173 for i in xrange(first_working_revision_index, len(revision_data_sorted)):
174 if revision_data_sorted[i][1]['value']:
175 working_means.append(revision_data_sorted[i][1]['value']['values'])
177 # Flatten the lists to calculate mean of all values.
178 working_mean = sum(working_means, [])
179 broken_mean = sum(broken_means, [])
181 # Calculate the approximate size of the regression
182 mean_of_bad_runs = math_utils.Mean(broken_mean)
183 mean_of_good_runs = math_utils.Mean(working_mean)
185 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
187 if math.isnan(regression_size):
188 regression_size = 'zero-to-nonzero'
190 regression_std_err = math.fabs(math_utils.PooledStandardError(
191 [working_mean, broken_mean]) /
192 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
194 # Give a "confidence" in the bisect. At the moment we use how distinct the
195 # values are before and after the last broken revision, and how noisy the
197 confidence = ConfidenceScore(working_means, broken_means)
199 culprit_revisions = []
202 self._depot_registry.ChangeToDepotDir(
203 self.revision_data[last_broken_revision]['depot'])
205 if self.revision_data[last_broken_revision]['depot'] == 'cros':
206 # Want to get a list of all the commits and what depots they belong
207 # to so that we can grab info about each.
208 cmd = ['repo', 'forall', '-c',
209 'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
210 last_broken_revision, first_working_revision + 1)]
211 output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
214 assert not return_code, ('An error occurred while running '
215 '"%s"' % ' '.join(cmd))
218 for l in output.split('\n'):
220 # Output will be in form:
222 # /path_to_other_depot
230 contents = l.split(' ')
231 if len(contents) > 1:
232 changes.append([last_depot, contents[0]])
235 info = self._source_control.QueryRevisionInfo(c[1])
236 culprit_revisions.append((c[1], info, None))
238 for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
239 k, v = revision_data_sorted[i]
240 if k == first_working_revision:
242 self._depot_registry.ChangeToDepotDir(v['depot'])
243 info = self._source_control.QueryRevisionInfo(k)
244 culprit_revisions.append((k, info, v['depot']))
247 # Check for any other possible regression ranges.
248 other_regressions = self._FindOtherRegressions(
249 revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
252 'first_working_revision': first_working_revision,
253 'last_broken_revision': last_broken_revision,
254 'culprit_revisions': culprit_revisions,
255 'other_regressions': other_regressions,
256 'regression_size': regression_size,
257 'regression_std_err': regression_std_err,
258 'confidence': confidence,
259 'revision_data_sorted': revision_data_sorted