# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
def ConfidenceScore(good_results_lists, bad_results_lists):
  """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.

  Returns:
    A number in the range [0, 100].
  """
  # If there's only one item in either list, this means only one revision was
  # classified good or bad; this isn't good enough evidence to make a decision.
  # If an empty list was passed, that also implies zero confidence.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0

  # Flatten the lists of results lists.
  sample1 = sum(good_results_lists, [])
  sample2 = sum(bad_results_lists, [])

  # If there were only empty lists in either of the lists (this is unexpected
  # and normally shouldn't happen), then we also want to return 0.
  if not sample1 or not sample2:
    return 0.0

  # The p-value is approximately the probability of obtaining the given set
  # of good and bad values just by chance.
  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
  return 100.0 * (1.0 - p_value)
49 class BisectResults(object):
51 def __init__(self, depot_registry, source_control):
52 self._depot_registry = depot_registry
53 self.revision_data = {}
55 self._source_control = source_control
58 def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
59 """Compiles a list of other possible regressions from the revision data.
62 revision_data_sorted: Sorted list of (revision, revision data) pairs.
63 bad_greater_than_good: Whether the result value at the "bad" revision is
64 numerically greater than the result value at the "good" revision.
67 A list of [current_rev, previous_rev, confidence] for other places where
68 there may have been a regression.
70 other_regressions = []
73 for current_id, current_data in revision_data_sorted:
74 current_values = current_data['value']
76 current_values = current_values['values']
78 confidence = ConfidenceScore(previous_values, [current_values])
79 mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
80 mean_of_current_runs = math_utils.Mean(current_values)
82 # Check that the potential regression is in the same direction as
83 # the overall regression. If the mean of the previous runs < the
84 # mean of the current runs, this local regression is in same
86 prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
87 is_same_direction = (prev_less_than_current if
88 bad_greater_than_good else not prev_less_than_current)
90 # Only report potential regressions with high confidence.
91 if is_same_direction and confidence > 50:
92 other_regressions.append([current_id, previous_id, confidence])
93 previous_values.append(current_values)
94 previous_id = current_id
95 return other_regressions
97 def GetResultsDict(self):
98 """Prepares and returns information about the final resulsts as a dict.
101 A dictionary with the following fields
103 'first_working_revision': First good revision.
104 'last_broken_revision': Last bad revision.
105 'culprit_revisions': A list of revisions, which contain the bad change
106 introducing the failure.
107 'other_regressions': A list of tuples representing other regressions,
108 which may have occured.
109 'regression_size': For performance bisects, this is a relative change of
110 the mean metric value. For other bisects this field always contains
112 'regression_std_err': For performance bisects, it is a pooled standard
113 error for groups of good and bad runs. Not used for other bisects.
114 'confidence': For performance bisects, it is a confidence that the good
115 and bad runs are distinct groups. Not used for non-performance
117 'revision_data_sorted': dict mapping revision ids to data about that
118 revision. Each piece of revision data consists of a dict with the
121 'passed': Represents whether the performance test was successful at
122 that revision. Possible values include: 1 (passed), 0 (failed),
123 '?' (skipped), 'F' (build failed).
124 'depot': The depot that this revision is from (i.e. WebKit)
125 'external': If the revision is a 'src' revision, 'external' contains
126 the revisions of each of the external libraries.
127 'sort': A sort value for sorting the dict in order of commits.
140 revision_data_sorted = sorted(self.revision_data.iteritems(),
141 key = lambda x: x[1]['sort'])
143 # Find range where it possibly broke.
144 first_working_revision = None
145 first_working_revision_index = -1
146 last_broken_revision = None
147 last_broken_revision_index = -1
149 culprit_revisions = []
150 other_regressions = []
151 regression_size = 0.0
152 regression_std_err = 0.0
155 for i in xrange(len(revision_data_sorted)):
156 k, v = revision_data_sorted[i]
158 if not first_working_revision:
159 first_working_revision = k
160 first_working_revision_index = i
163 last_broken_revision = k
164 last_broken_revision_index = i
166 if last_broken_revision != None and first_working_revision != None:
168 for i in xrange(0, last_broken_revision_index + 1):
169 if revision_data_sorted[i][1]['value']:
170 broken_means.append(revision_data_sorted[i][1]['value']['values'])
173 for i in xrange(first_working_revision_index, len(revision_data_sorted)):
174 if revision_data_sorted[i][1]['value']:
175 working_means.append(revision_data_sorted[i][1]['value']['values'])
177 # Flatten the lists to calculate mean of all values.
178 working_mean = sum(working_means, [])
179 broken_mean = sum(broken_means, [])
181 # Calculate the approximate size of the regression
182 mean_of_bad_runs = math_utils.Mean(broken_mean)
183 mean_of_good_runs = math_utils.Mean(working_mean)
185 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
187 if math.isnan(regression_size):
188 regression_size = 'zero-to-nonzero'
190 regression_std_err = math.fabs(math_utils.PooledStandardError(
191 [working_mean, broken_mean]) /
192 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
194 # Give a "confidence" in the bisect. At the moment we use how distinct the
195 # values are before and after the last broken revision, and how noisy the
197 confidence = ConfidenceScore(working_means, broken_means)
199 culprit_revisions = []
202 self._depot_registry.ChangeToDepotDir(
203 self.revision_data[last_broken_revision]['depot'])
205 if self.revision_data[last_broken_revision]['depot'] == 'cros':
206 # Want to get a list of all the commits and what depots they belong
207 # to so that we can grab info about each.
208 cmd = ['repo', 'forall', '-c',
209 'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
210 last_broken_revision, first_working_revision + 1)]
211 output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
214 assert not return_code, ('An error occurred while running '
215 '"%s"' % ' '.join(cmd))
218 for l in output.split('\n'):
220 # Output will be in form:
222 # /path_to_other_depot
230 contents = l.split(' ')
231 if len(contents) > 1:
232 changes.append([last_depot, contents[0]])
235 info = self._source_control.QueryRevisionInfo(c[1])
236 culprit_revisions.append((c[1], info, None))
238 for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
239 k, v = revision_data_sorted[i]
240 if k == first_working_revision:
242 self._depot_registry.ChangeToDepotDir(v['depot'])
243 info = self._source_control.QueryRevisionInfo(k)
244 culprit_revisions.append((k, info, v['depot']))
247 # Check for any other possible regression ranges.
248 other_regressions = self._FindOtherRegressions(
249 revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
252 'first_working_revision': first_working_revision,
253 'last_broken_revision': last_broken_revision,
254 'culprit_revisions': culprit_revisions,
255 'other_regressions': other_regressions,
256 'regression_size': regression_size,
257 'regression_std_err': regression_std_err,
258 'confidence': confidence,
259 'revision_data_sorted': revision_data_sorted