import bisect_utils
import math_utils
+import source_control
import ttest
-def ConfidenceScore(good_results_lists, bad_results_lists):
- """Calculates a confidence score.
+class BisectResults(object):
+ """Contains results of the completed bisect.
+
+ Properties:
+ error: Error message if the bisect failed.
+
+ If the error is None, the following properties are present:
+ warnings: List of warnings from the bisect run.
+ state: BisectState object from which these results were generated.
+ first_working_revision: First good revision.
+ last_broken_revision: Last bad revision.
+
+  If both of the above revisions are not None, the following properties are
+  also present:
+    culprit_revisions: A list of revisions that contain the bad change
+      introducing the failure.
+    other_regressions: A list of tuples representing other regressions that
+      may have occurred.
+ regression_size: For performance bisects, this is a relative change of
+ the mean metric value. For other bisects this field always contains
+ 'zero-to-nonzero'.
+ regression_std_err: For performance bisects, it is a pooled standard error
+ for groups of good and bad runs. Not used for other bisects.
+ confidence: For performance bisects, it is a confidence that the good and
+ bad runs are distinct groups. Not used for non-performance bisects.
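+
+  Example (illustrative sketch; assumes a completed performance bisect run and
+  real bisect_state, depot_registry and opts objects):
+    results = BisectResults(bisect_state, depot_registry, opts, warnings)
+    if results.error is None and results.first_working_revision is not None:
+      print results.culprit_revisions, results.confidence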
+ """
- This score is a percentage which represents our degree of confidence in the
- proposition that the good results and bad results are distinct groups, and
- their differences aren't due to chance alone.
+ def __init__(self, bisect_state=None, depot_registry=None, opts=None,
+ runtime_warnings=None, error=None, abort_reason=None):
+    """Computes final bisect results after a bisect run is complete.
+
+    This constructor should be called in one of the following ways:
+ BisectResults(state, depot_registry, opts, runtime_warnings)
+ BisectResults(error=error)
- Args:
- good_results_lists: A list of lists of "good" result numbers.
- bad_results_lists: A list of lists of "bad" result numbers.
+
+    The first option creates an object representing successful bisect results,
+    while the second option creates an error result.
- Returns:
- A number in the range [0, 100].
- """
- # If there's only one item in either list, this means only one revision was
- # classified good or bad; this isn't good enough evidence to make a decision.
- # If an empty list was passed, that also implies zero confidence.
- if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
- return 0.0
+ Args:
+ bisect_state: BisectState object representing latest bisect state.
+ depot_registry: DepotDirectoryRegistry object with information on each
+ repository in the bisect_state.
+ opts: Options passed to the bisect run.
+ runtime_warnings: A list of warnings from the bisect run.
+      error: Error message. When error is not None, other arguments are ignored.
+      abort_reason: Reason the bisect was aborted, if any. Like error, a
+          non-None abort_reason causes the remaining arguments to be ignored.
+ """
- # Flatten the lists of results lists.
- sample1 = sum(good_results_lists, [])
- sample2 = sum(bad_results_lists, [])
+ self.error = error
+ self.abort_reason = abort_reason
+ if error is not None or abort_reason is not None:
+ return
- # If there were only empty lists in either of the lists (this is unexpected
- # and normally shouldn't happen), then we also want to return 0.
- if not sample1 or not sample2:
- return 0.0
+ assert (bisect_state is not None and depot_registry is not None and
+ opts is not None and runtime_warnings is not None), (
+ 'Incorrect use of the BisectResults constructor. When error is '
+ 'None, all other arguments are required')
- # The p-value is approximately the probability of obtaining the given set
- # of good and bad values just by chance.
- _, _, p_value = ttest.WelchsTTest(sample1, sample2)
- return 100.0 * (1.0 - p_value)
+ self.state = bisect_state
+ rev_states = bisect_state.GetRevisionStates()
+ first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
+ self.first_working_revision = first_working_rev
+ self.last_broken_revision = last_broken_rev
-class BisectResults(object):
+ self.warnings = runtime_warnings
+
+ if first_working_rev is not None and last_broken_rev is not None:
+ statistics = self._ComputeRegressionStatistics(
+ rev_states, first_working_rev, last_broken_rev)
+
+ self.regression_size = statistics['regression_size']
+ self.regression_std_err = statistics['regression_std_err']
+ self.confidence = statistics['confidence']
+
+ self.culprit_revisions = self._FindCulpritRevisions(
+ rev_states, depot_registry, first_working_rev, last_broken_rev)
- def __init__(self, depot_registry, source_control):
- self._depot_registry = depot_registry
- self.revision_data = {}
- self.error = None
- self._source_control = source_control
+ self.other_regressions = self._FindOtherRegressions(
+ rev_states, statistics['bad_greater_than_good'])
+
+ self.warnings += self._GetResultBasedWarnings(
+ self.culprit_revisions, opts, self.confidence)
+ elif first_working_rev is not None:
+      # Set these attributes so that the bisect printer does not break when the
+      # regression cannot be reproduced (i.e. no broken revision was found).
+ self.regression_size = 0
+ self.regression_std_err = 0
+ self.confidence = 0
+ self.culprit_revisions = []
+ self.other_regressions = []
+
+ @staticmethod
+ def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
+ warnings = []
+ if len(culprit_revisions) > 1:
+ warnings.append('Due to build errors, regression range could '
+ 'not be narrowed down to a single commit.')
+ if opts.repeat_test_count == 1:
+ warnings.append('Tests were only set to run once. This may '
+ 'be insufficient to get meaningful results.')
+ if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
+ warnings.append('Confidence is not high. Try bisecting again '
+ 'with increased repeat_count, larger range, or '
+ 'on another metric.')
+ if not confidence:
+ warnings.append('Confidence score is 0%. Try bisecting again on '
+ 'another platform or another metric.')
+ return warnings
@staticmethod
- def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
+ def ConfidenceScore(sample1, sample2,
+ accept_single_bad_or_good=False):
+ """Calculates a confidence score.
+
+ This score is a percentage which represents our degree of confidence in the
+ proposition that the good results and bad results are distinct groups, and
+ their differences aren't due to chance alone.
+
+ Args:
+ sample1: A flat list of "good" result numbers.
+ sample2: A flat list of "bad" result numbers.
+      accept_single_bad_or_good: If True, computes confidence even if there is
+          just one bad or good revision; otherwise a single good or bad revision
+          always yields 0.0 confidence. This flag will probably go away once we
+          implement expanding the bisect range by one more revision for such
+          cases.
+
+ Returns:
+ A number in the range [0, 100].
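+
+    Example (illustrative numbers only): ConfidenceScore([2.0, 2.1, 2.2],
+    [3.0, 3.1, 3.2]) returns a value close to 100 because the two samples
+    barely overlap, while ConfidenceScore([2.0], [3.0, 3.1, 3.2]) returns 0.0
+    unless accept_single_bad_or_good is True.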
+ """
+ # If there's only one item in either list, this means only one revision was
+ # classified good or bad; this isn't good enough evidence to make a
+ # decision. If an empty list was passed, that also implies zero confidence.
+ if not accept_single_bad_or_good:
+ if len(sample1) <= 1 or len(sample2) <= 1:
+ return 0.0
+
+ # If there were only empty lists in either of the lists (this is unexpected
+ # and normally shouldn't happen), then we also want to return 0.
+ if not sample1 or not sample2:
+ return 0.0
+
+ # The p-value is approximately the probability of obtaining the given set
+ # of good and bad values just by chance.
+ _, _, p_value = ttest.WelchsTTest(sample1, sample2)
+ return 100.0 * (1.0 - p_value)
+
+ @classmethod
+ def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
"""Compiles a list of other possible regressions from the revision data.
Args:
- revision_data_sorted: Sorted list of (revision, revision data) pairs.
+ revision_states: Sorted list of RevisionState objects.
bad_greater_than_good: Whether the result value at the "bad" revision is
numerically greater than the result value at the "good" revision.
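+
+    Returns:
+      A list of [current_revision_state, previous_revision_state, confidence]
+      entries for revision pairs where another regression may have occurred.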
"""
other_regressions = []
previous_values = []
- previous_id = None
- for current_id, current_data in revision_data_sorted:
- current_values = current_data['value']
- if current_values:
- current_values = current_values['values']
+ prev_state = None
+ for revision_state in revision_states:
+ if revision_state.value:
+ current_values = revision_state.value['values']
if previous_values:
- confidence = ConfidenceScore(previous_values, [current_values])
+ confidence_params = (sum(previous_values, []),
+ sum([current_values], []))
+ confidence = cls.ConfidenceScore(*confidence_params,
+ accept_single_bad_or_good=True)
mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
mean_of_current_runs = math_utils.Mean(current_values)
# Check that the potential regression is in the same direction as the overall
# regression. Revision states are iterated from the bad end of the range
# towards the good end, so a local regression is in the same direction when
# the mean of the previous runs is greater than the mean of the current runs.
- prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
- is_same_direction = (prev_less_than_current if
- bad_greater_than_good else not prev_less_than_current)
+ prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
+ is_same_direction = (prev_greater_than_current if
+ bad_greater_than_good else not prev_greater_than_current)
# Only report potential regressions with high confidence.
if is_same_direction and confidence > 50:
- other_regressions.append([current_id, previous_id, confidence])
+ other_regressions.append([revision_state, prev_state, confidence])
previous_values.append(current_values)
- previous_id = current_id
+ prev_state = revision_state
return other_regressions
- def GetResultsDict(self):
- """Prepares and returns information about the final resulsts as a dict.
-
- Returns:
- A dictionary with the following fields
-
- 'first_working_revision': First good revision.
- 'last_broken_revision': Last bad revision.
- 'culprit_revisions': A list of revisions, which contain the bad change
- introducing the failure.
- 'other_regressions': A list of tuples representing other regressions,
- which may have occured.
- 'regression_size': For performance bisects, this is a relative change of
- the mean metric value. For other bisects this field always contains
- 'zero-to-nonzero'.
- 'regression_std_err': For performance bisects, it is a pooled standard
- error for groups of good and bad runs. Not used for other bisects.
- 'confidence': For performance bisects, it is a confidence that the good
- and bad runs are distinct groups. Not used for non-performance
- bisects.
- 'revision_data_sorted': dict mapping revision ids to data about that
- revision. Each piece of revision data consists of a dict with the
- following keys:
-
- 'passed': Represents whether the performance test was successful at
- that revision. Possible values include: 1 (passed), 0 (failed),
- '?' (skipped), 'F' (build failed).
- 'depot': The depot that this revision is from (i.e. WebKit)
- 'external': If the revision is a 'src' revision, 'external' contains
- the revisions of each of the external libraries.
- 'sort': A sort value for sorting the dict in order of commits.
-
- For example:
- {
- 'CL #1':
- {
- 'passed': False,
- 'depot': 'chromium',
- 'external': None,
- 'sort': 0
- }
- }
- """
- revision_data_sorted = sorted(self.revision_data.iteritems(),
- key = lambda x: x[1]['sort'])
-
- # Find range where it possibly broke.
+ @staticmethod
+ def FindBreakingRevRange(revision_states):
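+    """Finds the first passing and the last failing revision states.
+
+    The revision_states list is assumed to be ordered with broken (failing)
+    revisions before working (passing) ones, so that, when both are found, the
+    last broken revision has a smaller index than the first working one; the
+    culprit and statistics computations rely on this ordering.
+    """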
first_working_revision = None
- first_working_revision_index = -1
last_broken_revision = None
- last_broken_revision_index = -1
+
+ for revision_state in revision_states:
+ if revision_state.passed == 1 and not first_working_revision:
+ first_working_revision = revision_state
+
+ if not revision_state.passed:
+ last_broken_revision = revision_state
+
+ return first_working_revision, last_broken_revision
+
+ @staticmethod
+ def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
+ last_broken_rev):
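+    # The culprit range runs from the last broken revision up to, but not
+    # including, the first working one; any revision in this range may contain
+    # the change that introduced the regression.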
+ cwd = os.getcwd()
culprit_revisions = []
- other_regressions = []
- regression_size = 0.0
- regression_std_err = 0.0
- confidence = 0.0
-
- for i in xrange(len(revision_data_sorted)):
- k, v = revision_data_sorted[i]
- if v['passed'] == 1:
- if not first_working_revision:
- first_working_revision = k
- first_working_revision_index = i
-
- if not v['passed']:
- last_broken_revision = k
- last_broken_revision_index = i
-
- if last_broken_revision != None and first_working_revision != None:
- broken_means = []
- for i in xrange(0, last_broken_revision_index + 1):
- if revision_data_sorted[i][1]['value']:
- broken_means.append(revision_data_sorted[i][1]['value']['values'])
-
- working_means = []
- for i in xrange(first_working_revision_index, len(revision_data_sorted)):
- if revision_data_sorted[i][1]['value']:
- working_means.append(revision_data_sorted[i][1]['value']['values'])
-
- # Flatten the lists to calculate mean of all values.
- working_mean = sum(working_means, [])
- broken_mean = sum(broken_means, [])
-
- # Calculate the approximate size of the regression
- mean_of_bad_runs = math_utils.Mean(broken_mean)
- mean_of_good_runs = math_utils.Mean(working_mean)
-
- regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
+ for i in xrange(last_broken_rev.index, first_working_rev.index):
+ depot_registry.ChangeToDepotDir(revision_states[i].depot)
+ info = source_control.QueryRevisionInfo(revision_states[i].revision)
+ culprit_revisions.append((revision_states[i].revision, info,
+ revision_states[i].depot))
+
+ os.chdir(cwd)
+ return culprit_revisions
+
+ @classmethod
+ def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
+ last_broken_rev):
+ # TODO(sergiyb): We assume that value has "values" key, which may not be
+ # the case for failure-bisects, where there is a single value only.
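+    # For performance bisects each state.value is expected to be a dict with
+    # the repeated measurements under the 'values' key, e.g.
+    # {'values': [10.1, 9.8, 10.3]} (illustrative numbers).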
+ broken_means = [state.value['values']
+ for state in rev_states[:last_broken_rev.index+1]
+ if state.value]
+
+ working_means = [state.value['values']
+ for state in rev_states[first_working_rev.index:]
+ if state.value]
+
+ # Flatten the lists to calculate mean of all values.
+ working_mean = sum(working_means, [])
+ broken_mean = sum(broken_means, [])
+
+ # Calculate the approximate size of the regression
+ mean_of_bad_runs = math_utils.Mean(broken_mean)
+ mean_of_good_runs = math_utils.Mean(working_mean)
+
+ regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
mean_of_bad_runs)
- if math.isnan(regression_size):
- regression_size = 'zero-to-nonzero'
-
- regression_std_err = math.fabs(math_utils.PooledStandardError(
- [working_mean, broken_mean]) /
- max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
-
- # Give a "confidence" in the bisect. At the moment we use how distinct the
- # values are before and after the last broken revision, and how noisy the
- # overall graph is.
- confidence = ConfidenceScore(working_means, broken_means)
-
- culprit_revisions = []
-
- cwd = os.getcwd()
- self._depot_registry.ChangeToDepotDir(
- self.revision_data[last_broken_revision]['depot'])
-
- if self.revision_data[last_broken_revision]['depot'] == 'cros':
- # Want to get a list of all the commits and what depots they belong
- # to so that we can grab info about each.
- cmd = ['repo', 'forall', '-c',
- 'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
- last_broken_revision, first_working_revision + 1)]
- output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
-
- changes = []
- assert not return_code, ('An error occurred while running '
- '"%s"' % ' '.join(cmd))
- last_depot = None
- cwd = os.getcwd()
- for l in output.split('\n'):
- if l:
- # Output will be in form:
- # /path_to_depot
- # /path_to_other_depot
- # <SHA1>
- # /path_again
- # <SHA1>
- # etc.
- if l[0] == '/':
- last_depot = l
- else:
- contents = l.split(' ')
- if len(contents) > 1:
- changes.append([last_depot, contents[0]])
- for c in changes:
- os.chdir(c[0])
- info = self._source_control.QueryRevisionInfo(c[1])
- culprit_revisions.append((c[1], info, None))
- else:
- for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
- k, v = revision_data_sorted[i]
- if k == first_working_revision:
- break
- self._depot_registry.ChangeToDepotDir(v['depot'])
- info = self._source_control.QueryRevisionInfo(k)
- culprit_revisions.append((k, info, v['depot']))
- os.chdir(cwd)
-
- # Check for any other possible regression ranges.
- other_regressions = self._FindOtherRegressions(
- revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
-
- return {
- 'first_working_revision': first_working_revision,
- 'last_broken_revision': last_broken_revision,
- 'culprit_revisions': culprit_revisions,
- 'other_regressions': other_regressions,
- 'regression_size': regression_size,
- 'regression_std_err': regression_std_err,
- 'confidence': confidence,
- 'revision_data_sorted': revision_data_sorted
- }
+ if math.isnan(regression_size):
+ regression_size = 'zero-to-nonzero'
+
+ regression_std_err = math.fabs(math_utils.PooledStandardError(
+ [working_mean, broken_mean]) /
+ max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
+
+ # Give a "confidence" in the bisect. At the moment we use how distinct the
+ # values are before and after the last broken revision, and how noisy the
+ # overall graph is.
+ confidence_params = (sum(working_means, []), sum(broken_means, []))
+ confidence = cls.ConfidenceScore(*confidence_params)
+
+ bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
+
+ return {'regression_size': regression_size,
+ 'regression_std_err': regression_std_err,
+ 'confidence': confidence,
+ 'bad_greater_than_good': bad_greater_than_good}