diff --git a/src/tools/auto_bisect/bisect_results.py b/src/tools/auto_bisect/bisect_results.py
index 144962f..df93de6 100644
--- a/src/tools/auto_bisect/bisect_results.py
+++ b/src/tools/auto_bisect/bisect_results.py
@@ -7,59 +7,163 @@ import os
 import bisect_utils
 import math_utils
+import source_control
 import ttest
 
-def ConfidenceScore(good_results_lists, bad_results_lists):
-  """Calculates a confidence score.
+class BisectResults(object):
+  """Contains results of the completed bisect.
+
+  Properties:
+    error: Error message if the bisect failed.
+
+    If the error is None, the following properties are present:
+    warnings: List of warnings from the bisect run.
+    state: BisectState object from which these results were generated.
+    first_working_revision: First good revision.
+    last_broken_revision: Last bad revision.
+
+    If both of the above revisions are not None, these properties are present:
+    culprit_revisions: A list of revisions containing the bad change that
+        introduced the failure.
+    other_regressions: A list of tuples representing other regressions, which
+        may have occurred.
+    regression_size: For performance bisects, this is a relative change of
+        the mean metric value. For other bisects this field always contains
+        'zero-to-nonzero'.
+    regression_std_err: For performance bisects, it is a pooled standard error
+        for groups of good and bad runs. Not used for other bisects.
+    confidence: For performance bisects, it is a confidence that the good and
+        bad runs are distinct groups. Not used for non-performance bisects.
+  """
 
-  This score is a percentage which represents our degree of confidence in the
-  proposition that the good results and bad results are distinct groups, and
-  their differences aren't due to chance alone.
+  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
+               runtime_warnings=None, error=None, abort_reason=None):
+    """Computes final bisect results after a bisect run is complete.
 
+    This constructor should be called in one of the following ways:
+      BisectResults(state, depot_registry, opts, runtime_warnings)
+      BisectResults(error=error)
 
-  Args:
-    good_results_lists: A list of lists of "good" result numbers.
-    bad_results_lists: A list of lists of "bad" result numbers.
+    The first option creates an object representing successful bisect results,
+    while the second creates an error result.
 
-  Returns:
-    A number in the range [0, 100].
-  """
-  # If there's only one item in either list, this means only one revision was
-  # classified good or bad; this isn't good enough evidence to make a decision.
-  # If an empty list was passed, that also implies zero confidence.
-  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
-    return 0.0
+    Args:
+      bisect_state: BisectState object representing the latest bisect state.
+      depot_registry: DepotDirectoryRegistry object with information on each
+          repository in the bisect_state.
+      opts: Options passed to the bisect run.
+      runtime_warnings: A list of warnings from the bisect run.
+      error: Error message. When error is not None, other arguments are ignored.
+    """
 
-  # Flatten the lists of results lists.
-  sample1 = sum(good_results_lists, [])
-  sample2 = sum(bad_results_lists, [])
+    self.error = error
+    self.abort_reason = abort_reason
+    if error is not None or abort_reason is not None:
+      return
 
-  # If there were only empty lists in either of the lists (this is unexpected
-  # and normally shouldn't happen), then we also want to return 0.
-  if not sample1 or not sample2:
-    return 0.0
+    assert (bisect_state is not None and depot_registry is not None and
+            opts is not None and runtime_warnings is not None), (
+                'Incorrect use of the BisectResults constructor. When error is '
+                'None, all other arguments are required')
 
-  # The p-value is approximately the probability of obtaining the given set
-  # of good and bad values just by chance.
-  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
-  return 100.0 * (1.0 - p_value)
+    self.state = bisect_state
 
+    rev_states = bisect_state.GetRevisionStates()
+    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
+    self.first_working_revision = first_working_rev
+    self.last_broken_revision = last_broken_rev
 
-class BisectResults(object):
+    self.warnings = runtime_warnings
+
+    if first_working_rev is not None and last_broken_rev is not None:
+      statistics = self._ComputeRegressionStatistics(
+          rev_states, first_working_rev, last_broken_rev)
+
+      self.regression_size = statistics['regression_size']
+      self.regression_std_err = statistics['regression_std_err']
+      self.confidence = statistics['confidence']
+
+      self.culprit_revisions = self._FindCulpritRevisions(
+          rev_states, depot_registry, first_working_rev, last_broken_rev)
 
-  def __init__(self, depot_registry, source_control):
-    self._depot_registry = depot_registry
-    self.revision_data = {}
-    self.error = None
-    self._source_control = source_control
+      self.other_regressions = self._FindOtherRegressions(
+          rev_states, statistics['bad_greater_than_good'])
+
+      self.warnings += self._GetResultBasedWarnings(
+          self.culprit_revisions, opts, self.confidence)
+    elif first_working_rev is not None:
+      # Set these attributes so that the bisect printer does not break when
+      # the regression cannot be reproduced (no broken revision was found).
+      self.regression_size = 0
+      self.regression_std_err = 0
+      self.confidence = 0
+      self.culprit_revisions = []
+      self.other_regressions = []
+
+  @staticmethod
+  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
+    warnings = []
+    if len(culprit_revisions) > 1:
+      warnings.append('Due to build errors, regression range could '
+                      'not be narrowed down to a single commit.')
+    if opts.repeat_test_count == 1:
+      warnings.append('Tests were only set to run once. This may '
+                      'be insufficient to get meaningful results.')
+    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
+      warnings.append('Confidence is not high. Try bisecting again '
+                      'with increased repeat_count, larger range, or '
+                      'on another metric.')
+    if not confidence:
+      warnings.append('Confidence score is 0%. Try bisecting again on '
+                      'another platform or another metric.')
+    return warnings
 
   @staticmethod
-  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
+  def ConfidenceScore(sample1, sample2,
+                      accept_single_bad_or_good=False):
+    """Calculates a confidence score.
+
+    This score is a percentage which represents our degree of confidence in the
+    proposition that the good results and bad results are distinct groups, and
+    their differences aren't due to chance alone.
+
+
+    Args:
+      sample1: A flat list of "good" result numbers.
+      sample2: A flat list of "bad" result numbers.
+      accept_single_bad_or_good: If True, computes confidence even if there is
+          just one bad or good revision; otherwise a single good or bad
+          revision always yields 0.0 confidence. This flag will probably go
+          away once we implement expanding the bisect range by one more
+          revision for such cases.
+
+    Returns:
+      A number in the range [0, 100].
+    """
+    # If there's only one item in either list, this means only one revision was
+    # classified good or bad; this isn't good enough evidence to make a
+    # decision. If an empty list was passed, that also implies zero confidence.
+    if not accept_single_bad_or_good:
+      if len(sample1) <= 1 or len(sample2) <= 1:
+        return 0.0
+
+    # If there were only empty lists in either of the lists (this is unexpected
+    # and normally shouldn't happen), then we also want to return 0.
+    if not sample1 or not sample2:
+      return 0.0
+
+    # The p-value is approximately the probability of obtaining the given set
+    # of good and bad values just by chance.
+    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
+    return 100.0 * (1.0 - p_value)
+
+  @classmethod
+  def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
     """Compiles a list of other possible regressions from the revision data.
 
     Args:
-      revision_data_sorted: Sorted list of (revision, revision data) pairs.
+      revision_states: Sorted list of RevisionState objects.
       bad_greater_than_good: Whether the result value at the "bad" revision is
           numerically greater than the result value at the "good" revision.
 
@@ -69,13 +173,15 @@ class BisectResults(object):
     """
     other_regressions = []
     previous_values = []
-    previous_id = None
-    for current_id, current_data in revision_data_sorted:
-      current_values = current_data['value']
-      if current_values:
-        current_values = current_values['values']
+    prev_state = None
+    for revision_state in revision_states:
+      if revision_state.value:
+        current_values = revision_state.value['values']
         if previous_values:
-          confidence = ConfidenceScore(previous_values, [current_values])
+          confidence_params = (sum(previous_values, []),
+                               sum([current_values], []))
+          confidence = cls.ConfidenceScore(*confidence_params,
+                                           accept_single_bad_or_good=True)
           mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
           mean_of_current_runs = math_utils.Mean(current_values)
 
           # Check whether this local change is in the same direction as the
           # overall regression. Revision states are ordered from bad to good,
           # so if the mean of the previous (newer) runs is greater than the
           # mean of the current (older) runs, this local regression points
           # the same way as an overall increase.
-          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
-          is_same_direction = (prev_less_than_current if
-              bad_greater_than_good else not prev_less_than_current)
+          prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
+          is_same_direction = (prev_greater_than_current if
+              bad_greater_than_good else not prev_greater_than_current)
 
           # Only report potential regressions with high confidence.
           if is_same_direction and confidence > 50:
-            other_regressions.append([current_id, previous_id, confidence])
+            other_regressions.append([revision_state, prev_state, confidence])
       previous_values.append(current_values)
-      previous_id = current_id
+      prev_state = revision_state
 
     return other_regressions
 
-  def GetResultsDict(self):
-    """Prepares and returns information about the final resulsts as a dict.
-
-    Returns:
-      A dictionary with the following fields
-
-      'first_working_revision': First good revision.
-      'last_broken_revision': Last bad revision.
-      'culprit_revisions': A list of revisions, which contain the bad change
-          introducing the failure.
-      'other_regressions': A list of tuples representing other regressions,
-          which may have occured.
-      'regression_size': For performance bisects, this is a relative change of
-          the mean metric value. For other bisects this field always contains
-          'zero-to-nonzero'.
-      'regression_std_err': For performance bisects, it is a pooled standard
-          error for groups of good and bad runs. Not used for other bisects.
-      'confidence': For performance bisects, it is a confidence that the good
-          and bad runs are distinct groups. Not used for non-performance
-          bisects.
-      'revision_data_sorted': dict mapping revision ids to data about that
-          revision. Each piece of revision data consists of a dict with the
-          following keys:
-
-          'passed': Represents whether the performance test was successful at
-              that revision. Possible values include: 1 (passed), 0 (failed),
-              '?' (skipped), 'F' (build failed).
-          'depot': The depot that this revision is from (i.e. WebKit)
-          'external': If the revision is a 'src' revision, 'external' contains
-              the revisions of each of the external libraries.
-          'sort': A sort value for sorting the dict in order of commits.
-
-          For example:
-          {
-            'CL #1':
-              {
-                'passed': False,
-                'depot': 'chromium',
-                'external': None,
-                'sort': 0
-              }
-          }
-    """
-    revision_data_sorted = sorted(self.revision_data.iteritems(),
-                                  key = lambda x: x[1]['sort'])
-
-    # Find range where it possibly broke.
+  @staticmethod
+  def FindBreakingRevRange(revision_states):
     first_working_revision = None
-    first_working_revision_index = -1
     last_broken_revision = None
-    last_broken_revision_index = -1
+
+    for revision_state in revision_states:
+      if revision_state.passed == 1 and not first_working_revision:
+        first_working_revision = revision_state
+
+      if not revision_state.passed:
+        last_broken_revision = revision_state
+
+    return first_working_revision, last_broken_revision
+
+  @staticmethod
+  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
+                            last_broken_rev):
+    cwd = os.getcwd()
     culprit_revisions = []
-    other_regressions = []
-    regression_size = 0.0
-    regression_std_err = 0.0
-    confidence = 0.0
-
-    for i in xrange(len(revision_data_sorted)):
-      k, v = revision_data_sorted[i]
-      if v['passed'] == 1:
-        if not first_working_revision:
-          first_working_revision = k
-          first_working_revision_index = i
-
-      if not v['passed']:
-        last_broken_revision = k
-        last_broken_revision_index = i
-
-    if last_broken_revision != None and first_working_revision != None:
-      broken_means = []
-      for i in xrange(0, last_broken_revision_index + 1):
-        if revision_data_sorted[i][1]['value']:
-          broken_means.append(revision_data_sorted[i][1]['value']['values'])
-
-      working_means = []
-      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
-        if revision_data_sorted[i][1]['value']:
-          working_means.append(revision_data_sorted[i][1]['value']['values'])
-
-      # Flatten the lists to calculate mean of all values.
-      working_mean = sum(working_means, [])
-      broken_mean = sum(broken_means, [])
-
-      # Calculate the approximate size of the regression
-      mean_of_bad_runs = math_utils.Mean(broken_mean)
-      mean_of_good_runs = math_utils.Mean(working_mean)
-
-      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
+    for i in xrange(last_broken_rev.index, first_working_rev.index):
+      depot_registry.ChangeToDepotDir(revision_states[i].depot)
+      info = source_control.QueryRevisionInfo(revision_states[i].revision)
+      culprit_revisions.append((revision_states[i].revision, info,
+                                revision_states[i].depot))
+
+    os.chdir(cwd)
+    return culprit_revisions
+
+  @classmethod
+  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
+                                   last_broken_rev):
+    # TODO(sergiyb): We assume that value has "values" key, which may not be
+    # the case for failure-bisects, where there is a single value only.
+    broken_means = [state.value['values']
+                    for state in rev_states[:last_broken_rev.index+1]
+                    if state.value]
+
+    working_means = [state.value['values']
+                     for state in rev_states[first_working_rev.index:]
+                     if state.value]
+
+    # Flatten the lists to calculate mean of all values.
+    working_mean = sum(working_means, [])
+    broken_mean = sum(broken_means, [])
+
+    # Calculate the approximate size of the regression
+    mean_of_bad_runs = math_utils.Mean(broken_mean)
+    mean_of_good_runs = math_utils.Mean(working_mean)
+
+    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                       mean_of_bad_runs)
-      if math.isnan(regression_size):
-        regression_size = 'zero-to-nonzero'
-
-      regression_std_err = math.fabs(math_utils.PooledStandardError(
-          [working_mean, broken_mean]) /
-          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
-
-      # Give a "confidence" in the bisect. At the moment we use how distinct the
-      # values are before and after the last broken revision, and how noisy the
-      # overall graph is.
-      confidence = ConfidenceScore(working_means, broken_means)
-
-      culprit_revisions = []
-
-      cwd = os.getcwd()
-      self._depot_registry.ChangeToDepotDir(
-          self.revision_data[last_broken_revision]['depot'])
-
-      if self.revision_data[last_broken_revision]['depot'] == 'cros':
-        # Want to get a list of all the commits and what depots they belong
-        # to so that we can grab info about each.
-        cmd = ['repo', 'forall', '-c',
-               'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
-                   last_broken_revision, first_working_revision + 1)]
-        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
-
-        changes = []
-        assert not return_code, ('An error occurred while running '
-                                 '"%s"' % ' '.join(cmd))
-        last_depot = None
-        cwd = os.getcwd()
-        for l in output.split('\n'):
-          if l:
-            # Output will be in form:
-            # /path_to_depot
-            # /path_to_other_depot
-            #
-            # /path_again
-            #
-            # etc.
-            if l[0] == '/':
-              last_depot = l
-            else:
-              contents = l.split(' ')
-              if len(contents) > 1:
-                changes.append([last_depot, contents[0]])
-        for c in changes:
-          os.chdir(c[0])
-          info = self._source_control.QueryRevisionInfo(c[1])
-          culprit_revisions.append((c[1], info, None))
-      else:
-        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
-          k, v = revision_data_sorted[i]
-          if k == first_working_revision:
-            break
-          self._depot_registry.ChangeToDepotDir(v['depot'])
-          info = self._source_control.QueryRevisionInfo(k)
-          culprit_revisions.append((k, info, v['depot']))
-      os.chdir(cwd)
-
-      # Check for any other possible regression ranges.
-      other_regressions = self._FindOtherRegressions(
-          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
-
-    return {
-        'first_working_revision': first_working_revision,
-        'last_broken_revision': last_broken_revision,
-        'culprit_revisions': culprit_revisions,
-        'other_regressions': other_regressions,
-        'regression_size': regression_size,
-        'regression_std_err': regression_std_err,
-        'confidence': confidence,
-        'revision_data_sorted': revision_data_sorted
-    }
+    if math.isnan(regression_size):
+      regression_size = 'zero-to-nonzero'
+
+    regression_std_err = math.fabs(math_utils.PooledStandardError(
+        [working_mean, broken_mean]) /
+        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
+
+    # Give a "confidence" in the bisect. At the moment we use how distinct the
+    # values are before and after the last broken revision, and how noisy the
+    # overall graph is.
+    confidence_params = (sum(working_means, []), sum(broken_means, []))
+    confidence = cls.ConfidenceScore(*confidence_params)
+
+    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
+
+    return {'regression_size': regression_size,
+            'regression_std_err': regression_std_err,
+            'confidence': confidence,
+            'bad_greater_than_good': bad_greater_than_good}
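A note on the ordering that FindBreakingRevRange relies on: the revision-state
list runs from the suspected-bad end of the range to the known-good end, which
is implied by _FindCulpritRevisions walking
xrange(last_broken_rev.index, first_working_rev.index). Below is a toy
walk-through of the same scan, using a hypothetical stand-in for the real
RevisionState class:

# Hypothetical stand-in for RevisionState; only the two fields the scan
# reads are modeled here.
class FakeRevisionState(object):
  def __init__(self, revision, passed):
    self.revision = revision
    self.passed = passed  # 1 = passed, 0 = failed

# Ordered from the bad end toward the good end: two broken revisions
# (r4 is the newest), then two working ones (r1 is the oldest).
states = [FakeRevisionState('r4', 0), FakeRevisionState('r3', 0),
          FakeRevisionState('r2', 1), FakeRevisionState('r1', 1)]

first_working = None
last_broken = None
for state in states:
  # The first passing state encountered is the newest working revision.
  if state.passed == 1 and not first_working:
    first_working = state
  # Keep overwriting so we end up with the last failing state in the list.
  if not state.passed:
    last_broken = state

print('%s %s' % (first_working.revision, last_broken.revision))  # r2 r3

The culprit range is then [last_broken, first_working): here the bisect blames
r3, the oldest failing revision, which is the commit that immediately follows
the newest passing revision (r2) in history.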
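The confidence number itself is one minus the p-value of Welch's t-test,
scaled to a percentage. Here is a minimal standalone sketch of the
ConfidenceScore logic; it assumes that scipy.stats.ttest_ind with
equal_var=False is an acceptable substitute for the repo's ttest.WelchsTTest
(both implement Welch's t-test, which does not assume equal variances
between the two groups):

from scipy import stats

def confidence_score(good_values, bad_values):
  # Mirror the guard above: a single measurement (or none) on either side
  # is not enough evidence to call the groups distinct.
  if len(good_values) <= 1 or len(bad_values) <= 1:
    return 0.0
  # The p-value approximates the probability of seeing this split between
  # the two groups by chance alone.
  _, p_value = stats.ttest_ind(good_values, bad_values, equal_var=False)
  return 100.0 * (1.0 - p_value)

# Two barely-overlapping groups score close to 100.
print(confidence_score([10.1, 10.3, 10.2, 10.0], [12.4, 12.6, 12.5, 12.7]))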
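Likewise, regression_size in _ComputeRegressionStatistics reduces to the
relative change between the mean of the good runs and the mean of the bad
runs, expressed as a percentage. The sketch below uses a hypothetical
re-implementation of math_utils.RelativeChange, written only to match the NaN
handling visible above (a zero-to-nonzero change has no meaningful relative
size); the pooled-standard-error term is omitted since its implementation
lives in math_utils:

import math

def relative_change(before, after):
  # Hypothetical stand-in for math_utils.RelativeChange, assumed to return
  # NaN for a zero-to-nonzero change (matching the isnan check above).
  if before == 0:
    return float('nan') if after != 0 else 0.0
  return abs(after - before) / abs(before)

good_runs = [10.0, 10.0, 10.0]  # hypothetical values from working revisions
bad_runs = [12.0, 12.0, 12.0]   # hypothetical values from broken revisions

mean_of_good_runs = sum(good_runs) / len(good_runs)
mean_of_bad_runs = sum(bad_runs) / len(bad_runs)

regression_size = 100 * relative_change(mean_of_good_runs, mean_of_bad_runs)
if math.isnan(regression_size):
  regression_size = 'zero-to-nonzero'
print(regression_size)  # 20.0: the metric regressed by 20 percent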