src/tools/findit/crash_utils.py

   1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 import atexit
   6 import cgi
   7 import ConfigParser
   8 import json
   9 import os
  10 import Queue
  11 import threading
  12 import time
  13
  14 from common import utils
  15 from result import Result
  16
  17
  18 INFINITY = float('inf')
  19
  20 MAX_THREAD_NUMBER = 10
  21 TASK_QUEUE = None
  22
  23
  24 def SignalWorkerThreads():
  25   global TASK_QUEUE
  26   if not TASK_QUEUE:
  27     return
  28
  29   for i in range(MAX_THREAD_NUMBER):
  30     TASK_QUEUE.put(None)
  31
  32   # Give worker threads a chance to exit.
  33   # Workaround the harmless bug in python 2.7 below.
  34   time.sleep(1)
  35
  36
  37 atexit.register(SignalWorkerThreads)
  38
  39
  40 def Worker():
  41   global TASK_QUEUE
  42   while True:
  43     try:
  44       task = TASK_QUEUE.get()
  45       if not task:
  46         return
  47     except TypeError:
  48       # According to http://bugs.python.org/issue14623, this is a harmless bug
  49       # in python 2.7 which won't be fixed.
  50       # The exception is raised on daemon threads when python interpreter is
  51       # shutting down.
  52       return
  53
  54     function, args, kwargs, result_semaphore = task
  55     try:
  56       function(*args, **kwargs)
  57     except:
  58       pass
  59     finally:
  60       # Signal one task is done in case of exception.
  61       result_semaphore.release()
  62
  63
  64 def RunTasks(tasks):
  65   """Run given tasks. Not thread-safe: no concurrent calls of this function.
  66
  67   Return after all tasks were completed. A task is a dict as below:
  68     {
  69       'function': the function to call,
  70       'args': the positional argument to pass to the function,
  71       'kwargs': the key-value arguments to pass to the function,
  72     }
  73   """
  74   if not tasks:
  75     return
  76
  77   global TASK_QUEUE
  78   if not TASK_QUEUE:
  79     TASK_QUEUE = Queue.Queue()
  80     for index in range(MAX_THREAD_NUMBER):
  81       thread = threading.Thread(target=Worker, name='worker_%s' % index)
  82       # Set as daemon, so no join is needed.
  83       thread.daemon = True
  84       thread.start()
  85
  86   result_semaphore = threading.Semaphore(0)
  87   # Push task to task queue for execution.
  88   for task in tasks:
  89     TASK_QUEUE.put(
  90         (task['function'], task.get('args', []),
  91          task.get('kwargs', {}), result_semaphore))
  92
  93   # Wait until all tasks to be executed.
  94   for _ in tasks:
  95     result_semaphore.acquire()
  96
  97
  98 def GetRepositoryType(revision_number):
  99   """Returns the repository type of this revision number.
 100
 101   Args:
 102     revision_number: A revision number or git hash.
 103
 104   Returns:
 105     'git' or 'svn', depending on the revision_number.
 106   """
 107   if utils.IsGitHash(revision_number):
 108     return 'git'
 109   else:
 110     return 'svn'
 111
 112
 113 def ParseURLsFromConfig(file_name):
 114   """Parses URLS from the config file.
 115
 116   The file should be in python config format, where svn section is in the
 117   format "svn:component_path".
 118   Each of the section for svn should contain changelog_url, revision_url,
 119   diff_url and blame_url.
 120
 121   Args:
 122     file_name: The name of the file that contains URL information.
 123
 124   Returns:
 125     A dictionary that maps repository type to list of URLs. For svn, it maps
 126     key 'svn' to another dictionary, which maps component path to the URLs
 127     as explained above. For git, it maps to the URLs as explained above.
 128   """
 129   config = ConfigParser.ConfigParser()
 130
 131   # Get the absolute path of the config file, and read the file. If it fails,
 132   # return none.
 133   config_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
 134                                   file_name)
 135   config.read(config_file_path)
 136   if not config:
 137     return None
 138
 139   # Iterate through the config file, check for sections.
 140   config_dict = {}
 141   for section in config.sections():
 142     # These two do not need another layer of dictionary, so add it and go
 143     # to next section.
 144     if ':' not in section:
 145       for option in config.options(section):
 146         if section not in config_dict:
 147           config_dict[section] = {}
 148
 149         url = config.get(section, option)
 150         config_dict[section][option] = url
 151
 152       continue
 153
 154     # Get repository type and component name from the section name.
 155     repository_type_and_component = section.split(':')
 156     repository_type = repository_type_and_component[0]
 157     component_path = repository_type_and_component[1]
 158
 159     # Add 'svn' as the key, if it is not already there.
 160     if repository_type not in config_dict:
 161       config_dict[repository_type] = {}
 162     url_map_for_repository = config_dict[repository_type]
 163
 164     # Add the path to the 'svn', if it is not already there.
 165     if component_path not in url_map_for_repository:
 166       url_map_for_repository[component_path] = {}
 167     type_to_url = url_map_for_repository[component_path]
 168
 169     # Add all URLs to this map.
 170     for option in config.options(section):
 171       url = config.get(section, option)
 172       type_to_url[option] = url
 173
 174   return config_dict
 175
 176
 177 def NormalizePath(path, parsed_deps):
 178   """Normalizes the path.
 179
 180   Args:
 181     path: A string representing a path.
 182     parsed_deps: A map from component path to its component name, repository,
 183                  etc.
 184
 185   Returns:
 186     A tuple containing a component this path is in (e.g blink, skia, etc)
 187     and a path in that component's repository. Returns None if the component
 188     repository is not supported, i.e from googlecode.
 189   """
 190   # First normalize the path by retreiving the normalized path.
 191   normalized_path = os.path.normpath(path).replace('\\', '/')
 192
 193   # Iterate through all component paths in the parsed DEPS, in the decreasing
 194   # order of the length of the file path.
 195   for component_path in sorted(parsed_deps,
 196                                key=(lambda path: -len(path))):
 197     # new_component_path is the component path with 'src/' removed.
 198     new_component_path = component_path
 199     if new_component_path.startswith('src/') and new_component_path != 'src/':
 200       new_component_path = new_component_path[len('src/'):]
 201
 202     # We need to consider when the lowercased component path is in the path,
 203     # because syzyasan build returns lowercased file path.
 204     lower_component_path = new_component_path.lower()
 205
 206     # If this path is the part of file path, this file must be from this
 207     # component.
 208     if new_component_path in normalized_path or \
 209         lower_component_path in normalized_path:
 210
 211       # Case when the retreived path is in lowercase.
 212       if lower_component_path in normalized_path:
 213         current_component_path = lower_component_path
 214       else:
 215         current_component_path = new_component_path
 216
 217       # Normalize the path by stripping everything off the component's relative
 218       # path.
 219       normalized_path = normalized_path.split(current_component_path, 1)[1]
 220       lower_normalized_path = normalized_path.lower()
 221
 222       # Add 'src/' or 'Source/' at the front of the normalized path, depending
 223       # on what prefix the component path uses. For example, blink uses
 224       # 'Source' but chromium uses 'src/', and blink component path is
 225       # 'src/third_party/WebKit/Source', so add 'Source/' in front of the
 226       # normalized path.
 227       if not (lower_normalized_path.startswith('src/') or
 228               lower_normalized_path.startswith('source/')):
 229
 230         if (lower_component_path.endswith('src/') or
 231             lower_component_path.endswith('source/')):
 232           normalized_path = (current_component_path.split('/')[-2] + '/' +
 233                              normalized_path)
 234
 235         else:
 236           normalized_path = 'src/' + normalized_path
 237
 238       component_name = parsed_deps[component_path]['name']
 239
 240       return (component_path, component_name, normalized_path)
 241
 242   # If the path does not match any component, default to chromium.
 243   return ('src/', 'chromium', normalized_path)
 244
 245
 246 def SplitRange(regression):
 247   """Splits a range as retrieved from clusterfuzz.
 248
 249   Args:
 250     regression: A string in format 'r1234:r5678'.
 251
 252   Returns:
 253     A list containing two numbers represented in string, for example
 254     ['1234','5678'].
 255   """
 256   if not regression:
 257     return None
 258
 259   revisions = regression.split(':')
 260
 261   # If regression information is not available, return none.
 262   if len(revisions) != 2:
 263     return None
 264
 265   range_start = revisions[0]
 266   range_end = revisions[1]
 267
 268   # Strip 'r' off the range start/end. Not using lstrip to avoid the case when
 269   # the range is in git hash and it starts with 'r'.
 270   if range_start.startswith('r'):
 271     range_start = range_start[1:]
 272
 273   if range_end.startswith('r'):
 274     range_end = range_end[1:]
 275
 276   return [range_start, range_end]
 277
 278
 279 def LoadJSON(json_string):
 280   """Loads json object from string, or None.
 281
 282   Args:
 283     json_string: A string to get object from.
 284
 285   Returns:
 286     JSON object if the string represents a JSON object, None otherwise.
 287   """
 288   try:
 289     data = json.loads(json_string)
 290   except ValueError:
 291     data = None
 292
 293   return data
 294
 295
 296 def GetDataFromURL(url):
 297   """Retrieves raw data from URL, tries 10 times.
 298
 299   Args:
 300     url: URL to get data from.
 301     retries: Number of times to retry connection.
 302
 303   Returns:
 304     None if the data retrieval fails, or the raw data.
 305   """
 306   status_code, data = utils.GetHttpClient().Get(url, retries=10)
 307   if status_code == 200:
 308     return data
 309   else:
 310     # Return None if it fails to read data.
 311     return None
 312
 313
 314 def FindMinLineDistance(crashed_line_list, changed_line_numbers,
 315                         line_range=3):
 316   """Calculates how far the changed line is from one of the crashes.
 317
 318   Finds the minimum distance between the lines that the file crashed on
 319   and the lines that the file changed. For example, if the file crashed on
 320   line 200 and the CL changes line 203,204 and 205, the function returns 3.
 321
 322   Args:
 323     crashed_line_list: A list of lines that the file crashed on.
 324     changed_line_numbers: A list of lines that the file changed.
 325     line_range: Number of lines to look back for.
 326
 327   Returns:
 328     The minimum distance. If either of the input lists is empty,
 329     it returns inf.
 330
 331   """
 332   min_distance = INFINITY
 333   crashed_line = -1
 334   changed_line = -1
 335
 336   crashed_line_numbers = set()
 337   for crashed_line_range in crashed_line_list:
 338     for crashed_line in crashed_line_range:
 339       for line in range(crashed_line - line_range, crashed_line + 1):
 340         crashed_line_numbers.add(line)
 341
 342   for line in crashed_line_numbers:
 343     for distance in changed_line_numbers:
 344       # Find the current distance and update the min if current distance is
 345       # less than current min.
 346       current_distance = abs(line - distance)
 347       if current_distance < min_distance:
 348         min_distance = current_distance
 349         crashed_line = line
 350         changed_line = distance
 351
 352   return (min_distance, crashed_line, changed_line)
 353
 354
 355 def GuessIfSameSubPath(path1, path2):
 356   """Guesses if two paths represent same path.
 357
 358   Compares the name of the folders in the path (by split('/')), and checks
 359   if they match either more than 3 or min of path lengths.
 360
 361   Args:
 362     path1: First path.
 363     path2: Second path to compare.
 364
 365   Returns:
 366     True if it they are thought to be a same path, False otherwise.
 367   """
 368   path1 = path1.split('/')
 369   path2 = path2.split('/')
 370
 371   intersection = set(path1).intersection(set(path2))
 372   return len(intersection) >= (min(3, min(len(path1), len(path2))))
 373
 374
 375 def FindMinStackFrameNumber(stack_frame_indices, priorities):
 376   """Finds the minimum stack number, from the list of stack numbers.
 377
 378   Args:
 379     stack_frame_indices: A list of lists containing stack position.
 380     priorities: A list of of priority for each file.
 381
 382   Returns:
 383     Inf if stack_frame_indices is empty, minimum stack number otherwise.
 384   """
 385   # Get the indexes of the highest priority (or low priority number).
 386   highest_priority = min(priorities)
 387   highest_priority_indices = []
 388   for i in range(len(priorities)):
 389     if priorities[i] == highest_priority:
 390       highest_priority_indices.append(i)
 391
 392   # Gather the list of stack frame numbers for the files that change the
 393   # crash lines.
 394   flattened = []
 395   for i in highest_priority_indices:
 396     flattened += stack_frame_indices[i]
 397
 398   # If no stack frame information is available, return inf. Else, return min.
 399   if not flattened:
 400     return INFINITY
 401   else:
 402     return min(flattened)
 403
 404
 405 def AddHyperlink(text, link):
 406   """Returns a string with HTML link tag.
 407
 408   Args:
 409     text: A string to add link.
 410     link: A link to add to the string.
 411
 412   Returns:
 413     A string with hyperlink added.
 414   """
 415   sanitized_link = cgi.escape(link, quote=True)
 416   sanitized_text = cgi.escape(str(text))
 417   return '<a href="%s">%s</a>' % (sanitized_link, sanitized_text)
 418
 419
 420 def PrettifyList(items):
 421   """Returns a string representation of a list.
 422
 423   It adds comma in between the elements and removes the brackets.
 424   Args:
 425     items: A list to prettify.
 426   Returns:
 427     A string representation of the list.
 428   """
 429   return ', '.join(map(str, items))
 430
 431
 432 def PrettifyFrameInfo(frame_indices, functions):
 433   """Return a string to represent the frames with functions."""
 434   frames = []
 435   for frame_index, function in zip(frame_indices, functions):
 436     frames.append('frame #%s, "%s"' % (frame_index, function.split('(')[0]))
 437   return '; '.join(frames)
 438
 439
 440 def PrettifyFiles(file_list):
 441   """Returns a string representation of a list of file names.
 442
 443   Args:
 444     file_list: A list of tuple, (file_name, file_url).
 445   Returns:
 446     A string representation of file names with their urls.
 447   """
 448   ret = ['\n']
 449   for file_name, file_url in file_list:
 450     ret.append('      %s\n' % AddHyperlink(file_name, file_url))
 451   return ''.join(ret)
 452
 453
 454 def Intersection(crashed_line_list, stack_frame_index, changed_line_numbers,
 455                  function, line_range=3):
 456   """Finds the overlap betwee changed lines and crashed lines.
 457
 458   Finds the intersection of the lines that caused the crash and
 459   lines that the file changes. The intersection looks within 3 lines
 460   of the line that caused the crash.
 461
 462   Args:
 463     crashed_line_list: A list of lines that the file crashed on.
 464     stack_frame_index: A list of positions in stack for each of the lines.
 465     changed_line_numbers: A list of lines that the file changed.
 466     function: A list of functions that the file crashed on.
 467     line_range: Number of lines to look backwards from crashed lines.
 468
 469   Returns:
 470     line_number_intersection: Intersection between crashed_line_list and
 471                        changed_line_numbers.
 472     stack_frame_index_intersection: Stack number for each of the intersections.
 473   """
 474   line_number_intersection = []
 475   stack_frame_index_intersection = []
 476   function_intersection = []
 477
 478   # Iterate through the crashed lines, and its occurence in stack.
 479   for (lines, stack_frame_index, function_name) in zip(
 480       crashed_line_list, stack_frame_index, function):
 481     # Also check previous 'line_range' lines. Create a set of all changed lines
 482     # and lines within 3 lines range before the crashed line.
 483     line_minus_n = set()
 484     for line in lines:
 485       for line_in_range in range(line - line_range, line + 1):
 486         line_minus_n.add(line_in_range)
 487
 488     for changed_line in changed_line_numbers:
 489       # If a CL does not change crahsed line, check next line.
 490       if changed_line not in line_minus_n:
 491         continue
 492
 493       intersected_line = set()
 494       # If the changed line is exactly the crashed line, add that line.
 495       for line in lines:
 496         if line in changed_line_numbers:
 497           intersected_line.add(line)
 498
 499         # If the changed line is in 3 lines of the crashed line, add the line.
 500         else:
 501           intersected_line.add(changed_line)
 502
 503       # Avoid adding the same line twice.
 504       if intersected_line not in line_number_intersection:
 505         line_number_intersection.append(list(intersected_line))
 506         stack_frame_index_intersection.append(stack_frame_index)
 507         function_intersection.append(function_name)
 508       break
 509
 510   return (line_number_intersection, stack_frame_index_intersection,
 511           function_intersection)
 512
 513
 514 def MatchListToResultList(matches):
 515   """Convert list of matches to the list of result objects.
 516
 517   Args:
 518     matches: A list of match objects along with its stack priority and revision
 519              number/git hash
 520   Returns:
 521     A list of result object.
 522
 523   """
 524   result_list = []
 525
 526   for _, cl, match in matches:
 527     suspected_cl = cl
 528     revision_url = match.revision_url
 529     component_name = match.component_name
 530     author = match.author
 531     reason = match.reason
 532     review_url = match.review_url
 533     reviewers = match.reviewers
 534     # For matches, line content do not exist.
 535     line_content = None
 536     message = match.message
 537
 538     result = Result(suspected_cl, revision_url, component_name, author, reason,
 539                     review_url, reviewers, line_content, message)
 540     result_list.append(result)
 541
 542   return result_list
 543
 544
 545 def BlameListToResultList(blame_list):
 546   """Convert blame list to the list of result objects.
 547
 548   Args:
 549     blame_list: A list of blame objects.
 550
 551   Returns:
 552     A list of result objects.
 553   """
 554   result_list = []
 555
 556   for blame in blame_list:
 557     suspected_cl = blame.revision
 558     revision_url = blame.url
 559     component_name = blame.component_name
 560     author = blame.author
 561     reason = (
 562         'The CL last changed line %s of file %s, which is stack frame %d.' %
 563         (blame.line_number, blame.file, blame.stack_frame_index))
 564     # Blame object does not have review url and reviewers.
 565     review_url = None
 566     reviewers = None
 567     line_content = blame.line_content
 568     message = blame.message
 569
 570     result = Result(suspected_cl, revision_url, component_name, author, reason,
 571                     review_url, reviewers, line_content, message)
 572     result_list.append(result)
 573
 574   return result_list