src/tools/findit/git_repository_parser.py

   1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 import base64
   6 import xml.dom.minidom as minidom
   7 from xml.parsers.expat import ExpatError
   8
   9 import crash_utils
  10 from repository_parser_interface import ParserInterface
  11
# Single-letter change type codes keyed by file action name. Each code is the
# upper-cased first letter of its action name.
# NOTE(review): no code visible in this file reads this map;
# _ConvertToFileChangeType derives the letter from the action name directly.
# Confirm whether other modules import it before removing.
FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R'
}
  19
  20
  21 def _ConvertToFileChangeType(file_action):
  22   # TODO(stgao): verify impact on code that checks the file change type.
  23   return file_action[0].upper()
  24
  25
  26 class GitParser(ParserInterface):
  27   """Parser for Git repository in googlesource.
  28
  29   Attributes:
  30     parsed_deps: A map from component path to its repository name, regression,
  31                  etc.
  32     url_parts_map: A map from url type to its url parts. This parts are added
  33                    the base url to form different urls.
  34   """
  35
  36   def __init__(self, parsed_deps, url_parts_map):
  37     self.component_to_url_map = parsed_deps
  38     self.url_parts_map = url_parts_map
  39
  40   def ParseChangelog(self, component_path, range_start, range_end):
  41     file_to_revision_map = {}
  42     revision_map = {}
  43     base_url = self.component_to_url_map[component_path]['repository']
  44     changelog_url = base_url + self.url_parts_map['changelog_url']
  45     revision_url = base_url + self.url_parts_map['revision_url']
  46
  47     # Retrieve data from the url, return empty maps if fails. Html url is a\
  48     # url where the changelog can be parsed from html.
  49     url = changelog_url % (range_start, range_end)
  50     html_url = url + '?pretty=fuller'
  51     response = crash_utils.GetDataFromURL(html_url)
  52     if not response:
  53       return (revision_map, file_to_revision_map)
  54
  55     # Parse xml out of the returned string. If it failes, Try parsing
  56     # from JSON objects.
  57     try:
  58       dom = minidom.parseString(response)
  59     except ExpatError:
  60       self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
  61                                   revision_url, revision_map,
  62                                   file_to_revision_map)
  63       return (revision_map, file_to_revision_map)
  64
  65     # The revisions information are in from the third divs to the second
  66     # to last one.
  67     divs = dom.getElementsByTagName('div')[2:-1]
  68     pres = dom.getElementsByTagName('pre')
  69     uls = dom.getElementsByTagName('ul')
  70
  71     # Divs, pres and uls each contain revision information for one CL, so
  72     # they should have same length.
  73     if not divs or len(divs) != len(pres) or len(pres) != len(uls):
  74       self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
  75                                   revision_url, revision_map,
  76                                   file_to_revision_map)
  77       return (revision_map, file_to_revision_map)
  78
  79     # Iterate through divs and parse revisions
  80     for (div, pre, ul) in zip(divs, pres, uls):
  81       # Create new revision object for each revision.
  82       revision = {}
  83
  84       # There must be three <tr>s. If not, this page is wrong.
  85       trs = div.getElementsByTagName('tr')
  86       if len(trs) != 3:
  87         continue
  88
  89       # Retrieve git hash.
  90       githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue
  91
  92       # Retrieve and set author.
  93       author = trs[1].getElementsByTagName(
  94           'td')[0].firstChild.nodeValue.split('<')[0]
  95       revision['author'] = author
  96       revision['time'] = trs[1].getElementsByTagName(
  97           'td')[1].firstChild.nodeValue
  98
  99       # Retrive and set message.
 100       revision['message'] = pre.firstChild.nodeValue
 101
 102       # Set url of this CL.
 103       revision_url_part = self.url_parts_map['revision_url'] % githash
 104       revision['url'] = base_url + revision_url_part
 105
 106       # Go through changed files, they are in li.
 107       lis = ul.getElementsByTagName('li')
 108       for li in lis:
 109         # Retrieve path and action of the changed file
 110         file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
 111         file_change_type = li.getElementsByTagName('span')[
 112             0].getAttribute('class')
 113
 114         # Normalize file action so that it is same as SVN parser.
 115         file_change_type = _ConvertToFileChangeType(file_change_type)
 116
 117         # Add the changed file to the map.
 118         if file_path not in file_to_revision_map:
 119           file_to_revision_map[file_path] = []
 120         file_to_revision_map[file_path].append((githash, file_change_type))
 121
 122       # Add this revision object to the map.
 123       revision_map[githash] = revision
 124
 125     # Parse one revision for the start range, because googlesource does not
 126     # include the start of the range.
 127     self.ParseRevision(revision_url, range_start, revision_map,
 128                        file_to_revision_map)
 129
 130     return (revision_map, file_to_revision_map)
 131
 132   def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
 133                              revision_url, revision_map, file_to_revision_map):
 134     """Parses changelog by going over the JSON file.
 135
 136     Args:
 137       range_start: Starting range of the regression.
 138       range_end: Ending range of the regression.
 139       changelog_url: The url to retrieve changelog from.
 140       revision_url: The url to retrieve individual revision from.
 141       revision_map: A map from a git hash number to its revision information.
 142       file_to_revision_map: A map from file to a git hash in which it occurs.
 143     """
 144     # Compute URLs from given range, and retrieves changelog. Stop if it fails.
 145     changelog_url %= (range_start, range_end)
 146     json_url = changelog_url + '?format=json'
 147     response = crash_utils.GetDataFromURL(json_url)
 148     if not response:
 149       return
 150
 151     # Parse changelog from the returned object. The returned string should
 152     # start with ")}]'\n", so start from the 6th character.
 153     revisions = crash_utils.LoadJSON(response[5:])
 154     if not revisions:
 155       return
 156
 157     # Parse individual revision in the log.
 158     for revision in revisions['log']:
 159       githash = revision['commit']
 160       self.ParseRevision(revision_url, githash, revision_map,
 161                          file_to_revision_map)
 162
 163     # Parse the revision with range_start, because googlesource ignores
 164     # that one.
 165     self.ParseRevision(revision_url, range_start, revision_map,
 166                        file_to_revision_map)
 167
 168   def ParseRevision(self, revision_url, githash, revision_map,
 169                     file_to_revision_map):
 170
 171     # Retrieve data from the URL, return if it fails.
 172     url = revision_url % githash
 173     response = crash_utils.GetDataFromURL(url + '?format=json')
 174     if not response:
 175       return
 176
 177     # Load JSON object from the string. If it fails, terminate the function.
 178     json_revision = crash_utils.LoadJSON(response[5:])
 179     if not json_revision:
 180       return
 181
 182     # Create a map representing object and get githash from the JSON object.
 183     revision = {}
 184     githash = json_revision['commit']
 185
 186     # Set author, message and URL of this CL.
 187     revision['author'] = json_revision['author']['name']
 188     revision['time'] = json_revision['author']['time']
 189     revision['message'] = json_revision['message']
 190     revision['url'] = url
 191
 192     # Iterate through the changed files.
 193     for diff in json_revision['tree_diff']:
 194       file_path = diff['new_path']
 195       file_change_type = diff['type']
 196
 197       # Normalize file action so that it fits with svn_repository_parser.
 198       file_change_type = _ConvertToFileChangeType(file_change_type)
 199
 200       # Add the file to the map.
 201       if file_path not in file_to_revision_map:
 202         file_to_revision_map[file_path] = []
 203       file_to_revision_map[file_path].append((githash, file_change_type))
 204
 205     # Add this CL to the map.
 206     revision_map[githash] = revision
 207
 208     return
 209
 210   def ParseLineDiff(self, path, component, file_change_type, githash):
 211     changed_line_numbers = []
 212     changed_line_contents = []
 213     base_url = self.component_to_url_map[component]['repository']
 214     backup_url = (base_url + self.url_parts_map['revision_url']) % githash
 215
 216     # If the file is added (not modified), treat it as if it is not changed.
 217     if file_change_type in ('A', 'C', 'R'):
 218       # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
 219       return (backup_url, changed_line_numbers, changed_line_contents)
 220
 221     # Retrieves the diff data from URL, and if it fails, return emptry lines.
 222     url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
 223     data = crash_utils.GetDataFromURL(url + '?format=text')
 224     if not data:
 225       return (backup_url, changed_line_numbers, changed_line_contents)
 226
 227     # Decode the returned object to line diff info
 228     diff = base64.b64decode(data).splitlines()
 229
 230     # Iterate through the lines in diff. Set current line to -1 so that we know
 231     # that current line is part of the diff chunk.
 232     current_line = -1
 233     for line in diff:
 234       line = line.strip()
 235
 236       # If line starts with @@, a new chunk starts.
 237       if line.startswith('@@'):
 238         current_line = int(line.split('+')[1].split(',')[0])
 239
 240       # If we are in a chunk.
 241       elif current_line != -1:
 242         # If line is either added or modified.
 243         if line.startswith('+'):
 244           changed_line_numbers.append(current_line)
 245           changed_line_contents.append(line[2:])
 246
 247         # Do not increment current line if the change is 'delete'.
 248         if not line.startswith('-'):
 249           current_line += 1
 250
 251     # Return url without '?format=json'
 252     return (url, changed_line_numbers, changed_line_contents)
 253
 254   def ParseBlameInfo(self, component, file_path, line, revision):
 255     base_url = self.component_to_url_map[component]['repository']
 256
 257     # Retrieve blame JSON file from googlesource. If it fails, return None.
 258     url_part = self.url_parts_map['blame_url'] % (revision, file_path)
 259     blame_url = base_url + url_part
 260     json_string = crash_utils.GetDataFromURL(blame_url)
 261     if not json_string:
 262       return
 263
 264     # Parse JSON object from the string. The returned string should
 265     # start with ")}]'\n", so start from the 6th character.
 266     annotation = crash_utils.LoadJSON(json_string[5:])
 267     if not annotation:
 268       return
 269
 270     # Go through the regions, which is a list of consecutive lines with same
 271     # author/revision.
 272     for blame_line in annotation['regions']:
 273       start = blame_line['start']
 274       count = blame_line['count']
 275
 276       # For each region, check if the line we want the blame info of is in this
 277       # region.
 278       if start <= line and line <= start + count - 1:
 279         # If we are in the right region, get the information from the line.
 280         revision = blame_line['commit']
 281         author = blame_line['author']['name']
 282         revision_url_parts = self.url_parts_map['revision_url'] % revision
 283         revision_url = base_url + revision_url_parts
 284         # TODO(jeun): Add a way to get content from JSON object.
 285         content = None
 286
 287         (revision_info, _) = self.ParseChangelog(component, revision, revision)
 288         message = revision_info[revision]['message']
 289         time = revision_info[revision]['time']
 290         return (content, revision, author, revision_url, message, time)
 291
 292     # Return none if the region does not exist.
 293     return None