tools/add_header.py

   1 #!/usr/bin/env python3
   2 # Copyright 2021 The Chromium Authors
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5 """Helper for adding or removing an include to/from source file(s).
   6
   7 clang-format already provides header sorting functionality; however, the
   8 functionality is limited to sorting headers within a block of headers surrounded
   9 by blank lines (these are a heuristic to avoid clang breaking ordering for
  10 headers sensitive to inclusion order, e.g. <windows.h>).
  11
  12 As a result, inserting a new header is a bit more complex than simply inserting
  13 the new header at the top and running clang-format.
  14
  15 This script implements additional logic to:
  16 - classify different blocks of headers by type (C system, C++ system, user)
  17 - find the appropriate insertion point for the new header
  18 - create a new header block if necessary
  19
  20 As a bonus, it does *also* sort the includes, though any sorting disagreements
  21 with clang-format should be resolved in favor of clang-format.
  22
  23 It also supports removing a header with option `--remove`.
  24
  25 Usage:
  26 tools/add_header.py --header '<utility>' foo/bar.cc foo/baz.cc foo/baz.h
  27 tools/add_header.py --header '<vector>' --remove foo/bar.cc foo/baz.cc foo/baz.h
  28 """
  29
  30 import argparse
  31 import difflib
  32 import os.path
  33 import re
  34 import sys
  35
# The specific values of these constants are also used as a sort key for
# ordering different header types in the correct relative order.
_HEADER_TYPE_C_SYSTEM = 0  # <>-decorated header whose name ends in '.h'.
_HEADER_TYPE_CXX_SYSTEM = 1  # <>-decorated header without a '.h' suffix.
_HEADER_TYPE_USER = 2  # ""-decorated header.
# Returned by ClassifyHeader() when the name matches none of the above.
_HEADER_TYPE_INVALID = -1
  42
  43
  44 def ClassifyHeader(decorated_name):
  45   if IsCSystemHeader(decorated_name):
  46     return _HEADER_TYPE_C_SYSTEM
  47   elif IsCXXSystemHeader(decorated_name):
  48     return _HEADER_TYPE_CXX_SYSTEM
  49   elif IsUserHeader(decorated_name):
  50     return _HEADER_TYPE_USER
  51   else:
  52     return _HEADER_TYPE_INVALID
  53
  54
  55 def UndecoratedName(decorated_name):
  56   """Returns the undecorated version of decorated_name by removing "" or <>."""
  57   assert IsSystemHeader(decorated_name) or IsUserHeader(decorated_name)
  58   return decorated_name[1:-1]
  59
  60
  61 def IsSystemHeader(decorated_name):
  62   """Returns true if decorated_name looks like a system header."""
  63   return decorated_name[0] == '<' and decorated_name[-1] == '>'
  64
  65
  66 def IsCSystemHeader(decorated_name):
  67   """Returns true if decoraed_name looks like a C system header."""
  68   return IsSystemHeader(decorated_name) and UndecoratedName(
  69       decorated_name).endswith('.h')
  70
  71
  72 def IsCXXSystemHeader(decorated_name):
  73   """Returns true if decoraed_name looks like a C++ system header."""
  74   return IsSystemHeader(
  75       decorated_name) and not UndecoratedName(decorated_name).endswith('.h')
  76
  77
  78 def IsUserHeader(decorated_name):
  79   """Returns true if decoraed_name looks like a user header."""
  80   return decorated_name[0] == '"' and decorated_name[-1] == '"'
  81
  82
# Matches a line that is empty or contains only whitespace.
_EMPTY_LINE_RE = re.compile(r'\s*$')
# Matches a line holding only a // comment (optionally indented). Group 1 is
# the comment text following the slashes.
_COMMENT_RE = re.compile(r'\s*//(.*)$')
# Matches an #include or #import directive (optionally indented).
#   group 1: the directive name, 'import' or 'include'
#   group 2: the decorated header name, e.g. <vector> or "base/foo.h"
#   group 3: the text of a trailing // comment, or None if there is none
_INCLUDE_RE = re.compile(
    r'\s*#(import|include)\s+([<"].+?[">])\s*?(?://(.*))?$')
  87
  88
  89 def FindIncludes(lines):
  90   """Finds the block of #includes, assuming Google+Chrome C++ style source.
  91
  92   Note that this doesn't simply return a slice of the input lines, because
  93   having the actual indices simplifies things when generatingn the updated
  94   source text.
  95
  96   Args:
  97     lines: The source text split into lines.
  98
  99   Returns:
 100     A tuple of begin, end indices that can be used to slice the input lines to
 101         contain the includes to process. Returns -1, -1 if no such block of
 102         input lines could be found.
 103   """
 104   begin = end = -1
 105   for idx, line in enumerate(lines):
 106     # Skip over any initial comments (e.g. the copyright boilerplate) or empty
 107     # lines.
 108     # TODO(dcheng): This means that any preamble comment associated with the
 109     # first header will be dropped. So far, this hasn't broken anything, but
 110     # maybe this needs to be more clever.
 111     # TODO(dcheng): #define and #undef should probably also be allowed.
 112     if _EMPTY_LINE_RE.match(line) or _COMMENT_RE.match(line):
 113       continue
 114     m = _INCLUDE_RE.match(line)
 115     if not m:
 116       if begin < 0:
 117         # No match, but no #includes have been seen yet. Keep scanning for the
 118         # first #include.
 119         continue
 120       # Give up, it's something weird that probably requires manual
 121       # intervention.
 122       break
 123
 124     if begin < 0:
 125       begin = idx
 126     end = idx + 1
 127   return begin, end
 128
 129
 130 class Include(object):
 131   """Represents an #include/#import and any interesting metadata for it.
 132
 133   Attributes:
 134     decorated_name: The name of the header file, decorated with <> for system
 135       headers or "" for user headers.
 136
 137     directive: 'include' or 'import'
 138       TODO(dcheng): In the future, this may need to support C++ modules.
 139
 140     preamble: Any comment lines that precede this include line, e.g.:
 141
 142         // This is a preamble comment
 143         // for a header file.
 144         #include <windows.h>
 145
 146       would have a preamble of
 147
 148         ['// This is a preamble comment', '// for a header file.'].
 149
 150     inline_comment: Any comment that comes after the #include on the same line,
 151       e.g.
 152
 153         #include <windows.h>  // For CreateWindowExW()
 154
 155       would be parsed with an inline comment of ' For CreateWindowExW'.
 156
 157     header_type: The header type corresponding to decorated_name as determined
 158       by ClassifyHeader().
 159
 160     is_primary_header: True if this is the primary related header of a C++
 161       implementation file. Any primary header will be sorted to the top in its
 162       own separate block.
 163   """
 164
 165   def __init__(self, decorated_name, directive, preamble, inline_comment):
 166     self.decorated_name = decorated_name
 167     assert directive == 'include' or directive == 'import'
 168     self.directive = directive
 169     self.preamble = preamble
 170     self.inline_comment = inline_comment
 171     self.header_type = ClassifyHeader(decorated_name)
 172     assert self.header_type != _HEADER_TYPE_INVALID
 173     self.is_primary_header = False
 174
 175   def __repr__(self):
 176     return str((self.decorated_name, self.directive, self.preamble,
 177                 self.inline_comment, self.header_type, self.is_primary_header))
 178
 179   def ShouldInsertNewline(self, previous_include):
 180     # Per the Google C++ style guide, different blocks of headers should be
 181     # separated by an empty line.
 182     return (self.is_primary_header != previous_include.is_primary_header
 183             or self.header_type != previous_include.header_type)
 184
 185   def ToSource(self):
 186     """Generates a C++ source representation of this include."""
 187     source = []
 188     source.extend(self.preamble)
 189     include_line = '#%s %s' % (self.directive, self.decorated_name)
 190     if self.inline_comment:
 191       include_line = include_line + '  //' + self.inline_comment
 192     source.append(include_line)
 193     return [line.rstrip() for line in source]
 194
 195
 196 def ParseIncludes(lines):
 197   """Parses lines into a list of Include objects. Returns None on failure.
 198
 199   Args:
 200     lines: A list of strings representing C++ source text.
 201
 202   Returns:
 203     A list of Include objects representing the parsed input lines, or None if
 204     the input lines could not be parsed.
 205   """
 206   includes = []
 207   preamble = []
 208   for line in lines:
 209     if _EMPTY_LINE_RE.match(line):
 210       if preamble:
 211         # preamble contents are flushed when an #include directive is matched.
 212         # If preamble is non-empty, that means there is a preamble separated
 213         # from its #include directive by at least one newline. Just give up,
 214         # since the sorter has no idea how to preserve structure in this case.
 215         return None
 216       continue
 217     m = _INCLUDE_RE.match(line)
 218     if not m:
 219       preamble.append(line)
 220       continue
 221     includes.append(Include(m.group(2), m.group(1), preamble, m.group(3)))
 222     preamble = []
 223   # In theory, the caller should never pass a list of lines with a dangling
 224   # preamble. But there's a test case that exercises this, and just in case it
 225   # actually happens, fail a bit more gracefully.
 226   if preamble:
 227     return None
 228   return includes
 229
 230
 231 def _DecomposePath(filename):
 232   """Decomposes a filename into a list of directories and the basename.
 233
 234   Args:
 235     filename: A filename!
 236
 237   Returns:
 238     A tuple of a list of directories and a string basename.
 239   """
 240   dirs = []
 241   dirname, basename = os.path.split(filename)
 242   while dirname:
 243     dirname, last = os.path.split(dirname)
 244     dirs.append(last)
 245   dirs.reverse()
 246   # Remove the extension from the basename.
 247   basename = os.path.splitext(basename)[0]
 248   return dirs, basename
 249
 250
# Regex fragment for an optional platform suffix on a file's basename, e.g.
# the '_posix' in moo_posix.cc. MarkPrimaryInclude() appends it to a header's
# basename when fuzzily matching the header against the file being edited.
_PLATFORM_SUFFIX = (
    r'(?:_(?:android|aura|chromeos|fuchsia|ios|linux|mac|ozone|posix|win|x11))?'
)
# Regex fragment for an optional test suffix, e.g. '_test', '_unittest' or
# '_browsertest'. Used together with _PLATFORM_SUFFIX.
_TEST_SUFFIX = r'(?:_(?:browser|interactive_ui|perf|ui|unit)?test)?'
 255
 256
 257 def MarkPrimaryInclude(includes, filename):
 258   """Finds the primary header in includes and marks it as such.
 259
 260   Per the style guide, if moo.cc's main purpose is to implement or test the
 261   functionality in moo.h, moo.h should be ordered first in the includes.
 262
 263   Args:
 264     includes: A list of Include objects.
 265     filename: The filename to use as the basis for finding the primary header.
 266   """
 267   # Header files never have a primary include.
 268   if filename.endswith('.h'):
 269     return
 270
 271   # First pass. Looking for exact match primary header.
 272   exact_match_primary_header = f'{os.path.splitext(filename)[0]}.h'
 273   for include in includes:
 274     if IsUserHeader(include.decorated_name) and UndecoratedName(
 275         include.decorated_name) == exact_match_primary_header:
 276       include.is_primary_header = True
 277       return
 278
 279   basis = _DecomposePath(filename)
 280
 281   # Second pass. The list of includes is searched in reverse order of length.
 282   # Even though matching is fuzzy, moo_posix.h should take precedence over moo.h
 283   # when considering moo_posix.cc.
 284   includes.sort(key=lambda i: -len(i.decorated_name))
 285   for include in includes:
 286     if include.header_type != _HEADER_TYPE_USER:
 287       continue
 288     to_test = _DecomposePath(UndecoratedName(include.decorated_name))
 289
 290     # If the basename to test is longer than the basis, just skip it and
 291     # continue. moo.c should never match against moo_posix.h.
 292     if len(to_test[1]) > len(basis[1]):
 293       continue
 294
 295     # The basename in the two paths being compared need to fuzzily match.
 296     # This allows for situations where moo_posix.cc implements the interfaces
 297     # defined in moo.h.
 298     escaped_basename = re.escape(to_test[1])
 299     if not (re.match(escaped_basename + _PLATFORM_SUFFIX + _TEST_SUFFIX + '$',
 300                      basis[1]) or
 301             re.match(escaped_basename + _TEST_SUFFIX + _PLATFORM_SUFFIX + '$',
 302                      basis[1])):
 303       continue
 304
 305     # The topmost directory name must match, and the rest of the directory path
 306     # should be 'substantially similar'.
 307     s = difflib.SequenceMatcher(None, to_test[0], basis[0])
 308     first_matched = False
 309     total_matched = 0
 310     for match in s.get_matching_blocks():
 311       if total_matched == 0 and match.a == 0 and match.b == 0:
 312         first_matched = True
 313       total_matched += match.size
 314
 315     if not first_matched:
 316       continue
 317
 318     # 'Substantially similar' is defined to be:
 319     # - no more than two differences
 320     # - at least one match besides the topmost directory
 321     total_differences = abs(total_matched -
 322                             len(to_test[0])) + abs(total_matched -
 323                                                    len(basis[0]))
 324     # Note: total_differences != 0 is mainly intended to allow more succinct
 325     # tests (otherwise tests with just a basename would always trip the
 326     # total_matched < 2 check).
 327     if total_differences != 0 and (total_differences > 2 or total_matched < 2):
 328       continue
 329
 330     include.is_primary_header = True
 331     return
 332
 333
 334 def SerializeIncludes(includes):
 335   """Turns includes back into the corresponding C++ source text.
 336
 337   Args:
 338     includes: a list of Include objects.
 339
 340   Returns:
 341     A list of strings representing C++ source text.
 342   """
 343   source = []
 344
 345   special_headers = [
 346       # Must be included before ws2tcpip.h.
 347       # Doesn't need to be included before <windows.h> with
 348       # WIN32_LEAN_AND_MEAN but why chance it?
 349       '<winsock2.h>',
 350       # Must be before lots of things, e.g. shellapi.h, winbase.h,
 351       # versionhelpers.h, memoryapi.h, hidclass.h, ncrypt.h., ...
 352       '<windows.h>',
 353       # Must be before iphlpapi.h.
 354       '<ws2tcpip.h>',
 355       # Must be before propkey.h.
 356       '<shobjidl.h>',
 357       # Must be before atlapp.h.
 358       '<atlbase.h>',
 359       # Must be before intshcut.h.
 360       '<ole2.h>',
 361       # Must be before intshcut.h.
 362       '<unknwn.h>',
 363       # Must be before uiautomation.h.
 364       '<objbase.h>',
 365       # Must be before tpcshrd.h.
 366       '<tchar.h>',
 367   ]
 368
 369   # Ensure that headers are sorted as follows:
 370   #
 371   # 1. The primary header, if any, appears first.
 372   # 2. All headers of the same type (e.g. C system, C++ system headers, et
 373   #    cetera) are grouped contiguously.
 374   # 3. Any special sorting rules needed within each group for satisfying
 375   #    platform header idiosyncrasies. In practice, this only applies to C
 376   #    system headers.
 377   # 4. The remaining headers without special sorting rules are sorted
 378   #    lexicographically.
 379   #
 380   # The for loop below that outputs the actual source text depends on #2 above
 381   # to insert newlines between different groups of headers.
 382   def SortKey(include):
 383     def SpecialSortKey(include):
 384       lower_name = include.decorated_name.lower()
 385       for i in range(len(special_headers)):
 386         if special_headers[i] == lower_name:
 387           return i
 388       return len(special_headers)
 389
 390     return (not include.is_primary_header, include.header_type,
 391             SpecialSortKey(include), include.decorated_name)
 392
 393   includes.sort(key=SortKey)
 394
 395   # Assume there's always at least one include.
 396   previous_include = None
 397   for include in includes:
 398     if previous_include and include.ShouldInsertNewline(previous_include):
 399       source.append('')
 400     source.extend(include.ToSource())
 401     previous_include = include
 402   return source
 403
 404
 405 def AddHeaderToSource(filename, source, decorated_name, remove=False):
 406   """Adds or removes the specified header into/from the source text, if needed.
 407
 408   Args:
 409     filename: The name of the source file.
 410     source: A string containing the contents of the source file.
 411     decorated_name: The decorated name of the header to add or remove.
 412     remove: If true, remove instead of adding.
 413
 414   Returns:
 415     None if no changes are needed or the modified source text otherwise.
 416   """
 417   lines = source.splitlines()
 418   begin, end = FindIncludes(lines)
 419
 420   # No #includes in this file. Just give up.
 421   # TODO(dcheng): Be more clever and insert it after the file-level comment or
 422   # include guard as appropriate.
 423   if begin < 0:
 424     print(f'Skipping {filename}: unable to find includes!')
 425     return None
 426
 427   includes = ParseIncludes(lines[begin:end])
 428   if not includes:
 429     print(f'Skipping {filename}: unable to parse includes!')
 430     return None
 431
 432   if remove:
 433     for i in includes:
 434       if decorated_name == i.decorated_name:
 435         includes.remove(i)
 436         break
 437     else:
 438       print(f'Skipping {filename}: unable to find {decorated_name}')
 439       return None
 440   else:
 441     if decorated_name in [i.decorated_name for i in includes]:
 442       # Nothing to do.
 443       print(f'Skipping {filename}: no changes required!')
 444       return None
 445     else:
 446       includes.append(Include(decorated_name, 'include', [], None))
 447
 448   MarkPrimaryInclude(includes, filename)
 449
 450   lines[begin:end] = SerializeIncludes(includes)
 451   lines.append('')  # To avoid eating the newline at the end of the file.
 452   return '\n'.join(lines)
 453
 454
 455 def main():
 456   parser = argparse.ArgumentParser(
 457       description='Mass add (or remove) a new header into a bunch of files.')
 458   parser.add_argument(
 459       '--header',
 460       help='The decorated filename of the header to insert (e.g. "a" or <a>)',
 461       required=True)
 462   parser.add_argument('--remove',
 463                       help='Remove the header file instead of adding it',
 464                       action='store_true')
 465   parser.add_argument('files', nargs='+')
 466   args = parser.parse_args()
 467   if ClassifyHeader(args.header) == _HEADER_TYPE_INVALID:
 468     print('--header argument must be a decorated filename, e.g.')
 469     print('  --header "<utility>"')
 470     print('or')
 471     print('  --header \'"moo.h"\'')
 472     return 1
 473   operation = 'Removing' if args.remove else 'Inserting'
 474   print(f'{operation} #include {args.header}...')
 475   for filename in args.files:
 476     with open(filename, 'r') as f:
 477       new_source = AddHeaderToSource(os.path.normpath(filename), f.read(),
 478                                      args.header, args.remove)
 479     if not new_source:
 480       continue
 481     with open(filename, 'w', newline='\n') as f:
 482       f.write(new_source)
 483
 484
# Run main() only when executed as a script, and use its return value as the
# process exit status.
if __name__ == '__main__':
  sys.exit(main())