src/tools/metrics/histograms/pretty_print.py

   1 #!/usr/bin/env python
   2 # Copyright 2013 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 """Pretty-prints the histograms.xml file, alphabetizing tags, wrapping text
   7 at 80 chars, enforcing standard attribute ordering, and standardizing
   8 indentation.
   9
  10 This is quite a bit more complicated than just calling tree.toprettyxml();
  11 we need additional customization, like special attribute ordering in tags
  12 and wrapping text nodes, so we implement our own full custom XML pretty-printer.
  13 """
  14
  15 from __future__ import with_statement
  16
  17 import diffutil
  18 import json
  19 import logging
  20 import shutil
  21 import sys
  22 import textwrap
  23 import xml.dom.minidom
  24
  25
  26 WRAP_COLUMN = 80
  27
  28 # Desired order for tag attributes; attributes listed here will appear first,
  29 # and in the same order as in these lists.
  30 # { tag_name: [attribute_name, ...] }
  31 ATTRIBUTE_ORDER = {
  32   'enum': ['name', 'type'],
  33   'histogram': ['name', 'enum', 'units'],
  34   'int': ['value', 'label'],
  35   'fieldtrial': ['name', 'separator', 'ordering'],
  36   'group': ['name', 'label'],
  37   'affected-histogram': ['name'],
  38   'with-group': ['name'],
  39 }
  40
  41 # Tag names for top-level nodes whose children we don't want to indent.
  42 TAGS_THAT_DONT_INDENT = [
  43   'histogram-configuration',
  44   'histograms',
  45   'fieldtrials',
  46   'enums'
  47 ]
  48
  49 # Extra vertical spacing rules for special tag names.
  50 # {tag_name: (newlines_after_open, newlines_before_close, newlines_after_close)}
  51 TAGS_THAT_HAVE_EXTRA_NEWLINE = {
  52   'histogram-configuration': (2, 1, 1),
  53   'histograms': (2, 1, 1),
  54   'fieldtrials': (2, 1, 1),
  55   'enums': (2, 1, 1),
  56   'histogram': (1, 1, 1),
  57   'enum': (1, 1, 1),
  58   'fieldtrial': (1, 1, 1),
  59 }
  60
  61 # Tags that we allow to be squished into a single line for brevity.
  62 TAGS_THAT_ALLOW_SINGLE_LINE = [
  63   'summary',
  64   'int',
  65 ]
  66
  67 # Tags whose children we want to alphabetize. The key is the parent tag name,
  68 # and the value is a pair of the tag name of the children we want to sort,
  69 # and a key function that maps each child node to the desired sort key.
  70 ALPHABETIZATION_RULES = {
  71   'histograms': ('histogram', lambda n: n.attributes['name'].value.lower()),
  72   'enums': ('enum', lambda n: n.attributes['name'].value.lower()),
  73   'enum': ('int', lambda n: int(n.attributes['value'].value)),
  74   'fieldtrials': ('fieldtrial', lambda n: n.attributes['name'].value.lower()),
  75   'fieldtrial': ('affected-histogram',
  76                  lambda n: n.attributes['name'].value.lower()),
  77 }
  78
  79
  80 class Error(Exception):
  81   pass
  82
  83
  84 def LastLineLength(s):
  85   """Returns the length of the last line in s.
  86
  87   Args:
  88     s: A multi-line string, including newlines.
  89
  90   Returns:
  91     The length of the last line in s, in characters.
  92   """
  93   if s.rfind('\n') == -1: return len(s)
  94   return len(s) - s.rfind('\n') - len('\n')
  95
  96
  97 def XmlEscape(s):
  98   """XML-escapes the given string, replacing magic characters (&<>") with their
  99   escaped equivalents."""
 100   s = s.replace("&", "&amp;").replace("<", "&lt;")
 101   s = s.replace("\"", "&quot;").replace(">", "&gt;")
 102   return s
 103
 104
 105 def PrettyPrintNode(node, indent=0):
 106   """Pretty-prints the given XML node at the given indent level.
 107
 108   Args:
 109     node: The minidom node to pretty-print.
 110     indent: The current indent level.
 111
 112   Returns:
 113     The pretty-printed string (including embedded newlines).
 114
 115   Raises:
 116     Error if the XML has unknown tags or attributes.
 117   """
 118   # Handle the top-level document node.
 119   if node.nodeType == xml.dom.minidom.Node.DOCUMENT_NODE:
 120     return '\n'.join([PrettyPrintNode(n) for n in node.childNodes])
 121
 122   # Handle text nodes.
 123   if node.nodeType == xml.dom.minidom.Node.TEXT_NODE:
 124     # Wrap each paragraph in the text to fit in the 80 column limit.
 125     wrapper = textwrap.TextWrapper()
 126     wrapper.initial_indent = ' ' * indent
 127     wrapper.subsequent_indent = ' ' * indent
 128     wrapper.break_on_hyphens = False
 129     wrapper.break_long_words = False
 130     wrapper.width = WRAP_COLUMN
 131     text = XmlEscape(node.data)
 132     # Remove any common indent.
 133     text = textwrap.dedent(text.strip('\n'))
 134     lines = text.split('\n')
 135     # Split the text into paragraphs at blank line boundaries.
 136     paragraphs = [[]]
 137     for l in lines:
 138       if len(l.strip()) == 0 and len(paragraphs[-1]) > 0:
 139         paragraphs.append([])
 140       else:
 141         paragraphs[-1].append(l)
 142     # Remove trailing empty paragraph if present.
 143     if len(paragraphs) > 0 and len(paragraphs[-1]) == 0:
 144       paragraphs = paragraphs[:-1]
 145     # Wrap each paragraph and separate with two newlines.
 146     return '\n\n'.join([wrapper.fill('\n'.join(p)) for p in paragraphs])
 147
 148   # Handle element nodes.
 149   if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
 150     newlines_after_open, newlines_before_close, newlines_after_close = (
 151       TAGS_THAT_HAVE_EXTRA_NEWLINE.get(node.tagName, (1, 1, 0)))
 152     # Open the tag.
 153     s = ' ' * indent + '<' + node.tagName
 154
 155     # Calculate how much space to allow for the '>' or '/>'.
 156     closing_chars = 1
 157     if not node.childNodes:
 158       closing_chars = 2
 159
 160     # Pretty-print the attributes.
 161     attributes = node.attributes.keys()
 162     if attributes:
 163       # Reorder the attributes.
 164       if not node.tagName in ATTRIBUTE_ORDER:
 165         unrecognized_attributes = attributes;
 166       else:
 167         unrecognized_attributes = (
 168           [a for a in attributes if not a in ATTRIBUTE_ORDER[node.tagName]])
 169         attributes = (
 170           [a for a in ATTRIBUTE_ORDER[node.tagName] if a in attributes])
 171
 172       for a in unrecognized_attributes:
 173         logging.error(
 174             'Unrecognized attribute "%s" in tag "%s"' % (a, node.tagName))
 175       if unrecognized_attributes:
 176         raise Error()
 177
 178       for a in attributes:
 179         value = XmlEscape(node.attributes[a].value)
 180         # Replace sequences of whitespace with single spaces.
 181         words = value.split()
 182         a_str = ' %s="%s"' % (a, ' '.join(words))
 183         # Start a new line if the attribute will make this line too long.
 184         if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
 185           s += '\n' + ' ' * (indent + 3)
 186         # Output everything up to the first quote.
 187         s += ' %s="' % (a)
 188         value_indent_level = LastLineLength(s)
 189         # Output one word at a time, splitting to the next line where necessary.
 190         column = value_indent_level
 191         for i, word in enumerate(words):
 192           # This is slightly too conservative since not every word will be
 193           # followed by the closing characters...
 194           if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
 195             s = s.rstrip()  # remove any trailing whitespace
 196             s += '\n' + ' ' * value_indent_level
 197             column = value_indent_level
 198           s += word + ' '
 199           column += len(word) + 1
 200         s = s.rstrip()  # remove any trailing whitespace
 201         s += '"'
 202       s = s.rstrip()  # remove any trailing whitespace
 203
 204     # Pretty-print the child nodes.
 205     if node.childNodes:
 206       s += '>'
 207       # Calculate the new indent level for child nodes.
 208       new_indent = indent
 209       if node.tagName not in TAGS_THAT_DONT_INDENT:
 210         new_indent += 2
 211       child_nodes = node.childNodes
 212
 213       # Recursively pretty-print the child nodes.
 214       child_nodes = [PrettyPrintNode(n, indent=new_indent) for n in child_nodes]
 215       child_nodes = [c for c in child_nodes if len(c.strip()) > 0]
 216
 217       # Determine whether we can fit the entire node on a single line.
 218       close_tag = '</%s>' % node.tagName
 219       space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
 220       if (node.tagName in TAGS_THAT_ALLOW_SINGLE_LINE and
 221           len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left):
 222         s += child_nodes[0].strip()
 223       else:
 224         s += '\n' * newlines_after_open + '\n'.join(child_nodes)
 225         s += '\n' * newlines_before_close + ' ' * indent
 226       s += close_tag
 227     else:
 228       s += '/>'
 229     s += '\n' * newlines_after_close
 230     return s
 231
 232   # Handle comment nodes.
 233   if node.nodeType == xml.dom.minidom.Node.COMMENT_NODE:
 234     return '<!--%s-->\n' % node.data
 235
 236   # Ignore other node types. This could be a processing instruction (<? ... ?>)
 237   # or cdata section (<![CDATA[...]]!>), neither of which are legal in the
 238   # histograms XML at present.
 239   logging.error('Ignoring unrecognized node data: %s' % node.toxml())
 240   raise Error()
 241
 242
 243 def unsafeAppendChild(parent, child):
 244   """Append child to parent's list of children, ignoring the possibility that it
 245   is already in another node's childNodes list.  Requires that the previous
 246   parent of child is discarded (to avoid non-tree DOM graphs).
 247   This can provide a significant speedup as O(n^2) operations are removed (in
 248   particular, each child insertion avoids the need to traverse the old parent's
 249   entire list of children)."""
 250   child.parentNode = None
 251   parent.appendChild(child)
 252   child.parentNode = parent
 253
 254
 255 def TransformByAlphabetizing(node):
 256   """Transform the given XML by alphabetizing specific node types according to
 257   the rules in ALPHABETIZATION_RULES.
 258
 259   Args:
 260     node: The minidom node to transform.
 261
 262   Returns:
 263     The minidom node, with children appropriately alphabetized. Note that the
 264     transformation is done in-place, i.e. the original minidom tree is modified
 265     directly.
 266   """
 267   if node.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
 268     for c in node.childNodes: TransformByAlphabetizing(c)
 269     return node
 270
 271   # Element node with a tag name that we alphabetize the children of?
 272   if node.tagName in ALPHABETIZATION_RULES:
 273     # Put subnodes in a list of node,key pairs to allow for custom sorting.
 274     subtag, key_function = ALPHABETIZATION_RULES[node.tagName]
 275     subnodes = []
 276     last_key = -1
 277     for c in node.childNodes:
 278       if (c.nodeType == xml.dom.minidom.Node.ELEMENT_NODE and
 279           c.tagName == subtag):
 280         last_key = key_function(c)
 281       # Subnodes that we don't want to rearrange use the last node's key,
 282       # so they stay in the same relative position.
 283       subnodes.append( (c, last_key) )
 284
 285     # Sort the subnode list.
 286     subnodes.sort(key=lambda pair: pair[1])
 287
 288     # Re-add the subnodes, transforming each recursively.
 289     while node.firstChild:
 290       node.removeChild(node.firstChild)
 291     for (c, _) in subnodes:
 292       unsafeAppendChild(node, TransformByAlphabetizing(c))
 293     return node
 294
 295   # Recursively handle other element nodes and other node types.
 296   for c in node.childNodes: TransformByAlphabetizing(c)
 297   return node
 298
 299
 300 def PrettyPrint(raw_xml):
 301   """Pretty-print the given XML.
 302
 303   Args:
 304     xml: The contents of the histograms XML file, as a string.
 305
 306   Returns:
 307     The pretty-printed version.
 308   """
 309   tree = xml.dom.minidom.parseString(raw_xml)
 310   tree = TransformByAlphabetizing(tree)
 311   return PrettyPrintNode(tree)
 312
 313
 314 def main():
 315   logging.basicConfig(level=logging.INFO)
 316
 317   presubmit = ('--presubmit' in sys.argv)
 318
 319   logging.info('Loading histograms.xml...')
 320   with open('histograms.xml', 'rb') as f:
 321     xml = f.read()
 322
 323   # Check there are no CR ('\r') characters in the file.
 324   if '\r' in xml:
 325     logging.info('DOS-style line endings (CR characters) detected - these are '
 326                  'not allowed. Please run dos2unix histograms.xml')
 327     sys.exit(1)
 328
 329   logging.info('Pretty-printing...')
 330   try:
 331     pretty = PrettyPrint(xml)
 332   except Error:
 333     logging.error('Aborting parsing due to fatal errors.')
 334     sys.exit(1)
 335
 336   if xml == pretty:
 337     logging.info('histograms.xml is correctly pretty-printed.')
 338     sys.exit(0)
 339   if presubmit:
 340     logging.info('histograms.xml is not formatted correctly; run '
 341                  'pretty_print.py to fix.')
 342     sys.exit(1)
 343   if not diffutil.PromptUserToAcceptDiff(
 344       xml, pretty,
 345       'Is the prettified version acceptable?'):
 346     logging.error('Aborting')
 347     return
 348
 349   logging.info('Creating backup file histograms.before.pretty-print.xml')
 350   shutil.move('histograms.xml', 'histograms.before.pretty-print.xml')
 351
 352   logging.info('Writing new histograms.xml file')
 353   with open('histograms.xml', 'wb') as f:
 354     f.write(pretty)
 355
 356
 357 if __name__ == '__main__':
 358   main()