src/third_party/closure_linter/closure_linter/common/htmlutil.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
   4 #
   5 # Licensed under the Apache License, Version 2.0 (the "License");
   6 # you may not use this file except in compliance with the License.
   7 # You may obtain a copy of the License at
   8 #
   9 #      http://www.apache.org/licenses/LICENSE-2.0
  10 #
  11 # Unless required by applicable law or agreed to in writing, software
  12 # distributed under the License is distributed on an "AS-IS" BASIS,
  13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16
  17 """Utilities for dealing with HTML."""
  18
  19 __author__ = ('robbyw@google.com (Robert Walker)')
  20
  21 import cStringIO
  22 import formatter
  23 import htmllib
  24 import HTMLParser
  25 import re
  26
  27
  28 class ScriptExtractor(htmllib.HTMLParser):
  29   """Subclass of HTMLParser that extracts script contents from an HTML file.
  30
  31   Also inserts appropriate blank lines so that line numbers in the extracted
  32   code match the line numbers in the original HTML.
  33   """
  34
  35   def __init__(self):
  36     """Initialize a ScriptExtractor."""
  37     htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
  38     self._in_script = False
  39     self._text = ''
  40
  41   def start_script(self, attrs):
  42     """Internal handler for the start of a script tag.
  43
  44     Args:
  45       attrs: The attributes of the script tag, as a list of tuples.
  46     """
  47     for attribute in attrs:
  48       if attribute[0].lower() == 'src':
  49         # Skip script tags with a src specified.
  50         return
  51     self._in_script = True
  52
  53   def end_script(self):
  54     """Internal handler for the end of a script tag."""
  55     self._in_script = False
  56
  57   def handle_data(self, data):
  58     """Internal handler for character data.
  59
  60     Args:
  61       data: The character data from the HTML file.
  62     """
  63     if self._in_script:
  64       # If the last line contains whitespace only, i.e. is just there to
  65       # properly align a </script> tag, strip the whitespace.
  66       if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'):
  67         data = data.rstrip(' \t')
  68       self._text += data
  69     else:
  70       self._AppendNewlines(data)
  71
  72   def handle_comment(self, data):
  73     """Internal handler for HTML comments.
  74
  75     Args:
  76       data: The text of the comment.
  77     """
  78     self._AppendNewlines(data)
  79
  80   def _AppendNewlines(self, data):
  81     """Count the number of newlines in the given string and append them.
  82
  83     This ensures line numbers are correct for reported errors.
  84
  85     Args:
  86       data: The data to count newlines in.
  87     """
  88     # We append 'x' to both sides of the string to ensure that splitlines
  89     # gives us an accurate count.
  90     for i in xrange(len(('x' + data + 'x').splitlines()) - 1):
  91       self._text += '\n'
  92
  93   def GetScriptLines(self):
  94     """Return the extracted script lines.
  95
  96     Returns:
  97       The extracted script lines as a list of strings.
  98     """
  99     return self._text.splitlines()
 100
 101
 102 def GetScriptLines(f):
 103   """Extract script tag contents from the given HTML file.
 104
 105   Args:
 106     f: The HTML file.
 107
 108   Returns:
 109     Lines in the HTML file that are from script tags.
 110   """
 111   extractor = ScriptExtractor()
 112
 113   # The HTML parser chokes on text like Array.<!string>, so we patch
 114   # that bug by replacing the < with &lt; - escaping all text inside script
 115   # tags would be better but it's a bit of a catch 22.
 116   contents = f.read()
 117   contents = re.sub(r'<([^\s\w/])',
 118          lambda x: '&lt;%s' % x.group(1),
 119          contents)
 120
 121   extractor.feed(contents)
 122   extractor.close()
 123   return extractor.GetScriptLines()
 124
 125
 126 def StripTags(str):
 127   """Returns the string with HTML tags stripped.
 128
 129   Args:
 130     str: An html string.
 131
 132   Returns:
 133     The html string with all tags stripped. If there was a parse error, returns
 134     the text successfully parsed so far.
 135   """
 136   # Brute force approach to stripping as much HTML as possible. If there is a
 137   # parsing error, don't strip text before parse error position, and continue
 138   # trying from there.
 139   final_text = ''
 140   finished = False
 141   while not finished:
 142     try:
 143       strip = _HtmlStripper()
 144       strip.feed(str)
 145       strip.close()
 146       str = strip.get_output()
 147       final_text += str
 148       finished = True
 149     except HTMLParser.HTMLParseError, e:
 150       final_text += str[:e.offset]
 151       str = str[e.offset + 1:]
 152
 153   return final_text
 154
 155
 156 class _HtmlStripper(HTMLParser.HTMLParser):
 157   """Simple class to strip tags from HTML.
 158
 159   Does so by doing nothing when encountering tags, and appending character data
 160   to a buffer when that is encountered.
 161   """
 162   def __init__(self):
 163     self.reset()
 164     self.__output = cStringIO.StringIO()
 165
 166   def handle_data(self, d):
 167     self.__output.write(d)
 168
 169   def get_output(self):
 170     return self.__output.getvalue()