src/third_party/trace-viewer/third_party/tvcm/tvcm/parse_html_deps.py

   1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 import re
   6 import os
   7 import sys
   8 from HTMLParser import HTMLParser
   9
  10 from tvcm import module
  11 from tvcm import strip_js_comments
  12 from tvcm import html_generation_controller
  13
  14 def _InitBeautifulSoup():
  15   tvcm_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
  16   bs_path = os.path.join(tvcm_path, 'third_party', 'beautifulsoup')
  17   if bs_path in sys.path:
  18     return
  19   sys.path.insert(0, bs_path)
  20 _InitBeautifulSoup()
  21 import BeautifulSoup
  22 import polymer_soup
  23
  24
  25 class InlineScript(object):
  26   def __init__(self, soup):
  27     self._soup = soup
  28     self._stripped_contents = None
  29     self._open_tags = None
  30
  31   @property
  32   def contents(self):
  33     return str(self._soup.string)
  34
  35   @property
  36   def stripped_contents(self):
  37     if not self._stripped_contents:
  38       self._stripped_contents = strip_js_comments.StripJSComments(
  39           self.contents)
  40     return self._stripped_contents
  41
  42   @property
  43   def open_tags(self):
  44     if self._open_tags:
  45       return self._open_tags
  46     open_tags = []
  47     cur = self._soup.parent
  48     while cur:
  49       if isinstance(cur, BeautifulSoup.BeautifulSoup):
  50         break
  51
  52       open_tags.append(_Tag(cur.name, cur.attrs))
  53       cur = cur.parent
  54
  55     open_tags.reverse()
  56     assert open_tags[-1].tag == 'script'
  57     del open_tags[-1]
  58
  59     self._open_tags = open_tags
  60     return self._open_tags
  61
  62
  63 def _IsDoctype(x):
  64   if not isinstance(x, BeautifulSoup.Declaration):
  65     return False
  66   return x == 'DOCTYPE html' or x == 'DOCTYPE HTML'
  67
  68 class HTMLModuleParserResults(object):
  69   def __init__(self, html):
  70     self._soup = polymer_soup.PolymerSoup(html)
  71     self._inline_scripts = None
  72
  73   @property
  74   def has_decl(self):
  75     decls = [x for x in self._soup.contents
  76              if _IsDoctype(x)]
  77     return len(decls) == 1
  78
  79   @property
  80   def scripts_external(self):
  81     tags = self._soup.findAll('script', src=True)
  82     return [t['src'] for t in tags]
  83
  84   @property
  85   def inline_scripts(self):
  86     if not self._inline_scripts:
  87       tags = self._soup.findAll('script', src=None)
  88       self._inline_scripts = [InlineScript(t.string) for t in tags]
  89     return self._inline_scripts
  90
  91   @property
  92   def imports(self):
  93     tags = self._soup.findAll('link', rel='import')
  94     return [t['href'] for t in tags]
  95
  96   @property
  97   def stylesheets(self):
  98     tags = self._soup.findAll('link', rel='stylesheet')
  99     return [t['href'] for t in tags]
 100
 101   @property
 102   def inline_stylesheets(self):
 103     tags = self._soup.findAll('style')
 104     return [str(t.string) for t in tags]
 105
 106   def YieldHTMLInPieces(self, controller):
 107     yield self.GenerateHTML(controller)
 108
 109   def GenerateHTML(self, controller):
 110     soup = polymer_soup.PolymerSoup(str(self._soup))
 111
 112     # Remove decl
 113     for x in soup.contents:
 114      if isinstance(x, BeautifulSoup.Declaration):
 115       if _IsDoctype(x):
 116         x.extract()
 117
 118     # Remove all imports
 119     imports = soup.findAll('link', rel='import')
 120     for imp in imports:
 121       imp.extract()
 122
 123     # Remove all script links
 124     scripts_external = soup.findAll('script', src=True)
 125     for script in scripts_external:
 126       script.extract()
 127
 128     # Remove all inline script
 129     scripts_external = soup.findAll('script', src=None)
 130     for script in scripts_external:
 131       script.extract()
 132
 133     # Process all inline styles
 134     inline_styles = soup.findAll('style')
 135     for style in inline_styles:
 136       html = controller.GetHTMLForInlineStylesheet(str(style.string))
 137       if html:
 138         ns = BeautifulSoup.Tag(soup, 'style')
 139         ns.append(BeautifulSoup.NavigableString(html))
 140         style.replaceWith(ns)
 141       else:
 142         style.extract()
 143
 144     # Rewrite all external stylesheet hrefs or remove, as needed
 145     stylesheet_links = soup.findAll('link', rel='stylesheet')
 146     for stylesheet_link in stylesheet_links:
 147       html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
 148       if html:
 149         tmp = polymer_soup.PolymerSoup(html).findChildren()
 150         assert len(tmp) == 1
 151         stylesheet_link.replaceWith(tmp[0])
 152       else:
 153         stylesheet_link.extract()
 154
 155     # We is done.
 156     return str(soup)
 157
 158   @property
 159   def html_contents_without_links_and_script(self):
 160     return self.GenerateHTML(html_generation_controller.HTMLGenerationController())
 161
 162 class _Tag(object):
 163   def __init__(self, tag, attrs):
 164     self.tag = tag
 165     self.attrs = attrs
 166
 167   def __repr__(self):
 168     attr_string = ' '.join(['%s="%s"' % (x[0], x[1]) for x in self.attrs])
 169     return '<%s %s>' % (self.tag, attr_string)
 170
 171 class HTMLModuleParser():
 172   def Parse(self, html):
 173     if html is None:
 174       html = ''
 175     else:
 176       if html.find('< /script>') != -1:
 177         raise Exception('Escape script tags with <\/script>')
 178
 179     return HTMLModuleParserResults(html)