src/third_party/trace-viewer/third_party/tvcm/tvcm/parse_html_deps.py

   1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 import re
   6 import os
   7 import sys
   8 from HTMLParser import HTMLParser
   9
  10 from tvcm import module
  11 from tvcm import strip_js_comments
  12 from tvcm import html_generation_controller
  13
  14 def _InitBeautifulSoup():
  15   tvcm_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
  16   bs_path = os.path.join(tvcm_path, 'third_party', 'beautifulsoup')
  17   if bs_path in sys.path:
  18     return
  19   sys.path.insert(0, bs_path)
  20 _InitBeautifulSoup()
  21 import BeautifulSoup
  22
  23
  24 class InlineScript(object):
  25   def __init__(self, soup):
  26     self._soup = soup
  27     self._stripped_contents = None
  28     self._open_tags = None
  29
  30   @property
  31   def contents(self):
  32     return str(self._soup.string)
  33
  34   @property
  35   def stripped_contents(self):
  36     if not self._stripped_contents:
  37       self._stripped_contents = strip_js_comments.StripJSComments(
  38           self.contents)
  39     return self._stripped_contents
  40
  41   @property
  42   def open_tags(self):
  43     if self._open_tags:
  44       return self._open_tags
  45     open_tags = []
  46     cur = self._soup.parent
  47     while cur:
  48       if isinstance(cur, BeautifulSoup.BeautifulSoup):
  49         break
  50
  51       open_tags.append(_Tag(cur.name, cur.attrs))
  52       cur = cur.parent
  53
  54     open_tags.reverse()
  55     assert open_tags[-1].tag == 'script'
  56     del open_tags[-1]
  57
  58     self._open_tags = open_tags
  59     return self._open_tags
  60
  61
  62 def _IsDoctype(x):
  63   if not isinstance(x, BeautifulSoup.Declaration):
  64     return False
  65   return x == 'DOCTYPE html' or x == 'DOCTYPE HTML'
  66
  67 class HTMLModuleParserResults(object):
  68   def __init__(self, html):
  69     self._soup = BeautifulSoup.BeautifulSoup(html)
  70     self._inline_scripts = None
  71
  72   @property
  73   def has_decl(self):
  74     decls = [x for x in self._soup.contents
  75              if _IsDoctype(x)]
  76     return len(decls) == 1
  77
  78   @property
  79   def scripts_external(self):
  80     tags = self._soup.findAll('script', src=True)
  81     return [t['src'] for t in tags]
  82
  83   @property
  84   def inline_scripts(self):
  85     if not self._inline_scripts:
  86       tags = self._soup.findAll('script', src=None)
  87       self._inline_scripts = [InlineScript(t.string) for t in tags]
  88     return self._inline_scripts
  89
  90   @property
  91   def imports(self):
  92     tags = self._soup.findAll('link', rel='import')
  93     return [t['href'] for t in tags]
  94
  95   @property
  96   def stylesheets(self):
  97     tags = self._soup.findAll('link', rel='stylesheet')
  98     return [t['href'] for t in tags]
  99
 100   @property
 101   def inline_stylesheets(self):
 102     tags = self._soup.findAll('style')
 103     return [str(t.string) for t in tags]
 104
 105   def YieldHTMLInPieces(self, controller):
 106     yield self.GenerateHTML(controller)
 107
 108   def GenerateHTML(self, controller):
 109     soup = BeautifulSoup.BeautifulSoup(str(self._soup))
 110
 111     # Remove decl
 112     for x in soup.contents:
 113      if isinstance(x, BeautifulSoup.Declaration):
 114       if _IsDoctype(x):
 115         x.extract()
 116
 117     # Remove all imports
 118     imports = soup.findAll('link', rel='import')
 119     for imp in imports:
 120       imp.extract()
 121
 122     # Remove all script links
 123     scripts_external = soup.findAll('script', src=True)
 124     for script in scripts_external:
 125       script.extract()
 126
 127     # Remove all inline script
 128     scripts_external = soup.findAll('script', src=None)
 129     for script in scripts_external:
 130       script.extract()
 131
 132     # Process all inline styles
 133     inline_styles = soup.findAll('style')
 134     for style in inline_styles:
 135       html = controller.GetHTMLForInlineStylesheet(str(style.string))
 136       if html:
 137         ns = BeautifulSoup.Tag(soup, 'style')
 138         ns.append(BeautifulSoup.NavigableString(html))
 139         style.replaceWith(ns)
 140       else:
 141         style.extract()
 142
 143     # Rewrite all external stylesheet hrefs or remove, as needed
 144     stylesheet_links = soup.findAll('link', rel='stylesheet')
 145     for stylesheet_link in stylesheet_links:
 146       html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
 147       if html:
 148         tmp = BeautifulSoup.BeautifulSoup(html).findChildren()
 149         assert len(tmp) == 1
 150         stylesheet_link.replaceWith(tmp[0])
 151       else:
 152         stylesheet_link.extract()
 153
 154     # We is done.
 155     return str(soup)
 156
 157   @property
 158   def html_contents_without_links_and_script(self):
 159     return self.GenerateHTML(html_generation_controller.HTMLGenerationController())
 160
 161 class _Tag(object):
 162   def __init__(self, tag, attrs):
 163     self.tag = tag
 164     self.attrs = attrs
 165
 166   def __repr__(self):
 167     attr_string = ' '.join(['%s="%s"' % (x[0], x[1]) for x in self.attrs])
 168     return '<%s %s>' % (self.tag, attr_string)
 169
 170 class HTMLModuleParser():
 171   def Parse(self, html):
 172     if html is None:
 173       html = ''
 174     else:
 175       if html.find('< /script>') != -1:
 176         raise Exception('Escape script tags with <\/script>')
 177
 178     return HTMLModuleParserResults(html)