1 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
8 from HTMLParser import HTMLParser
10 from tvcm import module
11 from tvcm import strip_js_comments
12 from tvcm import html_generation_controller
14 def _InitBeautifulSoup():
15 tvcm_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
16 bs_path = os.path.join(tvcm_path, 'third_party', 'beautifulsoup')
17 if bs_path in sys.path:
19 sys.path.insert(0, bs_path)
24 class InlineScript(object):
25 def __init__(self, soup):
27 self._stripped_contents = None
28 self._open_tags = None
32 return str(self._soup.string)
35 def stripped_contents(self):
36 if not self._stripped_contents:
37 self._stripped_contents = strip_js_comments.StripJSComments(
39 return self._stripped_contents
44 return self._open_tags
46 cur = self._soup.parent
48 if isinstance(cur, BeautifulSoup.BeautifulSoup):
51 open_tags.append(_Tag(cur.name, cur.attrs))
55 assert open_tags[-1].tag == 'script'
58 self._open_tags = open_tags
59 return self._open_tags
63 if not isinstance(x, BeautifulSoup.Declaration):
65 return x == 'DOCTYPE html' or x == 'DOCTYPE HTML'
67 class HTMLModuleParserResults(object):
def __init__(self, html):
  """Parses the given markup and starts with an empty inline-script cache.

  Args:
    html: markup passed straight to BeautifulSoup.BeautifulSoup.
  """
  parsed_document = BeautifulSoup.BeautifulSoup(html)
  self._soup = parsed_document
  # None means "not computed yet"; inline_scripts fills this in on demand.
  self._inline_scripts = None
74 decls = [x for x in self._soup.contents
76 return len(decls) == 1
def scripts_external(self):
  """Returns the src value of every <script> tag that carries a src attribute.

  Values are listed in the order in which findAll reports the tags.
  """
  return [script_tag['src']
          for script_tag in self._soup.findAll('script', src=True)]
def inline_scripts(self):
  """Returns an InlineScript for every <script> tag that has no src attribute.

  The list is built on first access and cached in self._inline_scripts; later
  accesses return the same list object. Each InlineScript is constructed from
  the tag's .string.
  """
  # Compare against None rather than relying on truthiness: an empty list is
  # a valid cached result and must not trigger another findAll per access.
  if self._inline_scripts is None:
    tags = self._soup.findAll('script', src=None)
    self._inline_scripts = [InlineScript(t.string) for t in tags]
  return self._inline_scripts
92 tags = self._soup.findAll('link', rel='import')
93 return [t['href'] for t in tags]
def stylesheets(self):
  """Returns the href of every <link rel="stylesheet"> tag in the document."""
  hrefs = []
  for link_tag in self._soup.findAll('link', rel='stylesheet'):
    hrefs.append(link_tag['href'])
  return hrefs
def inline_stylesheets(self):
  """Returns str() of the .string of every <style> tag in the document."""
  sheets = []
  for style_tag in self._soup.findAll('style'):
    css_text = str(style_tag.string)
    sheets.append(css_text)
  return sheets
def YieldHTMLInPieces(self, controller):
  """Yields the generated HTML as a single piece.

  The full result of GenerateHTML(controller) is emitted as one item. Because
  this is a generator, GenerateHTML is not called until iteration starts.

  Args:
    controller: passed through unchanged to GenerateHTML.
  """
  whole_document = self.GenerateHTML(controller)
  yield whole_document
108 def GenerateHTML(self, controller):
109 soup = BeautifulSoup.BeautifulSoup(str(self._soup))
112 for x in soup.contents:
113 if isinstance(x, BeautifulSoup.Declaration):
118 imports = soup.findAll('link', rel='import')
122 # Remove all script links
123 scripts_external = soup.findAll('script', src=True)
124 for script in scripts_external:
127 # Remove all inline script
128 scripts_external = soup.findAll('script', src=None)
129 for script in scripts_external:
132 # Process all inline styles
133 inline_styles = soup.findAll('style')
134 for style in inline_styles:
135 html = controller.GetHTMLForInlineStylesheet(str(style.string))
137 ns = BeautifulSoup.Tag(soup, 'style')
138 ns.append(BeautifulSoup.NavigableString(html))
139 style.replaceWith(ns)
143 # Rewrite all external stylesheet hrefs or remove, as needed
144 stylesheet_links = soup.findAll('link', rel='stylesheet')
145 for stylesheet_link in stylesheet_links:
146 html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
148 tmp = BeautifulSoup.BeautifulSoup(html).findChildren()
150 stylesheet_link.replaceWith(tmp[0])
152 stylesheet_link.extract()
def html_contents_without_links_and_script(self):
  """Returns GenerateHTML's output for a freshly built HTMLGenerationController."""
  default_controller = html_generation_controller.HTMLGenerationController()
  return self.GenerateHTML(default_controller)
162 def __init__(self, tag, attrs):
167 attr_string = ' '.join(['%s="%s"' % (x[0], x[1]) for x in self.attrs])
168 return '<%s %s>' % (self.tag, attr_string)
170 class HTMLModuleParser():
171 def Parse(self, html):
175 if html.find('< /script>') != -1:
176 raise Exception('Escape script tags with <\/script>')
178 return HTMLModuleParserResults(html)