# found in the LICENSE file.
import re
+import os
+import sys
from HTMLParser import HTMLParser
from tvcm import module
from tvcm import strip_js_comments
from tvcm import html_generation_controller
+def _InitBeautifulSoup():
+  """Put the bundled BeautifulSoup directory at the front of sys.path.
+
+  The directory is <parent of this file's directory>/third_party/beautifulsoup.
+  sys.path is left alone when that directory is already listed in it.
+  """
+  here = os.path.dirname(__file__)
+  tvcm_path = os.path.abspath(os.path.join(here, '..'))
+  bs_path = os.path.join(tvcm_path, 'third_party', 'beautifulsoup')
+  if bs_path not in sys.path:
+    sys.path.insert(0, bs_path)
+_InitBeautifulSoup()
+import BeautifulSoup
-CHUNK_TEXT_OP = 'text-op'
-CHUNK_SCRIPT_OP = 'script-op'
-CHUNK_STYLESHEET_OP = 'stylesheet-op'
-CHUNK_INLINE_STYLE_OP = 'inline-style-op'
-
-class _Chunk(object):
- def __init__(self, op, data):
- self.op = op
- self.data = data
class InlineScript(object):
- def __init__(self, contents, open_tags):
- self.contents = contents
- self.open_tags = open_tags
+ def __init__(self, soup):  # soup: node whose .string and .parent the properties of this class read
+ self._soup = soup
self._stripped_contents = None
+ self._open_tags = None  # cache for the open_tags property, filled on first access
+
+  @property
+  def contents(self):
+    """Return the wrapped node's `string` attribute converted with str()."""
+    text = self._soup.string
+    return str(text)
@property
def stripped_contents(self):
self.contents)
return self._stripped_contents
+  @property
+  def open_tags(self):
+    """List of _Tag objects for the elements that enclose this script.
+
+    Walks the parent chain of the wrapped node and stops at the BeautifulSoup
+    document root. The result is ordered outermost ancestor first. The
+    innermost ancestor is asserted to be the script element itself and is
+    dropped from the list. The list is computed on first access and the same
+    list object is returned afterwards.
+    """
+    # Compare against None instead of testing truthiness: an empty list is a
+    # valid cached answer (script sitting directly under the document root)
+    # and must not trigger the parent walk again.
+    if self._open_tags is not None:
+      return self._open_tags
+
+    open_tags = []
+    cur = self._soup.parent
+    while cur:
+      if isinstance(cur, BeautifulSoup.BeautifulSoup):
+        break
+
+      open_tags.append(_Tag(cur.name, cur.attrs))
+      cur = cur.parent
+
+    # Collected innermost-first; callers get outermost-first.
+    open_tags.reverse()
+    assert open_tags[-1].tag == 'script'
+    del open_tags[-1]
+
+    self._open_tags = open_tags
+    return self._open_tags
+
+
+def _IsDoctype(x):
+  """True when x is a Declaration equal to 'DOCTYPE html' or 'DOCTYPE HTML'."""
+  is_declaration = isinstance(x, BeautifulSoup.Declaration)
+  return is_declaration and (x == 'DOCTYPE html' or x == 'DOCTYPE HTML')
+
class HTMLModuleParserResults(object):
- def __init__(self):
- self.scripts_external = []
- self.inline_scripts = []
- self.stylesheets = []
- self.imports = []
- self.has_decl = False
- self._chunks = []
+  def __init__(self, html):
+    """Parse html with BeautifulSoup and keep the resulting document tree."""
+    # Filled in by the inline_scripts property on first access.
+    self._inline_scripts = None
+    self._soup = BeautifulSoup.BeautifulSoup(html)
@property
- def inline_stylesheets(self):
- return [x.data for x in self._chunks
- if x.op == CHUNK_INLINE_STYLE_OP]
+  def has_decl(self):
+    """True when exactly one top-level node is an HTML doctype declaration."""
+    doctype_count = 0
+    for node in self._soup.contents:
+      if _IsDoctype(node):
+        doctype_count += 1
+    return doctype_count == 1
- def AppendHTMLContent(self, text):
- self._chunks.append(_Chunk(CHUNK_TEXT_OP, text))
+  @property
+  def scripts_external(self):
+    """src attribute values of every script element that carries a src."""
+    srcs = []
+    for script in self._soup.findAll('script', src=True):
+      srcs.append(script['src'])
+    return srcs
- def AppendHTMLInlineStyleContent(self, text):
- self._chunks.append(_Chunk(CHUNK_INLINE_STYLE_OP, text))
+  @property
+  def inline_scripts(self):
+    """InlineScript wrappers for every script element that has no src.
+
+    Each wrapper is built from the element's `string` attribute. The list is
+    built on first access and the same list object is returned afterwards.
+    """
+    # Test for None, not falsiness: a document without inline scripts caches
+    # an empty list, which would otherwise be rebuilt on every access.
+    if self._inline_scripts is None:
+      tags = self._soup.findAll('script', src=None)
+      # NOTE(review): for an element with no text, t.string is presumably
+      # None, which InlineScript.contents cannot handle -- confirm whether
+      # empty script elements can reach this point.
+      self._inline_scripts = [InlineScript(t.string) for t in tags]
+    return self._inline_scripts
- def AppendHTMLScriptSplicePoint(self, href):
- self._chunks.append(_Chunk(CHUNK_SCRIPT_OP, href))
+  @property
+  def imports(self):
+    """href attribute values of every link element with rel='import'."""
+    hrefs = []
+    for link in self._soup.findAll('link', rel='import'):
+      hrefs.append(link['href'])
+    return hrefs
- def AppendHTMLStylesheetSplicePoint(self, href):
- self._chunks.append(_Chunk(CHUNK_STYLESHEET_OP, href))
+  @property
+  def stylesheets(self):
+    """href attribute values of every link element with rel='stylesheet'."""
+    hrefs = []
+    for link in self._soup.findAll('link', rel='stylesheet'):
+      hrefs.append(link['href'])
+    return hrefs
- def GenerateHTML(self, controller):
- return ''.join(list(self.YieldHTMLInPieces(controller)))
+  @property
+  def inline_stylesheets(self):
+    """str() of the `string` attribute of every style element, in order."""
+    texts = []
+    for style in self._soup.findAll('style'):
+      texts.append(str(style.string))
+    return texts
def YieldHTMLInPieces(self, controller):
- for chunk in self._chunks:
- if chunk.op == CHUNK_TEXT_OP:
- yield chunk.data
- elif chunk.op == CHUNK_INLINE_STYLE_OP:
- html = controller.GetHTMLForInlineStylesheet(chunk.data)
- if html:
- yield html
- elif chunk.op == CHUNK_SCRIPT_OP:
- html = controller.GetHTMLForScriptHRef(chunk.data)
- if html:
- yield html
- elif chunk.op == CHUNK_STYLESHEET_OP:
- html = controller.GetHTMLForStylesheetHRef(chunk.data)
- if html:
- yield html
+ yield self.GenerateHTML(controller)  # the generator now yields once: the whole GenerateHTML() result
+
+  def GenerateHTML(self, controller):
+    """Return the document as an HTML string with module plumbing removed.
+
+    Works on a re-parsed copy, so the tree held by this object is not
+    modified. The doctype declaration, every link with rel='import' and every
+    script element (with or without src) are removed. Each style element and
+    each link with rel='stylesheet' is replaced by what the controller
+    returns for it, or removed when the controller returns a falsy value.
+
+    Args:
+      controller: object providing GetHTMLForInlineStylesheet(text) and
+          GetHTMLForStylesheetHRef(href).
+
+    Returns:
+      str() of the modified copy.
+    """
+    soup = BeautifulSoup.BeautifulSoup(str(self._soup))
+
+    # Remove decl. Loop over a snapshot: extract() takes the node out of
+    # soup.contents, and shrinking a list while iterating over it makes the
+    # loop skip the element that follows the removed one.
+    for x in list(soup.contents):
+      if _IsDoctype(x):
+        x.extract()
+
+    # Remove all imports
+    for imp in soup.findAll('link', rel='import'):
+      imp.extract()
+
+    # Remove all script links
+    for script in soup.findAll('script', src=True):
+      script.extract()
+
+    # Remove all inline script
+    for script in soup.findAll('script', src=None):
+      script.extract()
+
+    # Process all inline styles
+    for style in soup.findAll('style'):
+      html = controller.GetHTMLForInlineStylesheet(str(style.string))
+      if html:
+        ns = BeautifulSoup.Tag(soup, 'style')
+        ns.append(BeautifulSoup.NavigableString(html))
+        style.replaceWith(ns)
+      else:
+        style.extract()
+
+    # Rewrite all external stylesheet hrefs or remove, as needed
+    for stylesheet_link in soup.findAll('link', rel='stylesheet'):
+      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
+      if html:
+        # The controller's markup must parse to exactly one element, which
+        # takes the place of the original link.
+        tmp = BeautifulSoup.BeautifulSoup(html).findChildren()
+        assert len(tmp) == 1
+        stylesheet_link.replaceWith(tmp[0])
       else:
-        raise NotImplementedError()
+        stylesheet_link.extract()
+
+    return str(soup)
@property
def html_contents_without_links_and_script(self):
return self.GenerateHTML(html_generation_controller.HTMLGenerationController())  # GenerateHTML() driven by a default-constructed controller
-_SELF_CLOSING_TAGS = ('link', 'p', 'meta')
-
class _Tag(object):
def __init__(self, tag, attrs):
self.tag = tag
attr_string = ' '.join(['%s="%s"' % (x[0], x[1]) for x in self.attrs])
return '<%s %s>' % (self.tag, attr_string)
-class HTMLModuleParser(HTMLParser):
- def __init__(self):
- HTMLParser.__init__(self)
- self.current_results = None
- self.current_inline_script = None
- self._current_inline_style_sheet_contents = None
- self.open_tags = []
-
+class HTMLModuleParser():
def Parse(self, html):
- results = HTMLModuleParserResults()
- if html is None or len(html) == 0:
- return results
- if html.find('< /script>') != -1:
- raise Exception('Escape script tags with <\/script>')
- self.current_results = results
- self.feed(html)
- self.current_results = None
- if len(self.open_tags):
- raise Exception('There were open tags: %s' % ','.join(self.open_tags))
- return results
-
- def handle_decl(self, decl):
- assert self.current_results.has_decl == False, 'Only one doctype decl allowed'
- self.current_results.has_decl = True
-
- def handle_starttag(self, tag, attrs):
- if tag == 'br':
- raise Exception('Must use <br/>')
-
- if tag not in _SELF_CLOSING_TAGS:
- self.open_tags.append(_Tag(tag, attrs))
-
- if tag == 'link':
- is_stylesheet = False
- is_import = False
- href = None
- for attr in attrs:
- if attr[0] == 'rel' and attr[1] == 'stylesheet':
- is_stylesheet = True
- elif attr[0] == 'rel' and attr[1] == 'import':
- is_import = True
- elif attr[0] == 'href':
- href = attr[1]
-
- if is_stylesheet:
- self.current_results.AppendHTMLStylesheetSplicePoint(href)
- self.current_results.stylesheets.append(href)
- elif is_import:
- self.current_results.imports.append(href)
- else:
- self.current_results.AppendHTMLContent(
- self.get_starttag_text())
-
- elif tag == 'script':
- had_src = False
- for attr in attrs:
- if attr[0] == 'src':
- self.current_results.scripts_external.append(attr[1])
- self.current_results.AppendHTMLScriptSplicePoint(attr[1])
- had_src = True
- if had_src == False:
- assert self.current_inline_script == None
- self.current_inline_script = InlineScript(
- '',
- list(self.open_tags[:-1]))
-
- elif tag == 'style':
- self._current_inline_style_sheet_contents = ''
- self.current_results.AppendHTMLContent(
- self.get_starttag_text())
-
+ if html is None:
+ html = ''  # a missing document is handled as an empty one
else:
- self.current_results.AppendHTMLContent(
- self.get_starttag_text())
-
- def handle_entityref(self, name):
- self.current_results.AppendHTMLContent('&%s;' % name)
-
- def handle_charref(self, name):
- self.current_results.AppendHTMLContent('&#%s;' % name)
-
- def handle_startendtag(self, tag, attrs):
- if (tag == 'script'):
- raise Exception('Script must have explicit close tag')
- self.current_results.AppendHTMLContent('%s' % self.get_starttag_text())
-
- def handle_endtag(self, tag):
- if tag not in _SELF_CLOSING_TAGS:
- if len(self.open_tags) == 0:
- raise Exception('got </%s> with no previous open tag' % tag)
-
- if self.open_tags[-1].tag != tag:
- raise Exception('Expected </%s> but got </%s>' % (
- self.open_tags[-1].tag, tag))
- self.open_tags.pop()
-
- if tag == 'script':
- if self.current_inline_script:
- self.current_results.inline_scripts.append(
- self.current_inline_script)
- self.current_inline_script = None
-
- elif tag == 'style':
- if self._current_inline_style_sheet_contents != None:
- self.current_results.AppendHTMLInlineStyleContent(
- self._current_inline_style_sheet_contents)
- self._current_inline_style_sheet_contents = None
- self.current_results.AppendHTMLContent('</style>')
+ if html.find('< /script>') != -1:  # true when html contains the literal text '< /script>'
+ raise Exception('Escape script tags with <\/script>')
- else:
- self.current_results.AppendHTMLContent("</%s>" % tag)
-
- def handle_data(self, data):
- if self.current_inline_script:
- self.current_inline_script.contents += data
-
- elif self._current_inline_style_sheet_contents != None:
- result = re.match(r"\s*@import url\(([^\)]*)\)", data,
- flags=re.IGNORECASE)
- if result:
- raise Exception("@import not yet supported")
- self._current_inline_style_sheet_contents += data
- else:
- self.current_results.AppendHTMLContent(data)
+ return HTMLModuleParserResults(html)  # the results constructor runs BeautifulSoup over html