Upstream version 10.39.225.0
[platform/framework/web/crosswalk.git] / src / third_party / trace-viewer / third_party / tvcm / tvcm / parse_html_deps.py
index 9533da8..af3b13d 100644 (file)
@@ -3,28 +3,33 @@
 # found in the LICENSE file.
 
 import re
+import os
+import sys
 from HTMLParser import HTMLParser
 
 from tvcm import module
 from tvcm import strip_js_comments
 from tvcm import html_generation_controller
 
+def _InitBeautifulSoup():  # Puts <parent dir>/third_party/beautifulsoup on sys.path, at most once.
+  tvcm_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))  # parent of this file's directory
+  bs_path = os.path.join(tvcm_path, 'third_party', 'beautifulsoup')
+  if bs_path in sys.path:  # already registered; do not insert a duplicate entry
+    return
+  sys.path.insert(0, bs_path)  # index 0, so this directory is searched before the other entries
+_InitBeautifulSoup()
+import BeautifulSoup
 
-CHUNK_TEXT_OP = 'text-op'
-CHUNK_SCRIPT_OP = 'script-op'
-CHUNK_STYLESHEET_OP = 'stylesheet-op'
-CHUNK_INLINE_STYLE_OP = 'inline-style-op'
-
-class _Chunk(object):
-  def __init__(self, op, data):
-    self.op = op
-    self.data = data
 
 class InlineScript(object):
-  def __init__(self, contents, open_tags):
-    self.contents = contents
-    self.open_tags = open_tags
+  def __init__(self, soup):  # soup: parsed node whose .string and .parent are read by the properties
+    self._soup = soup
     self._stripped_contents = None
+    self._open_tags = None  # cache, filled on first read of the open_tags property
+
+  @property
+  def contents(self):  # Script text: the wrapped node's .string converted with str().
+    return str(self._soup.string)
 
   @property
   def stripped_contents(self):
@@ -33,60 +38,126 @@ class InlineScript(object):
           self.contents)
     return self._stripped_contents
 
+  @property
+  def open_tags(self):  # List of _Tag for the elements enclosing the script, outermost first; cached.
+    if self._open_tags is not None:  # an empty list is a valid cached result too
+      return self._open_tags
+    open_tags = []
+    cur = self._soup.parent
+    while cur:  # walk upward, starting at the wrapped node's parent
+      if isinstance(cur, BeautifulSoup.BeautifulSoup):  # reached the document object; stop
+        break
+
+      open_tags.append(_Tag(cur.name, cur.attrs))
+      cur = cur.parent
+
+    open_tags.reverse()  # collected innermost-first; flip to outermost-first
+    assert open_tags[-1].tag == 'script'  # the innermost ancestor must be the <script> itself
+    del open_tags[-1]  # the script tag is not one of its own enclosing tags
+
+    self._open_tags = open_tags
+    return self._open_tags
+
+
+def _IsDoctype(x):  # True only for a Declaration node equal to 'DOCTYPE html' or 'DOCTYPE HTML'.
+  if not isinstance(x, BeautifulSoup.Declaration):  # any other node type is never a doctype
+    return False
+  return x == 'DOCTYPE html' or x == 'DOCTYPE HTML'  # exact comparison against these two spellings
+
 class HTMLModuleParserResults(object):
-  def __init__(self):
-    self.scripts_external = []
-    self.inline_scripts = []
-    self.stylesheets = []
-    self.imports = []
-    self.has_decl = False
-    self._chunks = []
+  def __init__(self, html):
+    self._soup = BeautifulSoup.BeautifulSoup(html)
+    self._inline_scripts = None
 
   @property
-  def inline_stylesheets(self):
-    return [x.data for x in self._chunks
-            if x.op == CHUNK_INLINE_STYLE_OP]
+  def has_decl(self):  # True when exactly one top-level node satisfies _IsDoctype.
+    decls = [x for x in self._soup.contents  # only direct children of the document are examined
+             if _IsDoctype(x)]
+    return len(decls) == 1  # zero or several doctypes both yield False
 
-  def AppendHTMLContent(self, text):
-    self._chunks.append(_Chunk(CHUNK_TEXT_OP, text))
+  @property
+  def scripts_external(self):  # src values of every <script> found with findAll(..., src=True).
+    tags = self._soup.findAll('script', src=True)  # src=True: the attribute must be present
+    return [t['src'] for t in tags]
 
-  def AppendHTMLInlineStyleContent(self, text):
-    self._chunks.append(_Chunk(CHUNK_INLINE_STYLE_OP, text))
+  @property
+  def inline_scripts(self):  # InlineScript objects for every <script> without src; built once, then cached.
+    if self._inline_scripts is None:  # None means "not built yet"; [] is a valid cached result
+      tags = self._soup.findAll('script', src=None)  # src=None: the attribute must be absent
+      self._inline_scripts = [InlineScript(t.string) for t in tags]  # passes t.string, not the tag itself
+    return self._inline_scripts
 
-  def AppendHTMLScriptSplicePoint(self, href):
-    self._chunks.append(_Chunk(CHUNK_SCRIPT_OP, href))
+  @property
+  def imports(self):  # href values of every <link rel="import">.
+    tags = self._soup.findAll('link', rel='import')
+    return [t['href'] for t in tags]  # indexing with ['href'] assumes each matched link carries one
 
-  def AppendHTMLStylesheetSplicePoint(self, href):
-    self._chunks.append(_Chunk(CHUNK_STYLESHEET_OP, href))
+  @property
+  def stylesheets(self):  # href values of every <link rel="stylesheet">.
+    tags = self._soup.findAll('link', rel='stylesheet')
+    return [t['href'] for t in tags]  # indexing with ['href'] assumes each matched link carries one
 
-  def GenerateHTML(self, controller):
-    return ''.join(list(self.YieldHTMLInPieces(controller)))
+  @property
+  def inline_stylesheets(self):  # One str per <style> element, taken from the tag's .string.
+    tags = self._soup.findAll('style')
+    return [str(t.string) for t in tags]  # str() is applied to whatever .string holds for each tag
 
   def YieldHTMLInPieces(self, controller):
-    for chunk in self._chunks:
-      if chunk.op == CHUNK_TEXT_OP:
-        yield chunk.data
-      elif chunk.op == CHUNK_INLINE_STYLE_OP:
-        html = controller.GetHTMLForInlineStylesheet(chunk.data)
-        if html:
-          yield html
-      elif chunk.op == CHUNK_SCRIPT_OP:
-        html = controller.GetHTMLForScriptHRef(chunk.data)
-        if html:
-          yield html
-      elif chunk.op == CHUNK_STYLESHEET_OP:
-        html = controller.GetHTMLForStylesheetHRef(chunk.data)
-        if html:
-          yield html
+    yield self.GenerateHTML(controller)  # a single piece: the whole generated document as one string
+
+  def GenerateHTML(self, controller):  # Returns the document as str with doctype, imports and scripts removed and styles rewritten via controller.
+    soup = BeautifulSoup.BeautifulSoup(str(self._soup))  # re-parse into a private copy; self._soup stays untouched
+
+    # Remove decl. Iterate a snapshot: extract() edits soup.contents in place.
+    for x in list(soup.contents):
+      if isinstance(x, BeautifulSoup.Declaration):
+        if _IsDoctype(x):
+          x.extract()
+
+    # Remove all imports
+    imports = soup.findAll('link', rel='import')
+    for imp in imports:
+      imp.extract()
+
+    # Remove all script links
+    scripts_external = soup.findAll('script', src=True)
+    for script in scripts_external:
+      script.extract()
+
+    # Remove all inline script
+    scripts_inline = soup.findAll('script', src=None)
+    for script in scripts_inline:
+      script.extract()
+
+    # Process all inline styles
+    inline_styles = soup.findAll('style')
+    for style in inline_styles:
+      html = controller.GetHTMLForInlineStylesheet(str(style.string))
+      if html:  # controller supplied replacement text: swap in a fresh <style> holding it
+        ns = BeautifulSoup.Tag(soup, 'style')
+        ns.append(BeautifulSoup.NavigableString(html))
+        style.replaceWith(ns)
+      else:  # a falsy result means the style is dropped from the output
+        style.extract()
+
+    # Rewrite all external stylesheet hrefs or remove, as needed
+    stylesheet_links = soup.findAll('link', rel='stylesheet')
+    for stylesheet_link in stylesheet_links:
+      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
+      if html:
+        tmp = BeautifulSoup.BeautifulSoup(html).findChildren()  # parse the controller's markup
+        assert len(tmp) == 1  # the replacement must parse to exactly one element
+        stylesheet_link.replaceWith(tmp[0])
       else:
-        raise NotImplementedError()
+        stylesheet_link.extract()
+
+    # Serialize the edited copy.
+    return str(soup)
 
   @property
   def html_contents_without_links_and_script(self):
     return self.GenerateHTML(html_generation_controller.HTMLGenerationController())
 
-_SELF_CLOSING_TAGS = ('link', 'p', 'meta')
-
 class _Tag(object):
   def __init__(self, tag, attrs):
     self.tag = tag
@@ -96,127 +167,12 @@ class _Tag(object):
     attr_string = ' '.join(['%s="%s"' % (x[0], x[1]) for x in self.attrs])
     return '<%s %s>' % (self.tag, attr_string)
 
-class HTMLModuleParser(HTMLParser):
-  def __init__(self):
-    HTMLParser.__init__(self)
-    self.current_results = None
-    self.current_inline_script = None
-    self._current_inline_style_sheet_contents = None
-    self.open_tags = []
-
+class HTMLModuleParser():  # Entry point: Parse() turns HTML text into an HTMLModuleParserResults.
   def Parse(self, html):
-    results = HTMLModuleParserResults()
-    if html is None or len(html) == 0:
-      return results
-    if html.find('< /script>') != -1:
-      raise Exception('Escape script tags with <\/script>')
-    self.current_results = results
-    self.feed(html)
-    self.current_results = None
-    if len(self.open_tags):
-      raise Exception('There were open tags: %s' % ','.join(self.open_tags))
-    return results
-
-  def handle_decl(self, decl):
-    assert self.current_results.has_decl == False, 'Only one doctype decl allowed'
-    self.current_results.has_decl = True
-
-  def handle_starttag(self, tag, attrs):
-    if tag == 'br':
-      raise Exception('Must use <br/>')
-
-    if tag not in _SELF_CLOSING_TAGS:
-      self.open_tags.append(_Tag(tag, attrs))
-
-    if tag == 'link':
-      is_stylesheet = False
-      is_import = False
-      href = None
-      for attr in attrs:
-        if attr[0] == 'rel' and attr[1] == 'stylesheet':
-          is_stylesheet = True
-        elif attr[0] == 'rel' and attr[1] == 'import':
-          is_import = True
-        elif attr[0] == 'href':
-          href = attr[1]
-
-      if is_stylesheet:
-        self.current_results.AppendHTMLStylesheetSplicePoint(href)
-        self.current_results.stylesheets.append(href)
-      elif is_import:
-        self.current_results.imports.append(href)
-      else:
-        self.current_results.AppendHTMLContent(
-          self.get_starttag_text())
-
-    elif tag == 'script':
-      had_src = False
-      for attr in attrs:
-        if attr[0] == 'src':
-          self.current_results.scripts_external.append(attr[1])
-          self.current_results.AppendHTMLScriptSplicePoint(attr[1])
-          had_src = True
-      if had_src == False:
-        assert self.current_inline_script == None
-        self.current_inline_script = InlineScript(
-            '',
-            list(self.open_tags[:-1]))
-
-    elif tag == 'style':
-      self._current_inline_style_sheet_contents = ''
-      self.current_results.AppendHTMLContent(
-        self.get_starttag_text())
-
+    if html is None:  # treat a missing document as an empty one
+      html = ''
     else:
-      self.current_results.AppendHTMLContent(
-        self.get_starttag_text())
-
-  def handle_entityref(self, name):
-    self.current_results.AppendHTMLContent('&%s;' % name)
-
-  def handle_charref(self, name):
-    self.current_results.AppendHTMLContent('&#%s;' % name)
-
-  def handle_startendtag(self, tag, attrs):
-    if (tag == 'script'):
-      raise Exception('Script must have explicit close tag')
-    self.current_results.AppendHTMLContent('%s' % self.get_starttag_text())
-
-  def handle_endtag(self, tag):
-    if tag not in _SELF_CLOSING_TAGS:
-      if len(self.open_tags) == 0:
-        raise Exception('got </%s> with no previous open tag' % tag)
-
-      if self.open_tags[-1].tag != tag:
-        raise Exception('Expected </%s> but got </%s>' % (
-            self.open_tags[-1].tag, tag))
-      self.open_tags.pop()
-
-    if tag == 'script':
-      if self.current_inline_script:
-        self.current_results.inline_scripts.append(
-            self.current_inline_script)
-        self.current_inline_script = None
-
-    elif tag == 'style':
-      if self._current_inline_style_sheet_contents != None:
-        self.current_results.AppendHTMLInlineStyleContent(
-            self._current_inline_style_sheet_contents)
-        self._current_inline_style_sheet_contents = None
-      self.current_results.AppendHTMLContent('</style>')
+      if html.find('< /script>') != -1:  # reject input containing the literal text '< /script>'
+        raise Exception('Escape script tags with <\/script>')
 
-    else:
-      self.current_results.AppendHTMLContent("</%s>" % tag)
-
-  def handle_data(self, data):
-    if self.current_inline_script:
-      self.current_inline_script.contents += data
-
-    elif self._current_inline_style_sheet_contents != None:
-      result = re.match(r"\s*@import url\(([^\)]*)\)", data,
-                        flags=re.IGNORECASE)
-      if result:
-        raise Exception("@import not yet supported")
-      self._current_inline_style_sheet_contents += data
-    else:
-      self.current_results.AppendHTMLContent(data)
+    return HTMLModuleParserResults(html)  # the results object parses html in its constructor