Upload Tizen:Base source
[toolchains/python-lxml.git] / doc / html / html5parser.html
1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
2 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
3 <head>
4 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
5 <meta name="generator" content="Docutils 0.5: http://docutils.sourceforge.net/" />
6 <title>html5lib Parser</title>
7 <link rel="stylesheet" href="style.css" type="text/css" />
8 </head>
9 <body>
10 <div class="document" id="html5lib-parser">
11 <div class="sidemenu"><ul id="lxml-section"><li><span class="section title">lxml</span><ul class="menu foreign" id="index-menu"><li class="menu title"><a href="index.html">lxml</a><ul class="submenu"><li class="menu item"><a href="index.html#introduction">Introduction</a></li><li class="menu item"><a href="index.html#documentation">Documentation</a></li><li class="menu item"><a href="index.html#download">Download</a></li><li class="menu item"><a href="index.html#mailing-list">Mailing list</a></li><li class="menu item"><a href="index.html#bug-tracker">Bug tracker</a></li><li class="menu item"><a href="index.html#license">License</a></li><li class="menu item"><a href="index.html#old-versions">Old Versions</a></li></ul></li></ul><ul class="menu foreign" id="intro-menu"><li class="menu title"><a href="intro.html">Why lxml?</a><ul class="submenu"><li class="menu item"><a href="intro.html#motto">Motto</a></li><li class="menu item"><a href="intro.html#aims">Aims</a></li></ul></li></ul><ul class="menu foreign" id="installation-menu"><li class="menu title"><a href="installation.html">Installing lxml</a><ul class="submenu"><li class="menu item"><a href="installation.html#requirements">Requirements</a></li><li class="menu item"><a href="installation.html#installation">Installation</a></li><li class="menu item"><a href="installation.html#building-lxml-from-sources">Building lxml from sources</a></li><li class="menu item"><a href="installation.html#ms-windows">MS Windows</a></li><li class="menu item"><a href="installation.html#macos-x">MacOS-X</a></li></ul></li></ul><ul class="menu foreign" id="lxml2-menu"><li class="menu title"><a href="lxml2.html">What's new in lxml 2.0?</a><ul class="submenu"><li class="menu item"><a href="lxml2.html#changes-in-etree-and-objectify">Changes in etree and objectify</a></li><li class="menu item"><a href="lxml2.html#new-modules">New modules</a></li></ul></li></ul><ul class="menu foreign" id="performance-menu"><li class="menu title"><a href="performance.html">Benchmarks and Speed</a><ul class="submenu"><li class="menu item"><a href="performance.html#general-notes">General notes</a></li><li class="menu item"><a href="performance.html#how-to-read-the-timings">How to read the timings</a></li><li class="menu item"><a href="performance.html#parsing-and-serialising">Parsing and Serialising</a></li><li class="menu item"><a href="performance.html#the-elementtree-api">The ElementTree API</a></li><li class="menu item"><a href="performance.html#xpath">XPath</a></li><li class="menu item"><a href="performance.html#a-longer-example">A longer example</a></li><li class="menu item"><a href="performance.html#lxml-objectify">lxml.objectify</a></li></ul></li></ul><ul class="menu foreign" id="compatibility-menu"><li class="menu title"><a href="compatibility.html">ElementTree compatibility of lxml.etree</a></li></ul><ul class="menu foreign" id="FAQ-menu"><li class="menu title"><a href="FAQ.html">lxml FAQ - Frequently Asked Questions</a><ul class="submenu"><li class="menu item"><a href="FAQ.html#general-questions">General Questions</a></li><li class="menu item"><a href="FAQ.html#installation">Installation</a></li><li class="menu item"><a href="FAQ.html#contributing">Contributing</a></li><li class="menu item"><a href="FAQ.html#bugs">Bugs</a></li><li class="menu item"><a href="FAQ.html#threading">Threading</a></li><li class="menu item"><a href="FAQ.html#parsing-and-serialisation">Parsing and Serialisation</a></li><li class="menu item"><a href="FAQ.html#xpath-and-document-traversal">XPath and Document Traversal</a></li></ul></li></ul></li></ul><ul id="Developing with lxml-section"><li><span class="section title">Developing with lxml</span><ul class="menu foreign" id="tutorial-menu"><li class="menu title"><a href="tutorial.html">The lxml.etree Tutorial</a><ul class="submenu"><li class="menu item"><a href="tutorial.html#the-element-class">The Element class</a></li><li class="menu item"><a href="tutorial.html#the-elementtree-class">The ElementTree class</a></li><li class="menu item"><a href="tutorial.html#parsing-from-strings-and-files">Parsing from strings and files</a></li><li class="menu item"><a href="tutorial.html#namespaces">Namespaces</a></li><li class="menu item"><a href="tutorial.html#the-e-factory">The E-factory</a></li><li class="menu item"><a href="tutorial.html#elementpath">ElementPath</a></li></ul></li></ul><ul class="menu foreign" id="api index-menu"><li class="menu title"><a href="api/index.html">API reference</a></li></ul><ul class="menu foreign" id="api-menu"><li class="menu title"><a href="api.html">APIs specific to lxml.etree</a><ul class="submenu"><li class="menu item"><a href="api.html#lxml-etree">lxml.etree</a></li><li class="menu item"><a href="api.html#other-element-apis">Other Element APIs</a></li><li class="menu item"><a href="api.html#trees-and-documents">Trees and Documents</a></li><li class="menu item"><a href="api.html#iteration">Iteration</a></li><li class="menu item"><a href="api.html#error-handling-on-exceptions">Error handling on exceptions</a></li><li class="menu item"><a href="api.html#error-logging">Error logging</a></li><li class="menu item"><a href="api.html#serialisation">Serialisation</a></li><li class="menu item"><a href="api.html#cdata">CDATA</a></li><li class="menu item"><a href="api.html#xinclude-and-elementinclude">XInclude and ElementInclude</a></li><li class="menu item"><a href="api.html#write-c14n-on-elementtree">write_c14n on ElementTree</a></li></ul></li></ul><ul class="menu foreign" id="parsing-menu"><li class="menu title"><a href="parsing.html">Parsing XML and HTML with lxml</a><ul class="submenu"><li class="menu item"><a href="parsing.html#parsers">Parsers</a></li><li class="menu item"><a href="parsing.html#the-target-parser-interface">The target parser interface</a></li><li class="menu item"><a href="parsing.html#the-feed-parser-interface">The feed parser interface</a></li><li class="menu item"><a href="parsing.html#iterparse-and-iterwalk">iterparse and iterwalk</a></li><li class="menu item"><a href="parsing.html#python-unicode-strings">Python unicode strings</a></li></ul></li></ul><ul class="menu foreign" id="validation-menu"><li class="menu title"><a href="validation.html">Validation with lxml</a><ul class="submenu"><li class="menu item"><a href="validation.html#validation-at-parse-time">Validation at parse time</a></li><li class="menu item"><a href="validation.html#dtd">DTD</a></li><li class="menu item"><a href="validation.html#relaxng">RelaxNG</a></li><li class="menu item"><a href="validation.html#xmlschema">XMLSchema</a></li><li class="menu item"><a href="validation.html#schematron">Schematron</a></li></ul></li></ul><ul class="menu foreign" id="xpathxslt-menu"><li class="menu title"><a href="xpathxslt.html">XPath and XSLT with lxml</a><ul class="submenu"><li class="menu item"><a href="xpathxslt.html#xpath">XPath</a></li><li class="menu item"><a href="xpathxslt.html#xslt">XSLT</a></li></ul></li></ul><ul class="menu foreign" id="objectify-menu"><li class="menu title"><a href="objectify.html">lxml.objectify</a><ul class="submenu"><li class="menu item"><a href="objectify.html#the-lxml-objectify-api">The lxml.objectify API</a></li><li class="menu item"><a href="objectify.html#asserting-a-schema">Asserting a Schema</a></li><li class="menu item"><a href="objectify.html#objectpath">ObjectPath</a></li><li class="menu item"><a href="objectify.html#python-data-types">Python data types</a></li><li class="menu item"><a href="objectify.html#how-data-types-are-matched">How data types are matched</a></li><li class="menu item"><a href="objectify.html#what-is-different-from-lxml-etree">What is different from lxml.etree?</a></li></ul></li></ul><ul class="menu foreign" id="lxmlhtml-menu"><li class="menu title"><a href="lxmlhtml.html">lxml.html</a><ul class="submenu"><li class="menu item"><a href="lxmlhtml.html#parsing-html">Parsing HTML</a></li><li class="menu item"><a href="lxmlhtml.html#html-element-methods">HTML Element Methods</a></li><li class="menu item"><a href="lxmlhtml.html#running-html-doctests">Running HTML doctests</a></li><li class="menu item"><a href="lxmlhtml.html#creating-html-with-the-e-factory">Creating HTML with the E-factory</a></li><li class="menu item"><a href="lxmlhtml.html#working-with-links">Working with links</a></li><li class="menu item"><a href="lxmlhtml.html#forms">Forms</a></li><li class="menu item"><a href="lxmlhtml.html#cleaning-up-html">Cleaning up HTML</a></li><li class="menu item"><a href="lxmlhtml.html#html-diff">HTML Diff</a></li><li class="menu item"><a href="lxmlhtml.html#examples">Examples</a></li></ul></li></ul><ul class="menu foreign" id="cssselect-menu"><li class="menu title"><a href="cssselect.html">lxml.cssselect</a><ul class="submenu"><li class="menu item"><a href="cssselect.html#the-cssselector-class">The CSSSelector class</a></li><li class="menu item"><a href="cssselect.html#css-selectors">CSS Selectors</a></li><li class="menu item"><a href="cssselect.html#namespaces">Namespaces</a></li><li class="menu item"><a href="cssselect.html#limitations">Limitations</a></li></ul></li></ul><ul class="menu foreign" id="elementsoup-menu"><li class="menu title"><a href="elementsoup.html">BeautifulSoup Parser</a><ul class="submenu"><li class="menu item"><a href="elementsoup.html#parsing-with-the-soupparser">Parsing with the soupparser</a></li><li class="menu item"><a href="elementsoup.html#entity-handling">Entity handling</a></li><li class="menu item"><a href="elementsoup.html#using-soupparser-as-a-fallback">Using soupparser as a fallback</a></li><li class="menu item"><a href="elementsoup.html#using-only-the-encoding-detection">Using only the encoding detection</a></li></ul></li></ul><ul class="menu current" id="html5parser-menu"><li class="menu title"><a href="html5parser.html">html5lib Parser</a><ul class="submenu"><li class="menu item"><a href="html5parser.html#differences-to-regular-html-parsing">Differences to regular HTML parsing</a></li><li class="menu item"><a href="html5parser.html#function-reference">Function Reference</a></li></ul></li></ul></li></ul><ul id="Extending lxml-section"><li><span class="section title">Extending lxml</span><ul class="menu foreign" id="resolvers-menu"><li class="menu title"><a href="resolvers.html">Document loading and URL resolving</a><ul class="submenu"><li class="menu item"><a href="resolvers.html#uri-resolvers">URI Resolvers</a></li><li class="menu item"><a href="resolvers.html#document-loading-in-context">Document loading in context</a></li><li class="menu item"><a href="resolvers.html#i-o-access-control-in-xslt">I/O access control in XSLT</a></li></ul></li></ul><ul class="menu foreign" id="extensions-menu"><li class="menu title"><a href="extensions.html">Python extensions for XPath and XSLT</a><ul class="submenu"><li class="menu item"><a href="extensions.html#xpath-extension-functions">XPath Extension functions</a></li><li class="menu item"><a href="extensions.html#xslt-extension-elements">XSLT extension elements</a></li></ul></li></ul><ul class="menu foreign" id="element classes-menu"><li class="menu title"><a href="element_classes.html">Using custom Element classes in lxml</a><ul class="submenu"><li class="menu item"><a href="element_classes.html#background-on-element-proxies">Background on Element proxies</a></li><li class="menu item"><a href="element_classes.html#element-initialization">Element initialization</a></li><li class="menu item"><a href="element_classes.html#setting-up-a-class-lookup-scheme">Setting up a class lookup scheme</a></li><li class="menu item"><a href="element_classes.html#generating-xml-with-custom-classes">Generating XML with custom classes</a></li><li class="menu item"><a href="element_classes.html#implementing-namespaces">Implementing namespaces</a></li></ul></li></ul><ul class="menu foreign" id="sax-menu"><li class="menu title"><a href="sax.html">Sax support</a><ul class="submenu"><li class="menu item"><a href="sax.html#building-a-tree-from-sax-events">Building a tree from SAX events</a></li><li class="menu item"><a href="sax.html#producing-sax-events-from-an-elementtree-or-element">Producing SAX events from an ElementTree or Element</a></li><li class="menu item"><a href="sax.html#interfacing-with-pulldom-minidom">Interfacing with pulldom/minidom</a></li></ul></li></ul><ul class="menu foreign" id="capi-menu"><li class="menu title"><a href="capi.html">The public C-API of lxml.etree</a><ul class="submenu"><li class="menu item"><a href="capi.html#writing-external-modules-in-cython">Writing external modules in Cython</a></li><li class="menu item"><a href="capi.html#writing-external-modules-in-c">Writing external modules in C</a></li></ul></li></ul></li></ul><ul id="Developing lxml-section"><li><span class="section title">Developing lxml</span><ul class="menu foreign" id="build-menu"><li class="menu title"><a href="build.html">How to build lxml from source</a><ul class="submenu"><li class="menu item"><a href="build.html#cython">Cython</a></li><li class="menu item"><a href="build.html#subversion">Subversion</a></li><li class="menu item"><a href="build.html#setuptools">Setuptools</a></li><li class="menu item"><a href="build.html#running-the-tests-and-reporting-errors">Running the tests and reporting errors</a></li><li class="menu item"><a href="build.html#building-an-egg">Building an egg</a></li><li class="menu item"><a href="build.html#building-lxml-on-macos-x">Building lxml on MacOS-X</a></li><li class="menu item"><a href="build.html#static-linking-on-windows">Static linking on Windows</a></li><li class="menu item"><a href="build.html#building-debian-packages-from-svn-sources">Building Debian packages from SVN sources</a></li></ul></li></ul><ul class="menu foreign" id="lxml source howto-menu"><li class="menu title"><a href="lxml-source-howto.html">How to read the source of lxml</a><ul class="submenu"><li class="menu item"><a href="lxml-source-howto.html#what-is-cython">What is Cython?</a></li><li class="menu item"><a href="lxml-source-howto.html#where-to-start">Where to start?</a></li><li class="menu item"><a href="lxml-source-howto.html#lxml-etree">lxml.etree</a></li><li class="menu item"><a href="lxml-source-howto.html#python-modules">Python modules</a></li><li class="menu item"><a href="lxml-source-howto.html#lxml-objectify">lxml.objectify</a></li><li class="menu item"><a href="lxml-source-howto.html#lxml-html">lxml.html</a></li></ul></li></ul><ul class="menu foreign" id="changes 2 2 3-menu"><li class="menu title"><a href="changes-2.2.3.html">Release Changelog</a></li></ul><ul class="menu foreign" id="credits-menu"><li class="menu title"><a href="credits.html">Credits</a><ul class="submenu"><li class="menu item"><a href="credits.html#main-contributors">Main contributors</a></li><li class="menu item"><a href="credits.html#special-thanks-goes-to">Special thanks goes to:</a></li></ul></li></ul></li></ul></div><h1 class="title">html5lib Parser</h1>
12
13 <p><a class="reference external" href="http://code.google.com/p/html5lib/">html5lib</a> is a Python package that implements the HTML5 parsing algorithm
14 which is heavily influenced by current browsers and based on the <a class="reference external" href="http://www.whatwg.org/specs/web-apps/current-work/">WHATWG
15 HTML5 specification</a>.</p>
16 <p>lxml can benefit from the parsing capabilities of <cite>html5lib</cite> through
17 the <tt class="docutils literal"><span class="pre">lxml.html.html5parser</span></tt> module.  It provides a similar interface
18 to the <tt class="docutils literal"><span class="pre">lxml.html</span></tt> module by providing <tt class="docutils literal"><span class="pre">fromstring()</span></tt>,
19 <tt class="docutils literal"><span class="pre">parse()</span></tt>, <tt class="docutils literal"><span class="pre">document_fromstring()</span></tt>, <tt class="docutils literal"><span class="pre">fragment_fromstring()</span></tt> and
20 <tt class="docutils literal"><span class="pre">fragments_fromstring()</span></tt> that work like the regular html parsing
21 functions.</p>
22 <div class="section" id="differences-to-regular-html-parsing">
23 <h1>Differences to regular HTML parsing</h1>
24 <p>There are a few differences in the returned tree to the regular HTML
25 parsing functions from <tt class="docutils literal"><span class="pre">lxml.html</span></tt>.  html5lib normalizes some elements
26 and element structures to a common format.  For example even if a tables
27 does not have a <cite>tbody</cite> html5lib will inject one automatically:</p>
28 <div class="syntax"><pre><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">lxml.html</span> <span class="kn">import</span> <span class="n">tostring</span><span class="p">,</span> <span class="n">html5parser</span>
29 <span class="gp">&gt;&gt;&gt; </span><span class="n">tostring</span><span class="p">(</span><span class="n">html5parser</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="s">"&lt;table&gt;&lt;td&gt;foo"</span><span class="p">))</span>
30 <span class="go">'&lt;table&gt;&lt;tbody&gt;&lt;tr&gt;&lt;td&gt;foo&lt;/td&gt;&lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;'</span>
31 </pre></div>
32 <p>Also the parameters the functions accept are different.</p>
33 </div>
34 <div class="section" id="function-reference">
35 <h1>Function Reference</h1>
36 <dl class="docutils">
37 <dt><tt class="docutils literal"><span class="pre">parse(filename_url_or_file)</span></tt>:</dt>
38 <dd>Parses the named file or url, or if the object has a <tt class="docutils literal"><span class="pre">.read()</span></tt>
39 method, parses from that.</dd>
40 <dt><tt class="docutils literal"><span class="pre">document_fromstring(html,</span> <span class="pre">guess_charset=True)</span></tt>:</dt>
41 <dd><p class="first">Parses a document from the given string.  This always creates a
42 correct HTML document, which means the parent node is <tt class="docutils literal"><span class="pre">&lt;html&gt;</span></tt>,
43 and there is a body and possibly a head.</p>
44 <p class="last">If a bytestring is passed and <tt class="docutils literal"><span class="pre">guess_charset</span></tt> is true the chardet
45 library (if installed) will guess the charset if ambiguities exist.</p>
46 </dd>
47 <dt><tt class="docutils literal"><span class="pre">fragment_fromstring(string,</span> <span class="pre">create_parent=False,</span> <span class="pre">guess_charset=False)</span></tt>:</dt>
48 <dd><p class="first">Returns an HTML fragment from a string.  The fragment must contain
49 just a single element, unless <tt class="docutils literal"><span class="pre">create_parent</span></tt> is given;
50 e.g,. <tt class="docutils literal"><span class="pre">fragment_fromstring(string,</span> <span class="pre">create_parent='div')</span></tt> will
51 wrap the element in a <tt class="docutils literal"><span class="pre">&lt;div&gt;</span></tt>.  If <tt class="docutils literal"><span class="pre">create_parent</span></tt> is true the
52 default parent tag (div) is used.</p>
53 <p class="last">If a bytestring is passed and <tt class="docutils literal"><span class="pre">guess_charset</span></tt> is true the chardet
54 library (if installed) will guess the charset if ambiguities exist.</p>
55 </dd>
56 <dt><tt class="docutils literal"><span class="pre">fragments_fromstring(string,</span> <span class="pre">no_leading_text=False,</span> <span class="pre">parser=None)</span></tt>:</dt>
57 <dd><p class="first">Returns a list of the elements found in the fragment.  The first item in
58 the list may be a string.  If <tt class="docutils literal"><span class="pre">no_leading_text</span></tt> is true, then it will
59 be an error if there is leading text, and it will always be a list of
60 only elements.</p>
61 <p class="last">If a bytestring is passed and <tt class="docutils literal"><span class="pre">guess_charset</span></tt> is true the chardet
62 library (if installed) will guess the charset if ambiguities exist.</p>
63 </dd>
64 <dt><tt class="docutils literal"><span class="pre">fromstring(string)</span></tt>:</dt>
65 <dd>Returns <tt class="docutils literal"><span class="pre">document_fromstring</span></tt> or <tt class="docutils literal"><span class="pre">fragment_fromstring</span></tt>, based
66 on whether the string looks like a full document, or just a
67 fragment.</dd>
68 </dl>
69 <p>Additionally all parsing functions accept an <tt class="docutils literal"><span class="pre">parser</span></tt> keyword argument
70 that can be set to a custom parser instance.  To create custom parsers
71 you can subclass the <tt class="docutils literal"><span class="pre">HTMLParser</span></tt> and <tt class="docutils literal"><span class="pre">XHTMLParser</span></tt> from the same
72 module.  Note that these are the parser classes provided by html5lib.</p>
73 </div>
74 </div>
75 <div class="footer">
76 <hr class="footer" />
77 Generated on: 2009-10-30.
78
79 </div>
80 </body>
81 </html>