1 <?xml version="1.0" encoding="ascii"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3 "DTD/xhtml1-transitional.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
6 <title>lxml.html.html5parser</title>
7 <link rel="stylesheet" href="epydoc.css" type="text/css" />
8 <script type="text/javascript" src="epydoc.js"></script>
11 <body bgcolor="white" text="black" link="blue" vlink="#204080"
13 <!-- ==================== NAVIGATION BAR ==================== -->
14 <table class="navbar" border="0" width="100%" cellpadding="0"
15 bgcolor="#a0c0ff" cellspacing="0">
18 <th> <a
19 href="lxml-module.html">Home</a> </th>
22 <th> <a
23 href="module-tree.html">Trees</a> </th>
26 <th> <a
27 href="identifier-index.html">Indices</a> </th>
30 <th> <a
31 href="help.html">Help</a> </th>
33 <!-- Project homepage -->
34 <th class="navbar" align="right" width="100%">
35 <table border="0" cellpadding="0" cellspacing="0">
36 <tr><th class="navbar" align="center"
37 ><a class="navbar" target="_top" href="/">lxml API</a></th>
41 <table width="100%" cellpadding="0" cellspacing="0">
44 <span class="breadcrumbs">
45 <a href="lxml-module.html">Package lxml</a> ::
46 <a href="lxml.html-module.html">Package html</a> ::
47 Module html5parser
51 <table cellpadding="0" cellspacing="0">
52 <!-- hide/show private -->
53 <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
54 onclick="toggle_private();">hide private</a>]</span></td></tr>
55 <tr><td align="right"><span class="options"
56 >[<a href="frames.html" target="_top">frames</a
57 >] | <a href="lxml.html.html5parser-module.html"
58 target="_top">no frames</a>]</span></td></tr>
63 <!-- ==================== MODULE DESCRIPTION ==================== -->
64 <h1 class="epydoc">Module html5parser</h1><p class="nomargin-top"><span class="codelink"><a href="lxml.html.html5parser-pysrc.html">source code</a></span></p>
65 An interface to html5lib that mimics the lxml.html interface.
67 <!-- ==================== CLASSES ==================== -->
68 <a name="section-Classes"></a>
69 <table class="summary" border="1" cellpadding="3"
70 cellspacing="0" width="100%" bgcolor="white">
71 <tr bgcolor="#70b0f0" class="table-header">
72 <td colspan="2" class="table-header">
73 <table border="0" cellpadding="0" cellspacing="0" width="100%">
75 <td align="left"><span class="table-header">Classes</span></td>
76 <td align="right" valign="top"
77 ><span class="options">[<a href="#section-Classes"
78 class="privatelink" onclick="toggle_private();"
79 >hide private</a>]</span></td>
85 <td width="15%" align="right" valign="top" class="summary">
86 <span class="summary-type"> </span>
87 </td><td class="summary">
88 <a href="lxml.html.html5parser.HTMLParser-class.html" class="summary-name">HTMLParser</a><br />
89 An html5lib HTML parser with lxml as tree.
93 <td width="15%" align="right" valign="top" class="summary">
94 <span class="summary-type"> </span>
95 </td><td class="summary">
96 <a href="lxml.html.html5parser.XHTMLParser-class.html" class="summary-name">XHTMLParser</a><br />
97 An html5lib XHTML parser with lxml as tree.
101 <!-- ==================== FUNCTIONS ==================== -->
102 <a name="section-Functions"></a>
103 <table class="summary" border="1" cellpadding="3"
104 cellspacing="0" width="100%" bgcolor="white">
105 <tr bgcolor="#70b0f0" class="table-header">
106 <td colspan="2" class="table-header">
107 <table border="0" cellpadding="0" cellspacing="0" width="100%">
109 <td align="left"><span class="table-header">Functions</span></td>
110 <td align="right" valign="top"
111 ><span class="options">[<a href="#section-Functions"
112 class="privatelink" onclick="toggle_private();"
113 >hide private</a>]</span></td>
119 <td width="15%" align="right" valign="top" class="summary">
120 <span class="summary-type"> </span>
121 </td><td class="summary">
122 <table width="100%" cellpadding="0" cellspacing="0" border="0">
124 <td><span class="summary-sig"><a name="_find_tag"></a><span class="summary-sig-name">_find_tag</span>(<span class="summary-sig-arg">tree</span>,
125 <span class="summary-sig-arg">tag</span>)</span></td>
126 <td align="right" valign="top">
127 <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#_find_tag">source code</a></span>
136 <td width="15%" align="right" valign="top" class="summary">
137 <span class="summary-type"> </span>
138 </td><td class="summary">
139 <table width="100%" cellpadding="0" cellspacing="0" border="0">
141 <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#document_fromstring" class="summary-sig-name">document_fromstring</a>(<span class="summary-sig-arg">html</span>,
142 <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
143 <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
144 Parse a whole document from a string.</td>
145 <td align="right" valign="top">
146 <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#document_fromstring">source code</a></span>
155 <td width="15%" align="right" valign="top" class="summary">
156 <span class="summary-type"> </span>
157 </td><td class="summary">
158 <table width="100%" cellpadding="0" cellspacing="0" border="0">
160 <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fragments_fromstring" class="summary-sig-name">fragments_fromstring</a>(<span class="summary-sig-arg">html</span>,
161 <span class="summary-sig-arg">no_leading_text</span>=<span class="summary-sig-default">False</span>,
162 <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
163 <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
164 Parses several HTML elements, returning a list of elements.</td>
165 <td align="right" valign="top">
166 <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragments_fromstring">source code</a></span>
175 <td width="15%" align="right" valign="top" class="summary">
176 <span class="summary-type"> </span>
177 </td><td class="summary">
178 <table width="100%" cellpadding="0" cellspacing="0" border="0">
180 <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fragment_fromstring" class="summary-sig-name">fragment_fromstring</a>(<span class="summary-sig-arg">html</span>,
181 <span class="summary-sig-arg">create_parent</span>=<span class="summary-sig-default">False</span>,
182 <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
183 <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
184 Parses a single HTML element; it is an error if there is more than
185 one element, or if anything but whitespace precedes or follows the
187 <td align="right" valign="top">
188 <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragment_fromstring">source code</a></span>
197 <td width="15%" align="right" valign="top" class="summary">
198 <span class="summary-type"> </span>
199 </td><td class="summary">
200 <table width="100%" cellpadding="0" cellspacing="0" border="0">
202 <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fromstring" class="summary-sig-name">fromstring</a>(<span class="summary-sig-arg">html</span>,
203 <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
204 <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
205 Parse the html, returning a single element/document.</td>
206 <td align="right" valign="top">
207 <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fromstring">source code</a></span>
216 <td width="15%" align="right" valign="top" class="summary">
217 <span class="summary-type"> </span>
218 </td><td class="summary">
219 <table width="100%" cellpadding="0" cellspacing="0" border="0">
221 <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#parse" class="summary-sig-name">parse</a>(<span class="summary-sig-arg">filename_url_or_file</span>,
222 <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
223 <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
224 Parse a filename, URL, or file-like object into an HTML document
225 tree. Note: this returns a tree, not an element. Use
226 <tt class="rst-docutils literal"><span class="pre">parse(...).getroot()</span></tt> to get the document root.</td>
227 <td align="right" valign="top">
228 <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#parse">source code</a></span>
237 <td width="15%" align="right" valign="top" class="summary">
238 <span class="summary-type"> </span>
239 </td><td class="summary">
240 <table width="100%" cellpadding="0" cellspacing="0" border="0">
242 <td><span class="summary-sig"><a name="_looks_like_url"></a><span class="summary-sig-name">_looks_like_url</span>(<span class="summary-sig-arg">str</span>)</span></td>
243 <td align="right" valign="top">
244 <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#_looks_like_url">source code</a></span>
253 <!-- ==================== VARIABLES ==================== -->
254 <a name="section-Variables"></a>
255 <table class="summary" border="1" cellpadding="3"
256 cellspacing="0" width="100%" bgcolor="white">
257 <tr bgcolor="#70b0f0" class="table-header">
258 <td colspan="2" class="table-header">
259 <table border="0" cellpadding="0" cellspacing="0" width="100%">
261 <td align="left"><span class="table-header">Variables</span></td>
262 <td align="right" valign="top"
263 ><span class="options">[<a href="#section-Variables"
264 class="privatelink" onclick="toggle_private();"
265 >hide private</a>]</span></td>
271 <td width="15%" align="right" valign="top" class="summary">
272 <span class="summary-type"> </span>
273 </td><td class="summary">
274 <a name="xhtml_parser"></a><span class="summary-name">xhtml_parser</span> = <code title="XHTMLParser()">XHTMLParser()</code>
278 <td width="15%" align="right" valign="top" class="summary">
279 <span class="summary-type"> </span>
280 </td><td class="summary">
281 <a name="html_parser"></a><span class="summary-name">html_parser</span> = <code title="&lt;lxml.html.html5parser.HTMLParser object&gt;">&lt;lxml.html.html5parser.HTMLParser object&gt;</code>
285 <td width="15%" align="right" valign="top" class="summary">
286 <span class="summary-type"> </span>
287 </td><td class="summary">
288 <a name="__package__"></a><span class="summary-name">__package__</span> = <code title="'lxml.html'"><code class="variable-quote">'</code><code class="variable-string">lxml.html</code><code class="variable-quote">'</code></code>
292 <!-- ==================== FUNCTION DETAILS ==================== -->
293 <a name="section-FunctionDetails"></a>
294 <table class="details" border="1" cellpadding="3"
295 cellspacing="0" width="100%" bgcolor="white">
296 <tr bgcolor="#70b0f0" class="table-header">
297 <td colspan="2" class="table-header">
298 <table border="0" cellpadding="0" cellspacing="0" width="100%">
300 <td align="left"><span class="table-header">Function Details</span></td>
301 <td align="right" valign="top"
302 ><span class="options">[<a href="#section-FunctionDetails"
303 class="privatelink" onclick="toggle_private();"
304 >hide private</a>]</span></td>
310 <a name="document_fromstring"></a>
312 <table class="details" border="1" cellpadding="3"
313 cellspacing="0" width="100%" bgcolor="white">
315 <table width="100%" cellpadding="0" cellspacing="0" border="0">
316 <tr valign="top"><td>
317 <h3 class="epydoc"><span class="sig"><span class="sig-name">document_fromstring</span>(<span class="sig-arg">html</span>,
318 <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
319 <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
321 </td><td align="right" valign="top"
322 ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#document_fromstring">source code</a></span>
326 <p>Parse a whole document from a string.</p>
327 <p>If <code class="link">guess_charset</code> is true, or if the input is not Unicode but a
328 byte string, the <code class="link">chardet</code> library will perform charset guessing
334 <a name="fragments_fromstring"></a>
336 <table class="details" border="1" cellpadding="3"
337 cellspacing="0" width="100%" bgcolor="white">
339 <table width="100%" cellpadding="0" cellspacing="0" border="0">
340 <tr valign="top"><td>
341 <h3 class="epydoc"><span class="sig"><span class="sig-name">fragments_fromstring</span>(<span class="sig-arg">html</span>,
342 <span class="sig-arg">no_leading_text</span>=<span class="sig-default">False</span>,
343 <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
344 <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
346 </td><td align="right" valign="top"
347 ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragments_fromstring">source code</a></span>
351 <p>Parses several HTML elements, returning a list of elements.</p>
352 <p>The first item in the list may be a string. If no_leading_text is true,
353 then it will be an error if there is leading text, and it will always be
354 a list of only elements.</p>
355 <p>If <code class="link">guess_charset</code> is true, the <code class="link">chardet</code> library will perform charset
356 guessing on the string.</p>
361 <a name="fragment_fromstring"></a>
363 <table class="details" border="1" cellpadding="3"
364 cellspacing="0" width="100%" bgcolor="white">
366 <table width="100%" cellpadding="0" cellspacing="0" border="0">
367 <tr valign="top"><td>
368 <h3 class="epydoc"><span class="sig"><span class="sig-name">fragment_fromstring</span>(<span class="sig-arg">html</span>,
369 <span class="sig-arg">create_parent</span>=<span class="sig-default">False</span>,
370 <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
371 <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
373 </td><td align="right" valign="top"
374 ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragment_fromstring">source code</a></span>
378 <p>Parses a single HTML element; it is an error if there is more than
379 one element, or if anything but whitespace precedes or follows the
381 <p>If 'create_parent' is true (or is a tag name) then a parent node
382 will be created to encapsulate the HTML in a single element. In
383 this case, leading or trailing text is allowed.</p>
384 <p>If <code class="link">guess_charset</code> is true, the <code class="link">chardet</code> library will perform charset
385 guessing on the string.</p>
390 <a name="fromstring"></a>
392 <table class="details" border="1" cellpadding="3"
393 cellspacing="0" width="100%" bgcolor="white">
395 <table width="100%" cellpadding="0" cellspacing="0" border="0">
396 <tr valign="top"><td>
397 <h3 class="epydoc"><span class="sig"><span class="sig-name">fromstring</span>(<span class="sig-arg">html</span>,
398 <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
399 <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
401 </td><td align="right" valign="top"
402 ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fromstring">source code</a></span>
406 <p>Parse the html, returning a single element/document.</p>
407 <p>This tries to minimally parse the chunk of text, without knowing if it
408 is a fragment or a document.</p>
409 <p>'base_url' will set the document's base_url attribute (and the tree's
411 <p>If <code class="link">guess_charset</code> is true, or if the input is not Unicode but a
412 byte string, the <code class="link">chardet</code> library will perform charset guessing
420 <table class="details" border="1" cellpadding="3"
421 cellspacing="0" width="100%" bgcolor="white">
423 <table width="100%" cellpadding="0" cellspacing="0" border="0">
424 <tr valign="top"><td>
425 <h3 class="epydoc"><span class="sig"><span class="sig-name">parse</span>(<span class="sig-arg">filename_url_or_file</span>,
426 <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
427 <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
429 </td><td align="right" valign="top"
430 ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#parse">source code</a></span>
434 <p>Parse a filename, URL, or file-like object into an HTML document
435 tree. Note: this returns a tree, not an element. Use
436 <tt class="rst-docutils literal"><span class="pre">parse(...).getroot()</span></tt> to get the document root.</p>
437 <p>If <tt class="rst-docutils literal">guess_charset</tt> is true, the <tt class="rst-docutils literal">useChardet</tt> option is passed into
438 html5lib to enable character detection. This option is on by default
439 when parsing from URLs, off by default when parsing from file(-like)
440 objects (which tend to return Unicode more often than not), and on by
441 default when parsing from a file path (which is read in binary mode).</p>
447 <!-- ==================== NAVIGATION BAR ==================== -->
448 <table class="navbar" border="0" width="100%" cellpadding="0"
449 bgcolor="#a0c0ff" cellspacing="0">
452 <th> <a
453 href="lxml-module.html">Home</a> </th>
456 <th> <a
457 href="module-tree.html">Trees</a> </th>
460 <th> <a
461 href="identifier-index.html">Indices</a> </th>
464 <th> <a
465 href="help.html">Help</a> </th>
467 <!-- Project homepage -->
468 <th class="navbar" align="right" width="100%">
469 <table border="0" cellpadding="0" cellspacing="0">
470 <tr><th class="navbar" align="center"
471 ><a class="navbar" target="_top" href="/">lxml API</a></th>
475 <table border="0" cellpadding="0" cellspacing="0" width="100%">
477 <td align="left" class="footer">
478 Generated by Epydoc 3.0.1
479 on Wed Jan 29 12:26:21 2020
481 <td align="right" class="footer">
482 <a target="mainFrame" href="http://epydoc.sourceforge.net"
483 >http://epydoc.sourceforge.net</a>
488 <script type="text/javascript">
490 // Private objects are initially displayed (because if
491 // javascript is turned off then we want them to be
492 // visible); but by default, we want to hide them. So hide
493 // them unless we have a cookie that says to show them.