Imported Upstream version 4.5.2
[platform/upstream/python-lxml.git] / doc / html / api / lxml.html.html5parser-module.html
1 <?xml version="1.0" encoding="ascii"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3           "DTD/xhtml1-transitional.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5 <head>
6   <title>lxml.html.html5parser</title>
7   <link rel="stylesheet" href="epydoc.css" type="text/css" />
8   <script type="text/javascript" src="epydoc.js"></script>
9 </head>
10
11 <body bgcolor="white" text="black" link="blue" vlink="#204080"
12       alink="#204080">
13 <!-- ==================== NAVIGATION BAR ==================== -->
14 <table class="navbar" border="0" width="100%" cellpadding="0"
15        bgcolor="#a0c0ff" cellspacing="0">
16   <tr valign="middle">
17   <!-- Home link -->
18       <th>&nbsp;&nbsp;&nbsp;<a
19         href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
20
21   <!-- Tree link -->
22       <th>&nbsp;&nbsp;&nbsp;<a
23         href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
24
25   <!-- Index link -->
26       <th>&nbsp;&nbsp;&nbsp;<a
27         href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
28
29   <!-- Help link -->
30       <th>&nbsp;&nbsp;&nbsp;<a
31         href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
32
33   <!-- Project homepage -->
34       <th class="navbar" align="right" width="100%">
35         <table border="0" cellpadding="0" cellspacing="0">
36           <tr><th class="navbar" align="center"
37             ><a class="navbar" target="_top" href="/">lxml API</a></th>
38           </tr></table></th>
39   </tr>
40 </table>
41 <table width="100%" cellpadding="0" cellspacing="0">
42   <tr valign="top">
43     <td width="100%">
44       <span class="breadcrumbs">
45         <a href="lxml-module.html">Package&nbsp;lxml</a> ::
46         <a href="lxml.html-module.html">Package&nbsp;html</a> ::
47         Module&nbsp;html5parser
48       </span>
49     </td>
50     <td>
51       <table cellpadding="0" cellspacing="0">
52         <!-- hide/show private -->
53         <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
54     onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
55         <tr><td align="right"><span class="options"
56             >[<a href="frames.html" target="_top">frames</a
57             >]&nbsp;|&nbsp;<a href="lxml.html.html5parser-module.html"
58             target="_top">no&nbsp;frames</a>]</span></td></tr>
59       </table>
60     </td>
61   </tr>
62 </table>
63 <!-- ==================== MODULE DESCRIPTION ==================== -->
64 <h1 class="epydoc">Module html5parser</h1><p class="nomargin-top"><span class="codelink"><a href="lxml.html.html5parser-pysrc.html">source&nbsp;code</a></span></p>
65 An interface to html5lib that mimics the lxml.html interface.
66
67 <!-- ==================== CLASSES ==================== -->
68 <a name="section-Classes"></a>
69 <table class="summary" border="1" cellpadding="3"
70        cellspacing="0" width="100%" bgcolor="white">
71 <tr bgcolor="#70b0f0" class="table-header">
72   <td colspan="2" class="table-header">
73     <table border="0" cellpadding="0" cellspacing="0" width="100%">
74       <tr valign="top">
75         <td align="left"><span class="table-header">Classes</span></td>
76         <td align="right" valign="top"
77          ><span class="options">[<a href="#section-Classes"
78          class="privatelink" onclick="toggle_private();"
79          >hide private</a>]</span></td>
80       </tr>
81     </table>
82   </td>
83 </tr>
84 <tr>
85     <td width="15%" align="right" valign="top" class="summary">
86       <span class="summary-type">&nbsp;</span>
87     </td><td class="summary">
88         <a href="lxml.html.html5parser.HTMLParser-class.html" class="summary-name">HTMLParser</a><br />
89       An html5lib HTML parser with lxml as tree.
90     </td>
91   </tr>
92 <tr>
93     <td width="15%" align="right" valign="top" class="summary">
94       <span class="summary-type">&nbsp;</span>
95     </td><td class="summary">
96         <a href="lxml.html.html5parser.XHTMLParser-class.html" class="summary-name">XHTMLParser</a><br />
97       An html5lib XHTML parser with lxml as tree.
98     </td>
99   </tr>
100 </table>
101 <!-- ==================== FUNCTIONS ==================== -->
102 <a name="section-Functions"></a>
103 <table class="summary" border="1" cellpadding="3"
104        cellspacing="0" width="100%" bgcolor="white">
105 <tr bgcolor="#70b0f0" class="table-header">
106   <td colspan="2" class="table-header">
107     <table border="0" cellpadding="0" cellspacing="0" width="100%">
108       <tr valign="top">
109         <td align="left"><span class="table-header">Functions</span></td>
110         <td align="right" valign="top"
111          ><span class="options">[<a href="#section-Functions"
112          class="privatelink" onclick="toggle_private();"
113          >hide private</a>]</span></td>
114       </tr>
115     </table>
116   </td>
117 </tr>
118 <tr class="private">
119     <td width="15%" align="right" valign="top" class="summary">
120       <span class="summary-type">&nbsp;</span>
121     </td><td class="summary">
122       <table width="100%" cellpadding="0" cellspacing="0" border="0">
123         <tr>
124           <td><span class="summary-sig"><a name="_find_tag"></a><span class="summary-sig-name">_find_tag</span>(<span class="summary-sig-arg">tree</span>,
125         <span class="summary-sig-arg">tag</span>)</span></td>
126           <td align="right" valign="top">
127             <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#_find_tag">source&nbsp;code</a></span>
128             
129           </td>
130         </tr>
131       </table>
132       
133     </td>
134   </tr>
135 <tr>
136     <td width="15%" align="right" valign="top" class="summary">
137       <span class="summary-type">&nbsp;</span>
138     </td><td class="summary">
139       <table width="100%" cellpadding="0" cellspacing="0" border="0">
140         <tr>
141           <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#document_fromstring" class="summary-sig-name">document_fromstring</a>(<span class="summary-sig-arg">html</span>,
142         <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
143         <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
144       Parse a whole document from a string.</td>
145           <td align="right" valign="top">
146             <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#document_fromstring">source&nbsp;code</a></span>
147             
148           </td>
149         </tr>
150       </table>
151       
152     </td>
153   </tr>
154 <tr>
155     <td width="15%" align="right" valign="top" class="summary">
156       <span class="summary-type">&nbsp;</span>
157     </td><td class="summary">
158       <table width="100%" cellpadding="0" cellspacing="0" border="0">
159         <tr>
160           <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fragments_fromstring" class="summary-sig-name">fragments_fromstring</a>(<span class="summary-sig-arg">html</span>,
161         <span class="summary-sig-arg">no_leading_text</span>=<span class="summary-sig-default">False</span>,
162         <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
163         <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
164       Parses several HTML elements, returning a list of elements.</td>
165           <td align="right" valign="top">
166             <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragments_fromstring">source&nbsp;code</a></span>
167             
168           </td>
169         </tr>
170       </table>
171       
172     </td>
173   </tr>
174 <tr>
175     <td width="15%" align="right" valign="top" class="summary">
176       <span class="summary-type">&nbsp;</span>
177     </td><td class="summary">
178       <table width="100%" cellpadding="0" cellspacing="0" border="0">
179         <tr>
180           <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fragment_fromstring" class="summary-sig-name">fragment_fromstring</a>(<span class="summary-sig-arg">html</span>,
181         <span class="summary-sig-arg">create_parent</span>=<span class="summary-sig-default">False</span>,
182         <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
183         <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
184       Parses a single HTML element; it is an error if there is more than
185 one element, or if anything but whitespace precedes or follows the
186 element.</td>
187           <td align="right" valign="top">
188             <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragment_fromstring">source&nbsp;code</a></span>
189             
190           </td>
191         </tr>
192       </table>
193       
194     </td>
195   </tr>
196 <tr>
197     <td width="15%" align="right" valign="top" class="summary">
198       <span class="summary-type">&nbsp;</span>
199     </td><td class="summary">
200       <table width="100%" cellpadding="0" cellspacing="0" border="0">
201         <tr>
202           <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#fromstring" class="summary-sig-name">fromstring</a>(<span class="summary-sig-arg">html</span>,
203         <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
204         <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
205       Parse the html, returning a single element/document.</td>
206           <td align="right" valign="top">
207             <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fromstring">source&nbsp;code</a></span>
208             
209           </td>
210         </tr>
211       </table>
212       
213     </td>
214   </tr>
215 <tr>
216     <td width="15%" align="right" valign="top" class="summary">
217       <span class="summary-type">&nbsp;</span>
218     </td><td class="summary">
219       <table width="100%" cellpadding="0" cellspacing="0" border="0">
220         <tr>
221           <td><span class="summary-sig"><a href="lxml.html.html5parser-module.html#parse" class="summary-sig-name">parse</a>(<span class="summary-sig-arg">filename_url_or_file</span>,
222         <span class="summary-sig-arg">guess_charset</span>=<span class="summary-sig-default">None</span>,
223         <span class="summary-sig-arg">parser</span>=<span class="summary-sig-default">None</span>)</span><br />
224       Parse a filename, URL, or file-like object into an HTML document
225 tree.  Note: this returns a tree, not an element.  Use
226 <tt class="rst-docutils literal"><span class="pre">parse(...).getroot()</span></tt> to get the document root.</td>
227           <td align="right" valign="top">
228             <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#parse">source&nbsp;code</a></span>
229             
230           </td>
231         </tr>
232       </table>
233       
234     </td>
235   </tr>
236 <tr class="private">
237     <td width="15%" align="right" valign="top" class="summary">
238       <span class="summary-type">&nbsp;</span>
239     </td><td class="summary">
240       <table width="100%" cellpadding="0" cellspacing="0" border="0">
241         <tr>
242           <td><span class="summary-sig"><a name="_looks_like_url"></a><span class="summary-sig-name">_looks_like_url</span>(<span class="summary-sig-arg">str</span>)</span></td>
243           <td align="right" valign="top">
244             <span class="codelink"><a href="lxml.html.html5parser-pysrc.html#_looks_like_url">source&nbsp;code</a></span>
245             
246           </td>
247         </tr>
248       </table>
249       
250     </td>
251   </tr>
252 </table>
253 <!-- ==================== VARIABLES ==================== -->
254 <a name="section-Variables"></a>
255 <table class="summary" border="1" cellpadding="3"
256        cellspacing="0" width="100%" bgcolor="white">
257 <tr bgcolor="#70b0f0" class="table-header">
258   <td colspan="2" class="table-header">
259     <table border="0" cellpadding="0" cellspacing="0" width="100%">
260       <tr valign="top">
261         <td align="left"><span class="table-header">Variables</span></td>
262         <td align="right" valign="top"
263          ><span class="options">[<a href="#section-Variables"
264          class="privatelink" onclick="toggle_private();"
265          >hide private</a>]</span></td>
266       </tr>
267     </table>
268   </td>
269 </tr>
270 <tr>
271     <td width="15%" align="right" valign="top" class="summary">
272       <span class="summary-type">&nbsp;</span>
273     </td><td class="summary">
274         <a name="xhtml_parser"></a><span class="summary-name">xhtml_parser</span> = <code title="XHTMLParser()">XHTMLParser()</code>
275     </td>
276   </tr>
277 <tr>
278     <td width="15%" align="right" valign="top" class="summary">
279       <span class="summary-type">&nbsp;</span>
280     </td><td class="summary">
281         <a name="html_parser"></a><span class="summary-name">html_parser</span> = <code title="&lt;lxml.html.html5parser.HTMLParser object&gt;">&lt;lxml.html.html5parser.HTMLParser object&gt;</code>
282     </td>
283   </tr>
284 <tr>
285     <td width="15%" align="right" valign="top" class="summary">
286       <span class="summary-type">&nbsp;</span>
287     </td><td class="summary">
288         <a name="__package__"></a><span class="summary-name">__package__</span> = <code title="'lxml.html'"><code class="variable-quote">'</code><code class="variable-string">lxml.html</code><code class="variable-quote">'</code></code>
289     </td>
290   </tr>
291 </table>
292 <!-- ==================== FUNCTION DETAILS ==================== -->
293 <a name="section-FunctionDetails"></a>
294 <table class="details" border="1" cellpadding="3"
295        cellspacing="0" width="100%" bgcolor="white">
296 <tr bgcolor="#70b0f0" class="table-header">
297   <td colspan="2" class="table-header">
298     <table border="0" cellpadding="0" cellspacing="0" width="100%">
299       <tr valign="top">
300         <td align="left"><span class="table-header">Function Details</span></td>
301         <td align="right" valign="top"
302          ><span class="options">[<a href="#section-FunctionDetails"
303          class="privatelink" onclick="toggle_private();"
304          >hide private</a>]</span></td>
305       </tr>
306     </table>
307   </td>
308 </tr>
309 </table>
310 <a name="document_fromstring"></a>
311 <div>
312 <table class="details" border="1" cellpadding="3"
313        cellspacing="0" width="100%" bgcolor="white">
314 <tr><td>
315   <table width="100%" cellpadding="0" cellspacing="0" border="0">
316   <tr valign="top"><td>
317   <h3 class="epydoc"><span class="sig"><span class="sig-name">document_fromstring</span>(<span class="sig-arg">html</span>,
318         <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
319         <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
320   </h3>
321   </td><td align="right" valign="top"
322     ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#document_fromstring">source&nbsp;code</a></span>&nbsp;
323     </td>
324   </tr></table>
325   
326   <p>Parse a whole document from a string.</p>
327 <p>If <code class="link">guess_charset</code> is true, or if the input is not Unicode but a
328 byte string, the <code class="link">chardet</code> library will perform charset guessing
329 on the string.</p>
330   <dl class="fields">
331   </dl>
332 </td></tr></table>
333 </div>
334 <a name="fragments_fromstring"></a>
335 <div>
336 <table class="details" border="1" cellpadding="3"
337        cellspacing="0" width="100%" bgcolor="white">
338 <tr><td>
339   <table width="100%" cellpadding="0" cellspacing="0" border="0">
340   <tr valign="top"><td>
341   <h3 class="epydoc"><span class="sig"><span class="sig-name">fragments_fromstring</span>(<span class="sig-arg">html</span>,
342         <span class="sig-arg">no_leading_text</span>=<span class="sig-default">False</span>,
343         <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
344         <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
345   </h3>
346   </td><td align="right" valign="top"
347     ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragments_fromstring">source&nbsp;code</a></span>&nbsp;
348     </td>
349   </tr></table>
350   
351   <p>Parses several HTML elements, returning a list of elements.</p>
352 <p>The first item in the list may be a string.  If no_leading_text is true,
353 then it will be an error if there is leading text, and it will always be
354 a list of only elements.</p>
355 <p>If <code class="link">guess_charset</code> is true, the <code class="link">chardet</code> library will perform charset
356 guessing on the string.</p>
357   <dl class="fields">
358   </dl>
359 </td></tr></table>
360 </div>
361 <a name="fragment_fromstring"></a>
362 <div>
363 <table class="details" border="1" cellpadding="3"
364        cellspacing="0" width="100%" bgcolor="white">
365 <tr><td>
366   <table width="100%" cellpadding="0" cellspacing="0" border="0">
367   <tr valign="top"><td>
368   <h3 class="epydoc"><span class="sig"><span class="sig-name">fragment_fromstring</span>(<span class="sig-arg">html</span>,
369         <span class="sig-arg">create_parent</span>=<span class="sig-default">False</span>,
370         <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
371         <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
372   </h3>
373   </td><td align="right" valign="top"
374     ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fragment_fromstring">source&nbsp;code</a></span>&nbsp;
375     </td>
376   </tr></table>
377   
378   <p>Parses a single HTML element; it is an error if there is more than
379 one element, or if anything but whitespace precedes or follows the
380 element.</p>
381 <p>If 'create_parent' is true (or is a tag name) then a parent node
382 will be created to encapsulate the HTML in a single element.  In
383 this case, leading or trailing text is allowed.</p>
384 <p>If <code class="link">guess_charset</code> is true, the <code class="link">chardet</code> library will perform charset
385 guessing on the string.</p>
386   <dl class="fields">
387   </dl>
388 </td></tr></table>
389 </div>
390 <a name="fromstring"></a>
391 <div>
392 <table class="details" border="1" cellpadding="3"
393        cellspacing="0" width="100%" bgcolor="white">
394 <tr><td>
395   <table width="100%" cellpadding="0" cellspacing="0" border="0">
396   <tr valign="top"><td>
397   <h3 class="epydoc"><span class="sig"><span class="sig-name">fromstring</span>(<span class="sig-arg">html</span>,
398         <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
399         <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
400   </h3>
401   </td><td align="right" valign="top"
402     ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#fromstring">source&nbsp;code</a></span>&nbsp;
403     </td>
404   </tr></table>
405   
406   <p>Parse the html, returning a single element/document.</p>
407 <p>This tries to minimally parse the chunk of text, without knowing if it
408 is a fragment or a document.</p>
409 <p>'base_url' will set the document's base_url attribute (and the tree's
410 docinfo.URL)</p>
411 <p>If <code class="link">guess_charset</code> is true, or if the input is not Unicode but a
412 byte string, the <code class="link">chardet</code> library will perform charset guessing
413 on the string.</p>
414   <dl class="fields">
415   </dl>
416 </td></tr></table>
417 </div>
418 <a name="parse"></a>
419 <div>
420 <table class="details" border="1" cellpadding="3"
421        cellspacing="0" width="100%" bgcolor="white">
422 <tr><td>
423   <table width="100%" cellpadding="0" cellspacing="0" border="0">
424   <tr valign="top"><td>
425   <h3 class="epydoc"><span class="sig"><span class="sig-name">parse</span>(<span class="sig-arg">filename_url_or_file</span>,
426         <span class="sig-arg">guess_charset</span>=<span class="sig-default">None</span>,
427         <span class="sig-arg">parser</span>=<span class="sig-default">None</span>)</span>
428   </h3>
429   </td><td align="right" valign="top"
430     ><span class="codelink"><a href="lxml.html.html5parser-pysrc.html#parse">source&nbsp;code</a></span>&nbsp;
431     </td>
432   </tr></table>
433   
434   <p>Parse a filename, URL, or file-like object into an HTML document
435 tree.  Note: this returns a tree, not an element.  Use
436 <tt class="rst-docutils literal"><span class="pre">parse(...).getroot()</span></tt> to get the document root.</p>
437 <p>If <tt class="rst-docutils literal">guess_charset</tt> is true, the <tt class="rst-docutils literal">useChardet</tt> option is passed into
438 html5lib to enable character detection.  This option is on by default
439 when parsing from URLs, off by default when parsing from file(-like)
440 objects (which tend to return Unicode more often than not), and on by
441 default when parsing from a file path (which is read in binary mode).</p>
442   <dl class="fields">
443   </dl>
444 </td></tr></table>
445 </div>
446 <br />
447 <!-- ==================== NAVIGATION BAR ==================== -->
448 <table class="navbar" border="0" width="100%" cellpadding="0"
449        bgcolor="#a0c0ff" cellspacing="0">
450   <tr valign="middle">
451   <!-- Home link -->
452       <th>&nbsp;&nbsp;&nbsp;<a
453         href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
454
455   <!-- Tree link -->
456       <th>&nbsp;&nbsp;&nbsp;<a
457         href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
458
459   <!-- Index link -->
460       <th>&nbsp;&nbsp;&nbsp;<a
461         href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
462
463   <!-- Help link -->
464       <th>&nbsp;&nbsp;&nbsp;<a
465         href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
466
467   <!-- Project homepage -->
468       <th class="navbar" align="right" width="100%">
469         <table border="0" cellpadding="0" cellspacing="0">
470           <tr><th class="navbar" align="center"
471             ><a class="navbar" target="_top" href="/">lxml API</a></th>
472           </tr></table></th>
473   </tr>
474 </table>
475 <table border="0" cellpadding="0" cellspacing="0" width="100%">
476   <tr>
477     <td align="left" class="footer">
478     Generated by Epydoc 3.0.1
479     on Thu Jul  9 18:29:53 2020
480     </td>
481     <td align="right" class="footer">
482       <a target="mainFrame" href="http://epydoc.sourceforge.net"
483         >http://epydoc.sourceforge.net</a>
484     </td>
485   </tr>
486 </table>
487
488 <script type="text/javascript">
489   <!--
490   // Private objects are initially displayed (because if
491   // javascript is turned off then we want them to be
492   // visible); but by default, we want to hide them.  So hide
493   // them unless we have a cookie that says to show them.
494   checkCookie();
495   // -->
496 </script>
497 </body>
498 </html>