1 <?xml version="1.0" encoding="ascii"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
6 <title>lxml.etree.HTMLParser</title>
7 <link rel="stylesheet" href="epydoc.css" type="text/css" />
8 <script type="text/javascript" src="epydoc.js"></script>
11 <body bgcolor="white" text="black" link="blue" vlink="#204080"
13 <!-- ==================== NAVIGATION BAR ==================== -->
14 <table class="navbar" border="0" width="100%" cellpadding="0"
15 bgcolor="#a0c0ff" cellspacing="0">
18 <th> <a
19 href="lxml-module.html">Home</a> </th>
22 <th> <a
23 href="module-tree.html">Trees</a> </th>
26 <th> <a
27 href="identifier-index.html">Indices</a> </th>
30 <th> <a
31 href="help.html">Help</a> </th>
33 <!-- Project homepage -->
34 <th class="navbar" align="right" width="100%">
35 <table border="0" cellpadding="0" cellspacing="0">
36 <tr><th class="navbar" align="center"
37 ><a class="navbar" target="_top" href="/">lxml API</a></th>
41 <table width="100%" cellpadding="0" cellspacing="0">
44 <span class="breadcrumbs">
45 <a href="lxml-module.html">Package lxml</a> ::
46 <a href="lxml.etree-module.html">Module etree</a> ::
51 <table cellpadding="0" cellspacing="0">
52 <!-- hide/show private -->
53 <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
54 onclick="toggle_private();">hide private</a>]</span></td></tr>
55 <tr><td align="right"><span class="options"
56 >[<a href="frames.html" target="_top">frames</a
57 >] | <a href="lxml.etree.HTMLParser-class.html"
58 target="_top">no frames</a>]</span></td></tr>
63 <!-- ==================== CLASS DESCRIPTION ==================== -->
64 <h1 class="epydoc">Class HTMLParser</h1><p class="nomargin-top"></p>
65 <pre class="base-tree">
70 <a href="lxml.etree._FeedParser-class.html" onclick="show_private();">_FeedParser</a> --+
72 <strong class="uidshort">HTMLParser</strong>
75 <dl><dt>Known Subclasses:</dt>
77 <ul class="subclass-list">
78 <li><a href="lxml.html.HTMLParser-class.html">html.HTMLParser</a></li><li class="private">, <a href="lxml.etree.HTMLPullParser-class.html" onclick="show_private();">HTMLPullParser</a></li> </ul>
82 <p>HTMLParser(self, encoding=None, remove_blank_text=False, remove_comments=False, remove_pis=False, strip_cdata=True, no_network=True, target=None, schema: XMLSchema =None, recover=True, compact=True, collect_ids=True, huge_tree=False)</p>
83 <p>The HTML parser.</p>
84 <p>This parser allows reading HTML into a normal XML tree. By
85 default, it can read broken (non well-formed) HTML, depending on
86 the capabilities of libxml2. Use the 'recover' option to switch
88 <p>Available boolean keyword arguments:</p>
89 <ul class="rst-simple">
90 <li>recover - try hard to parse through broken HTML (default: True)</li>
91 <li>no_network - prevent network access for related files (default: True)</li>
92 <li>remove_blank_text - discard empty text nodes that are ignorable (i.e. not actual text content)</li>
93 <li>remove_comments - discard comments</li>
94 <li>remove_pis - discard processing instructions</li>
95 <li>strip_cdata - replace CDATA sections by normal text content (default: True)</li>
96 <li>compact - save memory for short text content (default: True)</li>
97 <li>default_doctype - add a default doctype even if it is not found in the HTML (default: True)</li>
98 <li>collect_ids - use a hash table of XML IDs for fast access (default: True)</li>
99 <li><dl class="rst-first rst-docutils">
100 <dt>huge_tree - disable security restrictions and support very deep trees
101 and very long text content (only affects libxml2 2.7+)</dt>
105 <p>Other keyword arguments:</p>
106 <ul class="rst-simple">
107 <li>encoding - override the document encoding</li>
108 <li>target - a parser target object that will receive the parse events</li>
109 <li>schema - an XMLSchema to validate against</li>
111 <p>Note that you should avoid sharing parsers between threads for performance
114 <!-- ==================== INSTANCE METHODS ==================== -->
115 <a name="section-InstanceMethods"></a>
116 <table class="summary" border="1" cellpadding="3"
117 cellspacing="0" width="100%" bgcolor="white">
118 <tr bgcolor="#70b0f0" class="table-header">
119 <td colspan="2" class="table-header">
120 <table border="0" cellpadding="0" cellspacing="0" width="100%">
122 <td align="left"><span class="table-header">Instance Methods</span></td>
123 <td align="right" valign="top"
124 ><span class="options">[<a href="#section-InstanceMethods"
125 class="privatelink" onclick="toggle_private();"
126 >hide private</a>]</span></td>
132 <td width="15%" align="right" valign="top" class="summary">
133 <span class="summary-type"> </span>
134 </td><td class="summary">
135 <table width="100%" cellpadding="0" cellspacing="0" border="0">
137 <td><span class="summary-sig"><a href="lxml.etree.HTMLParser-class.html#__init__" class="summary-sig-name">__init__</a>(<span class="summary-sig-arg">self</span>,
138 <span class="summary-sig-arg">encoding</span>=<span class="summary-sig-default">None</span>,
139 <span class="summary-sig-arg">remove_blank_text</span>=<span class="summary-sig-default">False</span>,
140 <span class="summary-sig-arg">remove_comments</span>=<span class="summary-sig-default">False</span>,
141 <span class="summary-sig-arg">remove_pis</span>=<span class="summary-sig-default">False</span>,
142 <span class="summary-sig-arg">strip_cdata</span>=<span class="summary-sig-default">True</span>,
143 <span class="summary-sig-arg">no_network</span>=<span class="summary-sig-default">True</span>,
144 <span class="summary-sig-arg">target</span>=<span class="summary-sig-default">None</span>,
145 <span class="summary-sig-arg">schema: XMLSchema</span>=<span class="summary-sig-default">None</span>,
146 <span class="summary-sig-arg">recover</span>=<span class="summary-sig-default">True</span>,
147 <span class="summary-sig-arg">compact</span>=<span class="summary-sig-default">True</span>,
148 <span class="summary-sig-arg">collect_ids</span>=<span class="summary-sig-default">True</span>,
149 <span class="summary-sig-arg">huge_tree</span>=<span class="summary-sig-default">False</span>)</span><br />
150 x.__init__(...) initializes x; see help(type(x)) for signature</td>
151 <td align="right" valign="top">
161 <td width="15%" align="right" valign="top" class="summary">
162 <span class="summary-type">a new object with type S, a subtype of T</span>
163 </td><td class="summary">
164 <table width="100%" cellpadding="0" cellspacing="0" border="0">
166 <td><span class="summary-sig"><a href="lxml.etree.HTMLParser-class.html#__new__" class="summary-sig-name">__new__</a>(<span class="summary-sig-arg">T</span>,
167 <span class="summary-sig-arg">S</span>,
168 <span class="summary-sig-arg">...</span>)</span></td>
169 <td align="right" valign="top">
179 <td colspan="2" class="summary">
180 <p class="indent-wrapped-lines"><b>Inherited from <code><a href="lxml.etree._FeedParser-class.html" onclick="show_private();">_FeedParser</a></code></b>:
181 <code><a href="lxml.etree._FeedParser-class.html#close">close</a></code>,
182 <code><a href="lxml.etree._FeedParser-class.html#feed">feed</a></code>
184 <p class="indent-wrapped-lines"><b>Inherited from <code><i>unreachable</i>._BaseParser</code></b>:
186 <code>makeelement</code>,
187 <code>setElementClassLookup</code>,
188 <code>set_element_class_lookup</code>
190 <p class="indent-wrapped-lines"><b>Inherited from <code>object</code></b>:
191 <code>__delattr__</code>,
192 <code>__format__</code>,
193 <code>__getattribute__</code>,
194 <code>__hash__</code>,
195 <code>__reduce__</code>,
196 <code>__reduce_ex__</code>,
197 <code>__repr__</code>,
198 <code>__setattr__</code>,
199 <code>__sizeof__</code>,
200 <code>__str__</code>,
201 <code>__subclasshook__</code>
206 <!-- ==================== PROPERTIES ==================== -->
207 <a name="section-Properties"></a>
208 <table class="summary" border="1" cellpadding="3"
209 cellspacing="0" width="100%" bgcolor="white">
210 <tr bgcolor="#70b0f0" class="table-header">
211 <td colspan="2" class="table-header">
212 <table border="0" cellpadding="0" cellspacing="0" width="100%">
214 <td align="left"><span class="table-header">Properties</span></td>
215 <td align="right" valign="top"
216 ><span class="options">[<a href="#section-Properties"
217 class="privatelink" onclick="toggle_private();"
218 >hide private</a>]</span></td>
224 <td colspan="2" class="summary">
225 <p class="indent-wrapped-lines"><b>Inherited from <code><a href="lxml.etree._FeedParser-class.html" onclick="show_private();">_FeedParser</a></code></b>:
226 <code><a href="lxml.etree._FeedParser-class.html#feed_error_log">feed_error_log</a></code>
228 <p class="indent-wrapped-lines"><b>Inherited from <code><i>unreachable</i>._BaseParser</code></b>:
229 <code>error_log</code>,
230 <code>resolvers</code>,
234 <p class="indent-wrapped-lines"><b>Inherited from <code>object</code></b>:
235 <code>__class__</code>
240 <!-- ==================== METHOD DETAILS ==================== -->
241 <a name="section-MethodDetails"></a>
242 <table class="details" border="1" cellpadding="3"
243 cellspacing="0" width="100%" bgcolor="white">
244 <tr bgcolor="#70b0f0" class="table-header">
245 <td colspan="2" class="table-header">
246 <table border="0" cellpadding="0" cellspacing="0" width="100%">
248 <td align="left"><span class="table-header">Method Details</span></td>
249 <td align="right" valign="top"
250 ><span class="options">[<a href="#section-MethodDetails"
251 class="privatelink" onclick="toggle_private();"
252 >hide private</a>]</span></td>
258 <a name="__init__"></a>
260 <table class="details" border="1" cellpadding="3"
261 cellspacing="0" width="100%" bgcolor="white">
263 <table width="100%" cellpadding="0" cellspacing="0" border="0">
264 <tr valign="top"><td>
265 <h3 class="epydoc"><span class="sig"><span class="sig-name">__init__</span>(<span class="sig-arg">self</span>,
266 <span class="sig-arg">encoding</span>=<span class="sig-default">None</span>,
267 <span class="sig-arg">remove_blank_text</span>=<span class="sig-default">False</span>,
268 <span class="sig-arg">remove_comments</span>=<span class="sig-default">False</span>,
269 <span class="sig-arg">remove_pis</span>=<span class="sig-default">False</span>,
270 <span class="sig-arg">strip_cdata</span>=<span class="sig-default">True</span>,
271 <span class="sig-arg">no_network</span>=<span class="sig-default">True</span>,
272 <span class="sig-arg">target</span>=<span class="sig-default">None</span>,
273 <span class="sig-arg">schema: XMLSchema</span>=<span class="sig-default">None</span>,
274 <span class="sig-arg">recover</span>=<span class="sig-default">True</span>,
275 <span class="sig-arg">compact</span>=<span class="sig-default">True</span>,
276 <span class="sig-arg">collect_ids</span>=<span class="sig-default">True</span>,
277 <span class="sig-arg">huge_tree</span>=<span class="sig-default">False</span>)</span>
278 <br /><em class="fname">(Constructor)</em>
280 </td><td align="right" valign="top"
285 x.__init__(...) initializes x; see help(type(x)) for signature
293 <a name="__new__"></a>
295 <table class="details" border="1" cellpadding="3"
296 cellspacing="0" width="100%" bgcolor="white">
298 <table width="100%" cellpadding="0" cellspacing="0" border="0">
299 <tr valign="top"><td>
300 <h3 class="epydoc"><span class="sig"><span class="sig-name">__new__</span>(<span class="sig-arg">T</span>,
301 <span class="sig-arg">S</span>,
302 <span class="sig-arg">...</span>)</span>
304 </td><td align="right" valign="top"
311 <dt>Returns: a new object with type S, a subtype of T</dt>
319 <!-- ==================== NAVIGATION BAR ==================== -->
320 <table class="navbar" border="0" width="100%" cellpadding="0"
321 bgcolor="#a0c0ff" cellspacing="0">
324 <th> <a
325 href="lxml-module.html">Home</a> </th>
328 <th> <a
329 href="module-tree.html">Trees</a> </th>
332 <th> <a
333 href="identifier-index.html">Indices</a> </th>
336 <th> <a
337 href="help.html">Help</a> </th>
339 <!-- Project homepage -->
340 <th class="navbar" align="right" width="100%">
341 <table border="0" cellpadding="0" cellspacing="0">
342 <tr><th class="navbar" align="center"
343 ><a class="navbar" target="_top" href="/">lxml API</a></th>
347 <table border="0" cellpadding="0" cellspacing="0" width="100%">
349 <td align="left" class="footer">
350 Generated by Epydoc 3.0.1
351 on Thu Jul 9 18:29:53 2020
353 <td align="right" class="footer">
354 <a target="mainFrame" href="http://epydoc.sourceforge.net"
355 >http://epydoc.sourceforge.net</a>
360 <script type="text/javascript">
362 // Private objects are initially displayed (because if
363 // javascript is turned off then we want them to be
364 // visible); but by default, we want to hide them. So hide
365 // them unless we have a cookie that says to show them.