Imported Upstream version 4.6.0
[platform/upstream/python-lxml.git] / doc / html / apidoc / lxml.html.clean.html
1
2
3 <!DOCTYPE html>
4 <html class="writer-html5" lang="en" >
5 <head>
6   <meta charset="utf-8">
7   
8   <meta name="viewport" content="width=device-width, initial-scale=1.0">
9   
10   <title>lxml.html.clean module &mdash; lxml  documentation</title>
11   
12
13   
14   <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
15   <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
16
17   
18   
19   
20   
21
22   
23   <!--[if lt IE 9]>
24     <script src="_static/js/html5shiv.min.js"></script>
25   <![endif]-->
26   
27     
28       <script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
29         <script src="_static/jquery.js"></script>
30         <script src="_static/underscore.js"></script>
31         <script src="_static/doctools.js"></script>
32         <script src="_static/language_data.js"></script>
33     
34     <script type="text/javascript" src="_static/js/theme.js"></script>
35
36     
37     <link rel="index" title="Index" href="genindex.html" />
38     <link rel="search" title="Search" href="search.html" />
39     <link rel="next" title="lxml.html.defs module" href="lxml.html.defs.html" />
40     <link rel="prev" title="lxml.html.builder module" href="lxml.html.builder.html" /> 
41 </head>
42
43 <body class="wy-body-for-nav">
44
45    
46   <div class="wy-grid-for-nav">
47     
48     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
49       <div class="wy-side-scroll">
50         <div class="wy-side-nav-search" >
51           
52
53           
54             <a href="index.html" class="icon icon-home" title="Documentation Home"> lxml
55           
56
57           
58             
59             <img src="_static/python-xml.png" class="logo" alt="Logo"/>
60           
61           </a>
62
63           
64             
65             
66               <div class="version">
67                 4.6.0
68               </div>
69             
70           
71
72           
73 <div role="search">
74   <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
75     <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
76     <input type="hidden" name="check_keywords" value="yes" />
77     <input type="hidden" name="area" value="default" />
78   </form>
79 </div>
80
81           
82         </div>
83
84         
85         <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
86           
87             
88             
89               
90             
91             
92               <ul class="current">
93 <li class="toctree-l1 current"><a class="reference internal" href="lxml.html">lxml package</a><ul class="current">
94 <li class="toctree-l2 current"><a class="reference internal" href="lxml.html.html">lxml.html package</a><ul class="current">
95 <li class="toctree-l3"><a class="reference internal" href="lxml.html.ElementSoup.html">lxml.html.ElementSoup module</a></li>
96 <li class="toctree-l3"><a class="reference internal" href="lxml.html._diffcommand.html">lxml.html._diffcommand module</a></li>
97 <li class="toctree-l3"><a class="reference internal" href="lxml.html._setmixin.html">lxml.html._setmixin module</a></li>
98 <li class="toctree-l3"><a class="reference internal" href="lxml.html.builder.html">lxml.html.builder module</a></li>
99 <li class="toctree-l3 current"><a class="current reference internal" href="#">lxml.html.clean module</a></li>
100 <li class="toctree-l3"><a class="reference internal" href="lxml.html.defs.html">lxml.html.defs module</a></li>
101 <li class="toctree-l3"><a class="reference internal" href="lxml.html.diff.html">lxml.html.diff module</a></li>
102 <li class="toctree-l3"><a class="reference internal" href="lxml.html.formfill.html">lxml.html.formfill module</a></li>
103 <li class="toctree-l3"><a class="reference internal" href="lxml.html.html5parser.html">lxml.html.html5parser module</a></li>
104 <li class="toctree-l3"><a class="reference internal" href="lxml.html.soupparser.html">lxml.html.soupparser module</a></li>
105 </ul>
106 </li>
107 <li class="toctree-l2"><a class="reference internal" href="lxml.isoschematron.html">lxml.isoschematron package</a></li>
108 <li class="toctree-l2"><a class="reference internal" href="lxml.ElementInclude.html">lxml.ElementInclude module</a></li>
109 <li class="toctree-l2"><a class="reference internal" href="lxml._elementpath.html">lxml._elementpath module</a></li>
110 <li class="toctree-l2"><a class="reference internal" href="lxml.builder.html">lxml.builder module</a></li>
111 <li class="toctree-l2"><a class="reference internal" href="lxml.cssselect.html">lxml.cssselect module</a></li>
112 <li class="toctree-l2"><a class="reference internal" href="lxml.doctestcompare.html">lxml.doctestcompare module</a></li>
113 <li class="toctree-l2"><a class="reference internal" href="lxml.etree.html">lxml.etree module</a></li>
114 <li class="toctree-l2"><a class="reference internal" href="lxml.objectify.html">lxml.objectify module</a></li>
115 <li class="toctree-l2"><a class="reference internal" href="lxml.sax.html">lxml.sax module</a></li>
116 </ul>
117 </li>
118 </ul>
119
120             
121           
122         </div>
123         
124       </div>
125     </nav>
126
127     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
128
129       
130       <nav class="wy-nav-top" aria-label="top navigation">
131         
132           <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
133           <a href="index.html">lxml</a>
134         
135       </nav>
136
137
138       <div class="wy-nav-content">
139         
140         <div class="rst-content">
141         
142           
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158 <div role="navigation" aria-label="breadcrumbs navigation">
159
160   <ul class="wy-breadcrumbs">
161     
162       <li><a href="index.html" class="icon icon-home" aria-label="Documentation Home"></a> &raquo;</li>
163         
164           <li><a href="lxml.html">lxml package</a> &raquo;</li>
165         
166           <li><a href="lxml.html.html">lxml.html package</a> &raquo;</li>
167         
168       <li>lxml.html.clean module</li>
169     
170     
171       <li class="wy-breadcrumbs-aside">
172         
173             
174             <a href="_sources/lxml.html.clean.rst.txt" rel="nofollow"> View page source</a>
175           
176         
177       </li>
178     
179   </ul>
180
181   
182   <hr/>
183 </div>
184           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
185            <div itemprop="articleBody">
186             
187   <div class="section" id="module-lxml.html.clean">
188 <span id="lxml-html-clean-module"></span><h1>lxml.html.clean module<a class="headerlink" href="#module-lxml.html.clean" title="Permalink to this headline">¶</a></h1>
189 <p>A cleanup tool for HTML.</p>
190 <p>Removes unwanted tags and content.  See the <cite>Cleaner</cite> class for
191 details.</p>
192 <dl class="py class">
193 <dt id="lxml.html.clean.Cleaner">
194 <em class="property">class </em><code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">Cleaner</code><span class="sig-paren">(</span><em class="sig-param"><span class="o">**</span><span class="n">kw</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner" title="Permalink to this definition">¶</a></dt>
195 <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
196 <p>Instances clean the document of each of the possible offending
197 elements.  The cleaning is controlled by attributes; you can
198 override attributes in a subclass, or set them in the constructor.</p>
199 <dl>
200 <dt><code class="docutils literal notranslate"><span class="pre">scripts</span></code>:</dt><dd><p>Removes any <code class="docutils literal notranslate"><span class="pre">&lt;script&gt;</span></code> tags.</p>
201 </dd>
202 <dt><code class="docutils literal notranslate"><span class="pre">javascript</span></code>:</dt><dd><p>Removes any Javascript, like an <code class="docutils literal notranslate"><span class="pre">onclick</span></code> attribute. Also removes stylesheets
203 as they could contain Javascript.</p>
204 </dd>
205 <dt><code class="docutils literal notranslate"><span class="pre">comments</span></code>:</dt><dd><p>Removes any comments.</p>
206 </dd>
207 <dt><code class="docutils literal notranslate"><span class="pre">style</span></code>:</dt><dd><p>Removes any style tags.</p>
208 </dd>
209 <dt><code class="docutils literal notranslate"><span class="pre">inline_style</span></code>:</dt><dd><p>Removes any style attributes.  Defaults to the value of the <code class="docutils literal notranslate"><span class="pre">style</span></code> option.</p>
210 </dd>
211 <dt><code class="docutils literal notranslate"><span class="pre">links</span></code>:</dt><dd><p>Removes any <code class="docutils literal notranslate"><span class="pre">&lt;link&gt;</span></code> tags</p>
212 </dd>
213 <dt><code class="docutils literal notranslate"><span class="pre">meta</span></code>:</dt><dd><p>Removes any <code class="docutils literal notranslate"><span class="pre">&lt;meta&gt;</span></code> tags</p>
214 </dd>
215 <dt><code class="docutils literal notranslate"><span class="pre">page_structure</span></code>:</dt><dd><p>Structural parts of a page: <code class="docutils literal notranslate"><span class="pre">&lt;head&gt;</span></code>, <code class="docutils literal notranslate"><span class="pre">&lt;html&gt;</span></code>, <code class="docutils literal notranslate"><span class="pre">&lt;title&gt;</span></code>.</p>
216 </dd>
217 <dt><code class="docutils literal notranslate"><span class="pre">processing_instructions</span></code>:</dt><dd><p>Removes any processing instructions.</p>
218 </dd>
219 <dt><code class="docutils literal notranslate"><span class="pre">embedded</span></code>:</dt><dd><p>Removes any embedded objects (flash, iframes)</p>
220 </dd>
221 <dt><code class="docutils literal notranslate"><span class="pre">frames</span></code>:</dt><dd><p>Removes any frame-related tags</p>
222 </dd>
223 <dt><code class="docutils literal notranslate"><span class="pre">forms</span></code>:</dt><dd><p>Removes any form tags</p>
224 </dd>
225 <dt><code class="docutils literal notranslate"><span class="pre">annoying_tags</span></code>:</dt><dd><p>Tags that aren’t <em>wrong</em>, but are annoying.  <code class="docutils literal notranslate"><span class="pre">&lt;blink&gt;</span></code> and <code class="docutils literal notranslate"><span class="pre">&lt;marquee&gt;</span></code></p>
226 </dd>
227 <dt><code class="docutils literal notranslate"><span class="pre">remove_tags</span></code>:</dt><dd><p>A list of tags to remove.  Only the tags will be removed,
228 their content will get pulled up into the parent tag.</p>
229 </dd>
230 <dt><code class="docutils literal notranslate"><span class="pre">kill_tags</span></code>:</dt><dd><p>A list of tags to kill.  Killing also removes the tag’s content,
231 i.e. the whole subtree, not just the tag itself.</p>
232 </dd>
233 <dt><code class="docutils literal notranslate"><span class="pre">allow_tags</span></code>:</dt><dd><p>A list of tags to include (default include all).</p>
234 </dd>
235 <dt><code class="docutils literal notranslate"><span class="pre">remove_unknown_tags</span></code>:</dt><dd><p>Remove any tags that aren’t standard parts of HTML.</p>
236 </dd>
237 <dt><code class="docutils literal notranslate"><span class="pre">safe_attrs_only</span></code>:</dt><dd><p>If true, only include ‘safe’ attributes (specifically the list
238 from the feedparser HTML sanitisation web site).</p>
239 </dd>
240 <dt><code class="docutils literal notranslate"><span class="pre">safe_attrs</span></code>:</dt><dd><p>A set of attribute names to override the default list of attributes
241 considered ‘safe’ (when safe_attrs_only=True).</p>
242 </dd>
243 <dt><code class="docutils literal notranslate"><span class="pre">add_nofollow</span></code>:</dt><dd><p>If true, then any &lt;a&gt; tags will have <code class="docutils literal notranslate"><span class="pre">rel=&quot;nofollow&quot;</span></code> added to them.</p>
244 </dd>
245 <dt><code class="docutils literal notranslate"><span class="pre">host_whitelist</span></code>:</dt><dd><p>A list or set of hosts that you can use for embedded content
246 (for content like <code class="docutils literal notranslate"><span class="pre">&lt;object&gt;</span></code>, <code class="docutils literal notranslate"><span class="pre">&lt;link</span> <span class="pre">rel=&quot;stylesheet&quot;&gt;</span></code>, etc).
247 You can also implement/override the method
248 <code class="docutils literal notranslate"><span class="pre">allow_embedded_url(el,</span> <span class="pre">url)</span></code> or <code class="docutils literal notranslate"><span class="pre">allow_element(el)</span></code> to
249 implement more complex rules for what can be embedded.
250 Anything that passes this test will be shown, regardless of
251 the value of (for instance) <code class="docutils literal notranslate"><span class="pre">embedded</span></code>.</p>
252 <p>Note that this parameter might not work as intended if you do not
253 make the links absolute before doing the cleaning.</p>
254 <p>Note that you may also need to set <code class="docutils literal notranslate"><span class="pre">whitelist_tags</span></code>.</p>
255 </dd>
256 <dt><code class="docutils literal notranslate"><span class="pre">whitelist_tags</span></code>:</dt><dd><p>A set of tags that can be included with <code class="docutils literal notranslate"><span class="pre">host_whitelist</span></code>.
257 The default is <code class="docutils literal notranslate"><span class="pre">iframe</span></code> and <code class="docutils literal notranslate"><span class="pre">embed</span></code>; you may wish to
258 include other tags like <code class="docutils literal notranslate"><span class="pre">script</span></code>, or you may want to
259 implement <code class="docutils literal notranslate"><span class="pre">allow_embedded_url</span></code> for more control.  Set to None to
260 include all tags.</p>
261 </dd>
262 </dl>
263 <p>This modifies the document <em>in place</em>.</p>
264 <dl class="py method">
265 <dt id="lxml.html.clean.Cleaner._has_sneaky_javascript">
266 <code class="sig-name descname">_has_sneaky_javascript</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">style</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner._has_sneaky_javascript" title="Permalink to this definition">¶</a></dt>
267 <dd><p>Depending on the browser, stuff like <code class="docutils literal notranslate"><span class="pre">e</span> <span class="pre">x</span> <span class="pre">p</span> <span class="pre">r</span> <span class="pre">e</span> <span class="pre">s</span> <span class="pre">s</span> <span class="pre">i</span> <span class="pre">o</span> <span class="pre">n(...)</span></code>
268 can get interpreted, or <code class="docutils literal notranslate"><span class="pre">expre/*</span> <span class="pre">stuff</span> <span class="pre">*/ssion(...)</span></code>.  This
269 checks for attempts to do stuff like this.</p>
270 <p>Typically the response will be to kill the entire style; if you
271 have just a bit of Javascript in the style another rule will catch
272 that and remove only the Javascript from the style; this catches
273 more sneaky attempts.</p>
274 </dd></dl>
275
276 <dl class="py method">
277 <dt id="lxml.html.clean.Cleaner._kill_elements">
278 <code class="sig-name descname">_kill_elements</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">doc</span></em>, <em class="sig-param"><span class="n">condition</span></em>, <em class="sig-param"><span class="n">iterate</span><span class="o">=</span><span class="default_value">None</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner._kill_elements" title="Permalink to this definition">¶</a></dt>
279 <dd></dd></dl>
280
281 <dl class="py method">
282 <dt id="lxml.html.clean.Cleaner._remove_javascript_link">
283 <code class="sig-name descname">_remove_javascript_link</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">link</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner._remove_javascript_link" title="Permalink to this definition">¶</a></dt>
284 <dd></dd></dl>
285
286 <dl class="py method">
287 <dt id="lxml.html.clean.Cleaner._substitute_comments">
288 <code class="sig-name descname">_substitute_comments</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">string</span></em>, <em class="sig-param"><span class="n">count</span><span class="o">=</span><span class="default_value">0</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner._substitute_comments" title="Permalink to this definition">¶</a></dt>
289 <dd><p>Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.</p>
290 </dd></dl>
291
292 <dl class="py method">
293 <dt id="lxml.html.clean.Cleaner.allow_element">
294 <code class="sig-name descname">allow_element</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">el</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner.allow_element" title="Permalink to this definition">¶</a></dt>
295 <dd><p>Decide whether an element is configured to be accepted or rejected.</p>
296 <dl class="field-list simple">
297 <dt class="field-odd">Parameters</dt>
298 <dd class="field-odd"><p><strong>el</strong> – an element.</p>
299 </dd>
300 <dt class="field-even">Returns</dt>
301 <dd class="field-even"><p>true to accept the element or false to reject/discard it.</p>
302 </dd>
303 </dl>
304 </dd></dl>
305
306 <dl class="py method">
307 <dt id="lxml.html.clean.Cleaner.allow_embedded_url">
308 <code class="sig-name descname">allow_embedded_url</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">el</span></em>, <em class="sig-param"><span class="n">url</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner.allow_embedded_url" title="Permalink to this definition">¶</a></dt>
309 <dd><p>Decide whether a URL that was found in an element’s attributes or text
310 is configured to be accepted or rejected.</p>
311 <dl class="field-list simple">
312 <dt class="field-odd">Parameters</dt>
313 <dd class="field-odd"><ul class="simple">
314 <li><p><strong>el</strong> – an element.</p></li>
315 <li><p><strong>url</strong> – a URL found on the element.</p></li>
316 </ul>
317 </dd>
318 <dt class="field-even">Returns</dt>
319 <dd class="field-even"><p>true to accept the URL and false to reject it.</p>
320 </dd>
321 </dl>
322 </dd></dl>
323
324 <dl class="py method">
325 <dt id="lxml.html.clean.Cleaner.allow_follow">
326 <code class="sig-name descname">allow_follow</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">anchor</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner.allow_follow" title="Permalink to this definition">¶</a></dt>
327 <dd><p>Override to suppress rel=&quot;nofollow&quot; on some anchors.</p>
328 </dd></dl>
329
330 <dl class="py method">
331 <dt id="lxml.html.clean.Cleaner.clean_html">
332 <code class="sig-name descname">clean_html</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">html</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner.clean_html" title="Permalink to this definition">¶</a></dt>
333 <dd></dd></dl>
334
335 <dl class="py method">
336 <dt id="lxml.html.clean.Cleaner.kill_conditional_comments">
337 <code class="sig-name descname">kill_conditional_comments</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">doc</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.Cleaner.kill_conditional_comments" title="Permalink to this definition">¶</a></dt>
338 <dd><p>IE conditional comments basically embed HTML that the parser
339 doesn’t normally see.  We can’t allow anything like that, so
340 we’ll kill any comments that could be conditional.</p>
341 </dd></dl>
342
343 <dl class="py attribute">
344 <dt id="lxml.html.clean.Cleaner._tag_link_attrs">
345 <code class="sig-name descname">_tag_link_attrs</code><em class="property"> = {'a': 'href', 'applet': ['code', 'object'], 'embed': 'src', 'iframe': 'src', 'layer': 'src', 'link': 'href', 'script': 'src'}</em><a class="headerlink" href="#lxml.html.clean.Cleaner._tag_link_attrs" title="Permalink to this definition">¶</a></dt>
346 <dd></dd></dl>
347
348 <dl class="py attribute">
349 <dt id="lxml.html.clean.Cleaner.add_nofollow">
350 <code class="sig-name descname">add_nofollow</code><em class="property"> = False</em><a class="headerlink" href="#lxml.html.clean.Cleaner.add_nofollow" title="Permalink to this definition">¶</a></dt>
351 <dd></dd></dl>
352
353 <dl class="py attribute">
354 <dt id="lxml.html.clean.Cleaner.allow_tags">
355 <code class="sig-name descname">allow_tags</code><em class="property"> = None</em><a class="headerlink" href="#lxml.html.clean.Cleaner.allow_tags" title="Permalink to this definition">¶</a></dt>
356 <dd></dd></dl>
357
358 <dl class="py attribute">
359 <dt id="lxml.html.clean.Cleaner.annoying_tags">
360 <code class="sig-name descname">annoying_tags</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.annoying_tags" title="Permalink to this definition">¶</a></dt>
361 <dd></dd></dl>
362
363 <dl class="py attribute">
364 <dt id="lxml.html.clean.Cleaner.comments">
365 <code class="sig-name descname">comments</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.comments" title="Permalink to this definition">¶</a></dt>
366 <dd></dd></dl>
367
368 <dl class="py attribute">
369 <dt id="lxml.html.clean.Cleaner.embedded">
370 <code class="sig-name descname">embedded</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.embedded" title="Permalink to this definition">¶</a></dt>
371 <dd></dd></dl>
372
373 <dl class="py attribute">
374 <dt id="lxml.html.clean.Cleaner.forms">
375 <code class="sig-name descname">forms</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.forms" title="Permalink to this definition">¶</a></dt>
376 <dd></dd></dl>
377
378 <dl class="py attribute">
379 <dt id="lxml.html.clean.Cleaner.frames">
380 <code class="sig-name descname">frames</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.frames" title="Permalink to this definition">¶</a></dt>
381 <dd></dd></dl>
382
383 <dl class="py attribute">
384 <dt id="lxml.html.clean.Cleaner.host_whitelist">
385 <code class="sig-name descname">host_whitelist</code><em class="property"> = ()</em><a class="headerlink" href="#lxml.html.clean.Cleaner.host_whitelist" title="Permalink to this definition">¶</a></dt>
386 <dd></dd></dl>
387
388 <dl class="py attribute">
389 <dt id="lxml.html.clean.Cleaner.inline_style">
390 <code class="sig-name descname">inline_style</code><em class="property"> = None</em><a class="headerlink" href="#lxml.html.clean.Cleaner.inline_style" title="Permalink to this definition">¶</a></dt>
391 <dd></dd></dl>
392
393 <dl class="py attribute">
394 <dt id="lxml.html.clean.Cleaner.javascript">
395 <code class="sig-name descname">javascript</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.javascript" title="Permalink to this definition">¶</a></dt>
396 <dd></dd></dl>
397
398 <dl class="py attribute">
399 <dt id="lxml.html.clean.Cleaner.kill_tags">
400 <code class="sig-name descname">kill_tags</code><em class="property"> = None</em><a class="headerlink" href="#lxml.html.clean.Cleaner.kill_tags" title="Permalink to this definition">¶</a></dt>
401 <dd></dd></dl>
402
403 <dl class="py attribute">
404 <dt id="lxml.html.clean.Cleaner.links">
405 <code class="sig-name descname">links</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.links" title="Permalink to this definition">¶</a></dt>
406 <dd></dd></dl>
407
408 <dl class="py attribute">
409 <dt id="lxml.html.clean.Cleaner.meta">
410 <code class="sig-name descname">meta</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.meta" title="Permalink to this definition">¶</a></dt>
411 <dd></dd></dl>
412
413 <dl class="py attribute">
414 <dt id="lxml.html.clean.Cleaner.page_structure">
415 <code class="sig-name descname">page_structure</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.page_structure" title="Permalink to this definition">¶</a></dt>
416 <dd></dd></dl>
417
418 <dl class="py attribute">
419 <dt id="lxml.html.clean.Cleaner.processing_instructions">
420 <code class="sig-name descname">processing_instructions</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.processing_instructions" title="Permalink to this definition">¶</a></dt>
421 <dd></dd></dl>
422
423 <dl class="py attribute">
424 <dt id="lxml.html.clean.Cleaner.remove_tags">
425 <code class="sig-name descname">remove_tags</code><em class="property"> = None</em><a class="headerlink" href="#lxml.html.clean.Cleaner.remove_tags" title="Permalink to this definition">¶</a></dt>
426 <dd></dd></dl>
427
428 <dl class="py attribute">
429 <dt id="lxml.html.clean.Cleaner.remove_unknown_tags">
430 <code class="sig-name descname">remove_unknown_tags</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.remove_unknown_tags" title="Permalink to this definition">¶</a></dt>
431 <dd></dd></dl>
432
433 <dl class="py attribute">
434 <dt id="lxml.html.clean.Cleaner.safe_attrs">
435 <code class="sig-name descname">safe_attrs</code><em class="property"> = frozenset({'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', 'colspan', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'})</em><a class="headerlink" href="#lxml.html.clean.Cleaner.safe_attrs" title="Permalink to this definition">¶</a></dt>
436 <dd></dd></dl>
437
438 <dl class="py attribute">
439 <dt id="lxml.html.clean.Cleaner.safe_attrs_only">
440 <code class="sig-name descname">safe_attrs_only</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.safe_attrs_only" title="Permalink to this definition">¶</a></dt>
441 <dd></dd></dl>
442
443 <dl class="py attribute">
444 <dt id="lxml.html.clean.Cleaner.scripts">
445 <code class="sig-name descname">scripts</code><em class="property"> = True</em><a class="headerlink" href="#lxml.html.clean.Cleaner.scripts" title="Permalink to this definition">¶</a></dt>
446 <dd></dd></dl>
447
448 <dl class="py attribute">
449 <dt id="lxml.html.clean.Cleaner.style">
450 <code class="sig-name descname">style</code><em class="property"> = False</em><a class="headerlink" href="#lxml.html.clean.Cleaner.style" title="Permalink to this definition">¶</a></dt>
451 <dd></dd></dl>
452
453 <dl class="py attribute">
454 <dt id="lxml.html.clean.Cleaner.whitelist_tags">
455 <code class="sig-name descname">whitelist_tags</code><em class="property"> = {'embed', 'iframe'}</em><a class="headerlink" href="#lxml.html.clean.Cleaner.whitelist_tags" title="Permalink to this definition">¶</a></dt>
456 <dd></dd></dl>
457
458 </dd></dl>
459
460 <dl class="py function">
461 <dt id="lxml.html.clean._break_text">
462 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">_break_text</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">text</span></em>, <em class="sig-param"><span class="n">max_width</span></em>, <em class="sig-param"><span class="n">break_character</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean._break_text" title="Permalink to this definition">¶</a></dt>
463 <dd></dd></dl>
464
465 <dl class="py function">
466 <dt id="lxml.html.clean._insert_break">
467 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">_insert_break</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">word</span></em>, <em class="sig-param"><span class="n">width</span></em>, <em class="sig-param"><span class="n">break_character</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean._insert_break" title="Permalink to this definition">¶</a></dt>
468 <dd></dd></dl>
469
470 <dl class="py function">
471 <dt id="lxml.html.clean._is_image_dataurl">
472 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">_is_image_dataurl</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">string</span></em>, <em class="sig-param"><span class="n">pos</span><span class="o">=</span><span class="default_value">0</span></em>, <em class="sig-param"><span class="n">endpos</span><span class="o">=</span><span class="default_value">9223372036854775807</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean._is_image_dataurl" title="Permalink to this definition">¶</a></dt>
473 <dd><p>Scan through string looking for a match, and return a corresponding match object instance.</p>
474 <p>Return None if no position in the string matches.</p>
475 </dd></dl>
476
477 <dl class="py function">
478 <dt id="lxml.html.clean._is_javascript_scheme">
479 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">_is_javascript_scheme</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">s</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean._is_javascript_scheme" title="Permalink to this definition">¶</a></dt>
480 <dd></dd></dl>
481
482 <dl class="py function">
483 <dt id="lxml.html.clean._is_possibly_malicious_scheme">
484 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">_is_possibly_malicious_scheme</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">string</span></em>, <em class="sig-param"><span class="n">pos</span><span class="o">=</span><span class="default_value">0</span></em>, <em class="sig-param"><span class="n">endpos</span><span class="o">=</span><span class="default_value">9223372036854775807</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean._is_possibly_malicious_scheme" title="Permalink to this definition">¶</a></dt>
485 <dd><p>Scan through string looking for a match, and return a corresponding match object instance.</p>
486 <p>Return None if no position in the string matches.</p>
487 </dd></dl>
488
489 <dl class="py function">
490 <dt id="lxml.html.clean._link_text">
491 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">_link_text</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">text</span></em>, <em class="sig-param"><span class="n">link_regexes</span></em>, <em class="sig-param"><span class="n">avoid_hosts</span></em>, <em class="sig-param"><span class="n">factory</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean._link_text" title="Permalink to this definition">¶</a></dt>
492 <dd></dd></dl>
493
494 <dl class="py function">
495 <dt id="lxml.html.clean._substitute_whitespace">
496 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">_substitute_whitespace</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">repl</span></em>, <em class="sig-param"><span class="n">string</span></em>, <em class="sig-param"><span class="n">count</span><span class="o">=</span><span class="default_value">0</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean._substitute_whitespace" title="Permalink to this definition">¶</a></dt>
497 <dd><p>Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.</p>
498 </dd></dl>
499
500 <dl class="py function">
501 <dt id="lxml.html.clean.autolink">
502 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">autolink</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">el</span></em>, <em class="sig-param"><span class="n">link_regexes</span><span class="o">=</span><span class="default_value">[re.compile('(?P&lt;body&gt;https?://(?P&lt;host&gt;[a-z0-9._-]+)(?:/[/\\-_.,a-z0-9%&amp;?;=~]*)?(?:\\([/\\-_.,a-z0-9%&amp;?;=~]*\\))?)', re.IGNORECASE), re.compile('mailto:(?P&lt;body&gt;[a-z0-9._-]+&#64;(?P&lt;host&gt;[a-z0-9_.-]+[a-z]))', re.IGNORECASE)]</span></em>, <em class="sig-param"><span class="n">avoid_elements</span><span class="o">=</span><span class="default_value">['textarea', 'pre', 'code', 'head', 'select', 'a']</span></em>, <em class="sig-param"><span class="n">avoid_hosts</span><span class="o">=</span><span class="default_value">[re.compile('^localhost', re.IGNORECASE), re.compile('\\bexample\\.(?:com|org|net)$', re.IGNORECASE), re.compile('^127\\.0\\.0\\.1$')]</span></em>, <em class="sig-param"><span class="n">avoid_classes</span><span class="o">=</span><span class="default_value">['nolink']</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.autolink" title="Permalink to this definition">¶</a></dt>
503 <dd><p>Turn any URLs into links.</p>
504 <p>It will search for links identified by the given regular
505 expressions (by default mailto and http(s) links).</p>
506 <p>It won’t link text in an element in avoid_elements, or an element
507 with a class in avoid_classes.  It won’t link to anything with a
508 host that matches one of the regular expressions in avoid_hosts
509 (by default localhost, 127.0.0.1, and the example.com, example.org and example.net domains).</p>
510 <p>If you pass in an element, the element’s tail will not be
511 substituted, only the contents of the element.</p>
512 </dd></dl>
513
514 <dl class="py function">
515 <dt id="lxml.html.clean.autolink_html">
516 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">autolink_html</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">html</span></em>, <em class="sig-param"><span class="o">*</span><span class="n">args</span></em>, <em class="sig-param"><span class="o">**</span><span class="n">kw</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.autolink_html" title="Permalink to this definition">¶</a></dt>
517 <dd><p>Turn any URLs into links.</p>
518 <p>It will search for links identified by the given regular
519 expressions (by default mailto and http(s) links).</p>
520 <p>It won’t link text in an element in avoid_elements, or an element
521 with a class in avoid_classes.  It won’t link to anything with a
522 host that matches one of the regular expressions in avoid_hosts
523 (by default localhost, 127.0.0.1, and the example.com, example.org and example.net domains).</p>
524 <p>If you pass in an element, the element’s tail will not be
525 substituted, only the contents of the element.</p>
526 </dd></dl>
527
528 <dl class="py function">
529 <dt id="lxml.html.clean.clean_html">
530 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">clean_html</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">html</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.clean_html" title="Permalink to this definition">¶</a></dt>
531 <dd></dd></dl>
532
533 <dl class="py function">
534 <dt id="lxml.html.clean.word_break">
535 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">word_break</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">el</span></em>, <em class="sig-param"><span class="n">max_width</span><span class="o">=</span><span class="default_value">40</span></em>, <em class="sig-param"><span class="n">avoid_elements</span><span class="o">=</span><span class="default_value">['pre', 'textarea', 'code']</span></em>, <em class="sig-param"><span class="n">avoid_classes</span><span class="o">=</span><span class="default_value">['nobreak']</span></em>, <em class="sig-param"><span class="n">break_character</span><span class="o">=</span><span class="default_value">'\u200b'</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.word_break" title="Permalink to this definition">¶</a></dt>
536 <dd><p>Breaks any long words found in the body of the text (not attributes).</p>
537 <p>Doesn’t affect any of the tags in avoid_elements, by default
538 <code class="docutils literal notranslate"><span class="pre">&lt;pre&gt;</span></code>, <code class="docutils literal notranslate"><span class="pre">&lt;textarea&gt;</span></code> and <code class="docutils literal notranslate"><span class="pre">&lt;code&gt;</span></code>.</p>
539 <p>Breaks words by inserting &amp;#8203;, which is the Unicode
540 Zero Width Space character.  This generally takes up no space
541 in rendering, but does copy as a space, and in monospace contexts
542 usually takes up space.</p>
543 <p>See <a class="reference external" href="http://www.cs.tut.fi/~jkorpela/html/nobr.html">http://www.cs.tut.fi/~jkorpela/html/nobr.html</a> for a discussion.</p>
544 </dd></dl>
545
546 <dl class="py function">
547 <dt id="lxml.html.clean.word_break_html">
548 <code class="sig-prename descclassname">lxml.html.clean.</code><code class="sig-name descname">word_break_html</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">html</span></em>, <em class="sig-param"><span class="o">*</span><span class="n">args</span></em>, <em class="sig-param"><span class="o">**</span><span class="n">kw</span></em><span class="sig-paren">)</span><a class="headerlink" href="#lxml.html.clean.word_break_html" title="Permalink to this definition">¶</a></dt>
549 <dd></dd></dl>
550
551 </div>
552
553
554            </div>
555            
556           </div>
557           <footer>
558   
559     <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
560       
561         <a href="lxml.html.defs.html" class="btn btn-neutral float-right" title="lxml.html.defs module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
562       
563       
564         <a href="lxml.html.builder.html" class="btn btn-neutral float-left" title="lxml.html.builder module" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
565       
566     </div>
567   
568
569   <hr/>
570
571   <div role="contentinfo">
572     <p>
573         
574         &copy; Copyright 2020, lxml dev team
575
576     </p>
577   </div>
578     
579     
580     
581     Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a
582     
583     <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a>
584     
585     provided by <a href="https://readthedocs.org">Read the Docs</a>. 
586
587 </footer>
588
589         </div>
590       </div>
591
592     </section>
593
594   </div>
595   
596
597   <script type="text/javascript">
598       jQuery(function () {
599           SphinxRtdTheme.Navigation.enable(true);
600       });
601   </script>
602
603   
604   
605     
606    
607
608 </body>
609 </html>