1 <?xml version="1.0" encoding="ascii"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3 "DTD/xhtml1-transitional.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
6 <title>lxml.html.clean</title>
7 <link rel="stylesheet" href="epydoc.css" type="text/css" />
8 <script type="text/javascript" src="epydoc.js"></script>
11 <body bgcolor="white" text="black" link="blue" vlink="#204080"
13 <!-- ==================== NAVIGATION BAR ==================== -->
14 <table class="navbar" border="0" width="100%" cellpadding="0"
15 bgcolor="#a0c0ff" cellspacing="0">
18 <th> <a
19 href="lxml-module.html">Home</a> </th>
22 <th> <a
23 href="module-tree.html">Trees</a> </th>
26 <th> <a
27 href="identifier-index.html">Indices</a> </th>
30 <th> <a
31 href="help.html">Help</a> </th>
33 <!-- Project homepage -->
34 <th class="navbar" align="right" width="100%">
35 <table border="0" cellpadding="0" cellspacing="0">
36 <tr><th class="navbar" align="center"
37 ><a class="navbar" target="_top" href="/">lxml API</a></th>
41 <table width="100%" cellpadding="0" cellspacing="0">
44 <span class="breadcrumbs">
45 <a href="lxml-module.html">Package lxml</a> ::
46 <a href="lxml.html-module.html">Package html</a> ::
51 <table cellpadding="0" cellspacing="0">
52 <!-- hide/show private -->
53 <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
54 onclick="toggle_private();">hide private</a>]</span></td></tr>
55 <tr><td align="right"><span class="options"
56 >[<a href="frames.html" target="_top">frames</a
57 >] | <a href="lxml.html.clean-module.html"
58 target="_top">no frames</a>]</span></td></tr>
63 <!-- ==================== MODULE DESCRIPTION ==================== -->
64 <h1 class="epydoc">Module clean</h1><p class="nomargin-top"><span class="codelink"><a href="lxml.html.clean-pysrc.html">source code</a></span></p>
65 <p>A cleanup tool for HTML.</p>
66 <p>Removes unwanted tags and content. See the <a href="lxml.html.clean.Cleaner-class.html" class="link">Cleaner</a> class for
69 <!-- ==================== CLASSES ==================== -->
70 <a name="section-Classes"></a>
71 <table class="summary" border="1" cellpadding="3"
72 cellspacing="0" width="100%" bgcolor="white">
73 <tr bgcolor="#70b0f0" class="table-header">
74 <td colspan="2" class="table-header">
75 <table border="0" cellpadding="0" cellspacing="0" width="100%">
77 <td align="left"><span class="table-header">Classes</span></td>
78 <td align="right" valign="top"
79 ><span class="options">[<a href="#section-Classes"
80 class="privatelink" onclick="toggle_private();"
81 >hide private</a>]</span></td>
87 <td width="15%" align="right" valign="top" class="summary">
88 <span class="summary-type"> </span>
89 </td><td class="summary">
90 <a href="lxml.html.clean.Cleaner-class.html" class="summary-name">Cleaner</a><br />
91 Instances clean the document of each of the possible offending
92 elements. The cleaning is controlled by attributes; you can
93 override attributes in a subclass, or set them in the constructor.
97 <!-- ==================== FUNCTIONS ==================== -->
98 <a name="section-Functions"></a>
99 <table class="summary" border="1" cellpadding="3"
100 cellspacing="0" width="100%" bgcolor="white">
101 <tr bgcolor="#70b0f0" class="table-header">
102 <td colspan="2" class="table-header">
103 <table border="0" cellpadding="0" cellspacing="0" width="100%">
105 <td align="left"><span class="table-header">Functions</span></td>
106 <td align="right" valign="top"
107 ><span class="options">[<a href="#section-Functions"
108 class="privatelink" onclick="toggle_private();"
109 >hide private</a>]</span></td>
115 <td width="15%" align="right" valign="top" class="summary">
116 <span class="summary-type"> </span>
117 </td><td class="summary">
118 <table width="100%" cellpadding="0" cellspacing="0" border="0">
120 <td><span class="summary-sig"><a name="_substitute_whitespace"></a><span class="summary-sig-name">_substitute_whitespace</span>(<span class="summary-sig-arg">...</span>)</span><br />
121 sub(repl, string[, count = 0]) --> newstring
122 Return the string obtained by replacing the leftmost non-overlapping
123 occurrences of pattern in string by the replacement repl.</td>
124 <td align="right" valign="top">
125 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_substitute_whitespace">source code</a></span>
134 <td width="15%" align="right" valign="top" class="summary">
135 <span class="summary-type"> </span>
136 </td><td class="summary">
137 <table width="100%" cellpadding="0" cellspacing="0" border="0">
139 <td><span class="summary-sig"><a name="clean_html"></a><span class="summary-sig-name">clean_html</span>(<span class="summary-sig-arg">html</span>)</span></td>
140 <td align="right" valign="top">
141 <span class="codelink"><a href="lxml.html.clean-pysrc.html#clean_html">source code</a></span>
150 <td width="15%" align="right" valign="top" class="summary">
151 <span class="summary-type"> </span>
152 </td><td class="summary">
153 <table width="100%" cellpadding="0" cellspacing="0" border="0">
155 <td><span class="summary-sig"><a href="lxml.html.clean-module.html#autolink" class="summary-sig-name">autolink</a>(<span class="summary-sig-arg">el</span>,
156 <span class="summary-sig-arg">link_regexes</span>=<span class="summary-sig-default"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">(?P&lt;</code><code class="re-ref">body</code><code class="re-group">&gt;</code>https<code class="re-op">?</code>://<code class="re-group">(?P&lt;</code><code class="re-ref">host</code><code class="re-group">&gt;</code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9\._-<code class="re-group">]</code><code class="re-op">+</code><code class="re-group">)</code><code class="re-group">(?:</code><code class="variable-ellipsis">...</code></span>,
157 <span class="summary-sig-arg">avoid_elements</span>=<span class="summary-sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">head</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">select</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">a</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>,
158 <span class="summary-sig-arg">avoid_hosts</span>=<span class="summary-sig-default"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code>^localhost')<code class="variable-op">, </code>re.compile(r'<code class="re-flags">(?i)</code>\bexample\.<code class="re-group">(?</code><code class="variable-ellipsis">...</code></span>,
159 <span class="summary-sig-arg">avoid_classes</span>=<span class="summary-sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nolink</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>)</span><br />
160 Turn any URLs into links.</td>
161 <td align="right" valign="top">
162 <span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink">source code</a></span>
171 <td width="15%" align="right" valign="top" class="summary">
172 <span class="summary-type"> </span>
173 </td><td class="summary">
174 <table width="100%" cellpadding="0" cellspacing="0" border="0">
176 <td><span class="summary-sig"><a name="_link_text"></a><span class="summary-sig-name">_link_text</span>(<span class="summary-sig-arg">text</span>,
177 <span class="summary-sig-arg">link_regexes</span>,
178 <span class="summary-sig-arg">avoid_hosts</span>,
179 <span class="summary-sig-arg">factory</span>)</span></td>
180 <td align="right" valign="top">
181 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_link_text">source code</a></span>
190 <td width="15%" align="right" valign="top" class="summary">
191 <span class="summary-type"> </span>
192 </td><td class="summary">
193 <table width="100%" cellpadding="0" cellspacing="0" border="0">
195 <td><span class="summary-sig"><a href="lxml.html.clean-module.html#autolink_html" class="summary-sig-name">autolink_html</a>(<span class="summary-sig-arg">html</span>,
196 <span class="summary-sig-arg">*args</span>,
197 <span class="summary-sig-arg">**kw</span>)</span><br />
198 Turn any URLs into links.</td>
199 <td align="right" valign="top">
200 <span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink_html">source code</a></span>
209 <td width="15%" align="right" valign="top" class="summary">
210 <span class="summary-type"> </span>
211 </td><td class="summary">
212 <table width="100%" cellpadding="0" cellspacing="0" border="0">
214 <td><span class="summary-sig"><a href="lxml.html.clean-module.html#word_break" class="summary-sig-name">word_break</a>(<span class="summary-sig-arg">el</span>,
215 <span class="summary-sig-arg">max_width</span>=<span class="summary-sig-default">40</span>,
216 <span class="summary-sig-arg">avoid_elements</span>=<span class="summary-sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>,
217 <span class="summary-sig-arg">avoid_classes</span>=<span class="summary-sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nobreak</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>,
218 <span class="summary-sig-arg">break_character</span>=<span class="summary-sig-default"><code class="variable-quote">u'</code><code class="variable-string">​</code><code class="variable-quote">'</code></span>)</span><br />
219 Breaks any long words found in the body of the text (not attributes).</td>
220 <td align="right" valign="top">
221 <span class="codelink"><a href="lxml.html.clean-pysrc.html#word_break">source code</a></span>
230 <td width="15%" align="right" valign="top" class="summary">
231 <span class="summary-type"> </span>
232 </td><td class="summary">
233 <table width="100%" cellpadding="0" cellspacing="0" border="0">
235 <td><span class="summary-sig"><a name="word_break_html"></a><span class="summary-sig-name">word_break_html</span>(<span class="summary-sig-arg">html</span>,
236 <span class="summary-sig-arg">*args</span>,
237 <span class="summary-sig-arg">**kw</span>)</span></td>
238 <td align="right" valign="top">
239 <span class="codelink"><a href="lxml.html.clean-pysrc.html#word_break_html">source code</a></span>
248 <td width="15%" align="right" valign="top" class="summary">
249 <span class="summary-type"> </span>
250 </td><td class="summary">
251 <table width="100%" cellpadding="0" cellspacing="0" border="0">
253 <td><span class="summary-sig"><a name="_break_text"></a><span class="summary-sig-name">_break_text</span>(<span class="summary-sig-arg">text</span>,
254 <span class="summary-sig-arg">max_width</span>,
255 <span class="summary-sig-arg">break_character</span>)</span></td>
256 <td align="right" valign="top">
257 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_break_text">source code</a></span>
266 <td width="15%" align="right" valign="top" class="summary">
267 <span class="summary-type"> </span>
268 </td><td class="summary">
269 <table width="100%" cellpadding="0" cellspacing="0" border="0">
271 <td><span class="summary-sig"><a name="_insert_break"></a><span class="summary-sig-name">_insert_break</span>(<span class="summary-sig-arg">word</span>,
272 <span class="summary-sig-arg">width</span>,
273 <span class="summary-sig-arg">break_character</span>)</span></td>
274 <td align="right" valign="top">
275 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_insert_break">source code</a></span>
284 <!-- ==================== VARIABLES ==================== -->
285 <a name="section-Variables"></a>
286 <table class="summary" border="1" cellpadding="3"
287 cellspacing="0" width="100%" bgcolor="white">
288 <tr bgcolor="#70b0f0" class="table-header">
289 <td colspan="2" class="table-header">
290 <table border="0" cellpadding="0" cellspacing="0" width="100%">
292 <td align="left"><span class="table-header">Variables</span></td>
293 <td align="right" valign="top"
294 ><span class="options">[<a href="#section-Variables"
295 class="privatelink" onclick="toggle_private();"
296 >hide private</a>]</span></td>
302 <td width="15%" align="right" valign="top" class="summary">
303 <span class="summary-type"> </span>
304 </td><td class="summary">
305 <a name="_css_javascript_re"></a><span class="summary-name">_css_javascript_re</span> = <code title="re.compile(r'(?is)expression\s*\(.*?\)')">re.compile(r'<code class="re-flags">(?is)</code>expression\s<code class="re-op">*</code>\(.<code class="re-op">*?</code>\)')</code>
309 <td width="15%" align="right" valign="top" class="summary">
310 <span class="summary-type"> </span>
311 </td><td class="summary">
312 <a name="_css_import_re"></a><span class="summary-name">_css_import_re</span> = <code title="re.compile(r'(?i)@\s*import')">re.compile(r'<code class="re-flags">(?i)</code>@\s<code class="re-op">*</code>import')</code>
316 <td width="15%" align="right" valign="top" class="summary">
317 <span class="summary-type"> </span>
318 </td><td class="summary">
319 <a href="lxml.html.clean-module.html#_javascript_scheme_re" class="summary-name" onclick="show_private();">_javascript_scheme_re</a> = <code title="re.compile(r'(?i)\s*(?:javascript|jscript|livescript|vbscript|data|abo\
320 ut|mocha):')">re.compile(r'<code class="re-flags">(?i)</code>\s<code class="re-op">*</code><code class="re-group">(?:</code>javascript<code class="re-op">|</code>jscr<code class="variable-ellipsis">...</code></code>
324 <td width="15%" align="right" valign="top" class="summary">
325 <span class="summary-type"> </span>
326 </td><td class="summary">
327 <a href="lxml.html.clean-module.html#_conditional_comment_re" class="summary-name" onclick="show_private();">_conditional_comment_re</a> = <code title="re.compile(r'(?is)\[if[\s\n\r]+.*?\][\s\n\r]*>')">re.compile(r'<code class="re-flags">(?is)</code>\[if<code class="re-group">[</code>\s\n\r<code class="re-group">]</code><code class="re-op">+</code>.<code class="re-op">*?</code>\]<code class="variable-ellipsis">...</code></code>
331 <td width="15%" align="right" valign="top" class="summary">
332 <span class="summary-type"> </span>
333 </td><td class="summary">
334 <a name="_find_styled_elements"></a><span class="summary-name">_find_styled_elements</span> = <code title="descendant-or-self::*[@style]">descendant-or-self::*[@style]</code>
338 <td width="15%" align="right" valign="top" class="summary">
339 <span class="summary-type"> </span>
340 </td><td class="summary">
341 <a href="lxml.html.clean-module.html#_find_external_links" class="summary-name" onclick="show_private();">_find_external_links</a> = <code title="descendant-or-self::a [normalize-space(@href) and substring(normalize\
342 -space(@href),1,1) != '#'] |descendant-or-self::x:a[normalize-space(@h\
343 ref) and substring(normalize-space(@href),1,1) != '#']">descendant-or-self::a [normalize-space<code class="variable-ellipsis">...</code></code>
347 <td width="15%" align="right" valign="top" class="summary">
348 <span class="summary-type"> </span>
349 </td><td class="summary">
350 <a name="clean"></a><span class="summary-name">clean</span> = <code title="Cleaner()">Cleaner()</code>
354 <td width="15%" align="right" valign="top" class="summary">
355 <span class="summary-type"> </span>
356 </td><td class="summary">
357 <a href="lxml.html.clean-module.html#_link_regexes" class="summary-name" onclick="show_private();">_link_regexes</a> = <code title="[re.compile(r'(?i)(?P&lt;body&gt;https?://(?P&lt;host&gt;[a-z0-9\._-]+)(?:/[/-_\.,\
358 a-z0-9%&amp;\?;=~]*)?(?:\([/-_\.,a-z0-9%&amp;\?;=~]*\))?)'),
359 re.compile(r'(?i)mailto:(?P&lt;body&gt;[a-z0-9\._-]+@(?P&lt;host&gt;[a-z0-9_\._]+\
360 [a-z]))')]"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">(?P&lt;</code><code class="re-ref">body</code><code class="re-group">&gt;</code>https<code class="re-op">?</code>://<code class="re-group">(?P&lt;</code><code class="re-ref">host</code><code class="re-group">&gt;</code><code class="re-group">[</code><code class="variable-ellipsis">...</code></code>
364 <td width="15%" align="right" valign="top" class="summary">
365 <span class="summary-type"> </span>
366 </td><td class="summary">
367 <a href="lxml.html.clean-module.html#_avoid_elements" class="summary-name" onclick="show_private();">_avoid_elements</a> = <code title="['textarea', 'pre', 'code', 'head', 'select', 'a']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">head</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">select</code><code class="variable-quote">'</code><code class="variable-ellipsis">...</code></code>
371 <td width="15%" align="right" valign="top" class="summary">
372 <span class="summary-type"> </span>
373 </td><td class="summary">
374 <a href="lxml.html.clean-module.html#_avoid_hosts" class="summary-name" onclick="show_private();">_avoid_hosts</a> = <code title="[re.compile(r'(?i)^localhost'),
375 re.compile(r'(?i)\bexample\.(?:com|org|net)$'),
376 re.compile(r'^127\.0\.0\.1$')]"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code>^localhost')<code class="variable-op">, </code>re.compile(r'<code class="re-flags">(?</code><code class="variable-ellipsis">...</code></code>
380 <td width="15%" align="right" valign="top" class="summary">
381 <span class="summary-type"> </span>
382 </td><td class="summary">
383 <a name="_avoid_classes"></a><span class="summary-name">_avoid_classes</span> = <code title="['nolink']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nolink</code><code class="variable-quote">'</code><code class="variable-group">]</code></code>
387 <td width="15%" align="right" valign="top" class="summary">
388 <span class="summary-type"> </span>
389 </td><td class="summary">
390 <a name="_avoid_word_break_elements"></a><span class="summary-name">_avoid_word_break_elements</span> = <code title="['pre', 'textarea', 'code']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-group">]</code></code>
394 <td width="15%" align="right" valign="top" class="summary">
395 <span class="summary-type"> </span>
396 </td><td class="summary">
397 <a name="_avoid_word_break_classes"></a><span class="summary-name">_avoid_word_break_classes</span> = <code title="['nobreak']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nobreak</code><code class="variable-quote">'</code><code class="variable-group">]</code></code>
401 <td width="15%" align="right" valign="top" class="summary">
402 <span class="summary-type"> </span>
403 </td><td class="summary">
404 <a name="_break_prefer_re"></a><span class="summary-name">_break_prefer_re</span> = <code title="re.compile(r'(?i)[^a-z]')">re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">[</code><code class="re-op">^</code>a<code class="re-op">-</code>z<code class="re-group">]</code>')</code>
408 <td width="15%" align="right" valign="top" class="summary">
409 <span class="summary-type"> </span>
410 </td><td class="summary">
411 <a name="__package__"></a><span class="summary-name">__package__</span> = <code title="'lxml.html'"><code class="variable-quote">'</code><code class="variable-string">lxml.html</code><code class="variable-quote">'</code></code>
415 <!-- ==================== FUNCTION DETAILS ==================== -->
416 <a name="section-FunctionDetails"></a>
417 <table class="details" border="1" cellpadding="3"
418 cellspacing="0" width="100%" bgcolor="white">
419 <tr bgcolor="#70b0f0" class="table-header">
420 <td colspan="2" class="table-header">
421 <table border="0" cellpadding="0" cellspacing="0" width="100%">
423 <td align="left"><span class="table-header">Function Details</span></td>
424 <td align="right" valign="top"
425 ><span class="options">[<a href="#section-FunctionDetails"
426 class="privatelink" onclick="toggle_private();"
427 >hide private</a>]</span></td>
433 <a name="autolink"></a>
435 <table class="details" border="1" cellpadding="3"
436 cellspacing="0" width="100%" bgcolor="white">
438 <table width="100%" cellpadding="0" cellspacing="0" border="0">
439 <tr valign="top"><td>
440 <h3 class="epydoc"><span class="sig"><span class="sig-name">autolink</span>(<span class="sig-arg">el</span>,
441 <span class="sig-arg">link_regexes</span>=<span class="sig-default"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">(?P&lt;</code><code class="re-ref">body</code><code class="re-group">&gt;</code>https<code class="re-op">?</code>://<code class="re-group">(?P&lt;</code><code class="re-ref">host</code><code class="re-group">&gt;</code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9\._-<code class="re-group">]</code><code class="re-op">+</code><code class="re-group">)</code><code class="re-group">(?:</code><code class="variable-ellipsis">...</code></span>,
442 <span class="sig-arg">avoid_elements</span>=<span class="sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">head</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">select</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">a</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>,
443 <span class="sig-arg">avoid_hosts</span>=<span class="sig-default"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code>^localhost')<code class="variable-op">, </code>re.compile(r'<code class="re-flags">(?i)</code>\bexample\.<code class="re-group">(?</code><code class="variable-ellipsis">...</code></span>,
444 <span class="sig-arg">avoid_classes</span>=<span class="sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nolink</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>)</span>
446 </td><td align="right" valign="top"
447 ><span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink">source code</a></span>
451 <p>Turn any URLs into links.</p>
452 <p>It will search for links identified by the given regular
453 expressions (by default mailto and http(s) links).</p>
454 <p>It won't link text in an element in avoid_elements, or an element
455 with a class in avoid_classes. It won't link to anything with a
456 host that matches one of the regular expressions in avoid_hosts
457 (default localhost and 127.0.0.1).</p>
458 <p>If you pass in an element, the element's tail will not be
459 substituted, only the contents of the element.</p>
464 <a name="autolink_html"></a>
466 <table class="details" border="1" cellpadding="3"
467 cellspacing="0" width="100%" bgcolor="white">
469 <table width="100%" cellpadding="0" cellspacing="0" border="0">
470 <tr valign="top"><td>
471 <h3 class="epydoc"><span class="sig"><span class="sig-name">autolink_html</span>(<span class="sig-arg">html</span>,
472 <span class="sig-arg">*args</span>,
473 <span class="sig-arg">**kw</span>)</span>
475 </td><td align="right" valign="top"
476 ><span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink_html">source code</a></span>
480 <p>Turn any URLs into links.</p>
481 <p>It will search for links identified by the given regular
482 expressions (by default mailto and http(s) links).</p>
483 <p>It won't link text in an element in avoid_elements, or an element
484 with a class in avoid_classes. It won't link to anything with a
485 host that matches one of the regular expressions in avoid_hosts
486 (default localhost and 127.0.0.1).</p>
487 <p>If you pass in an element, the element's tail will not be
488 substituted, only the contents of the element.</p>
493 <a name="word_break"></a>
495 <table class="details" border="1" cellpadding="3"
496 cellspacing="0" width="100%" bgcolor="white">
498 <table width="100%" cellpadding="0" cellspacing="0" border="0">
499 <tr valign="top"><td>
500 <h3 class="epydoc"><span class="sig"><span class="sig-name">word_break</span>(<span class="sig-arg">el</span>,
501 <span class="sig-arg">max_width</span>=<span class="sig-default">40</span>,
502 <span class="sig-arg">avoid_elements</span>=<span class="sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>,
503 <span class="sig-arg">avoid_classes</span>=<span class="sig-default"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nobreak</code><code class="variable-quote">'</code><code class="variable-group">]</code></span>,
504 <span class="sig-arg">break_character</span>=<span class="sig-default"><code class="variable-quote">u'</code><code class="variable-string">​</code><code class="variable-quote">'</code></span>)</span>
506 </td><td align="right" valign="top"
507 ><span class="codelink"><a href="lxml.html.clean-pysrc.html#word_break">source code</a></span>
511 <p>Breaks any long words found in the body of the text (not attributes).</p>
512 <p>Doesn't affect any of the tags in avoid_elements, by default
513 <tt class="rst-docutils literal">&lt;textarea&gt;</tt> and <tt class="rst-docutils literal">&lt;pre&gt;</tt></p>
514 <p>Breaks words by inserting &amp;#8203;, which is the Unicode
515 Zero Width Space character. This generally takes up no space
516 in rendering, but does copy as a space, and in monospace contexts
517 usually takes up space.</p>
518 <p>See <a class="rst-reference external" href="http://www.cs.tut.fi/~jkorpela/html/nobr.html" target="_top">http://www.cs.tut.fi/~jkorpela/html/nobr.html</a> for a discussion</p>
524 <!-- ==================== VARIABLES DETAILS ==================== -->
525 <a name="section-VariablesDetails"></a>
526 <table class="details" border="1" cellpadding="3"
527 cellspacing="0" width="100%" bgcolor="white">
528 <tr bgcolor="#70b0f0" class="table-header">
529 <td colspan="2" class="table-header">
530 <table border="0" cellpadding="0" cellspacing="0" width="100%">
532 <td align="left"><span class="table-header">Variables Details</span></td>
533 <td align="right" valign="top"
534 ><span class="options">[<a href="#section-VariablesDetails"
535 class="privatelink" onclick="toggle_private();"
536 >hide private</a>]</span></td>
542 <a name="_javascript_scheme_re"></a>
543 <div class="private">
544 <table class="details" border="1" cellpadding="3"
545 cellspacing="0" width="100%" bgcolor="white">
547 <h3 class="epydoc">_javascript_scheme_re</h3>
553 <dd><table><tr><td><pre class="variable">
554 re.compile(r'<code class="re-flags">(?i)</code>\s<code class="re-op">*</code><code class="re-group">(?:</code>javascript<code class="re-op">|</code>jscript<code class="re-op">|</code>livescript<code class="re-op">|</code>vbscript<code class="re-op">|</code>data<code class="re-op">|</code>abo<span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
555 ut<code class="re-op">|</code>mocha<code class="re-group">)</code>:')
556 </pre></td></tr></table>
561 <a name="_conditional_comment_re"></a>
562 <div class="private">
563 <table class="details" border="1" cellpadding="3"
564 cellspacing="0" width="100%" bgcolor="white">
566 <h3 class="epydoc">_conditional_comment_re</h3>
572 <dd><table><tr><td><pre class="variable">
573 re.compile(r'<code class="re-flags">(?is)</code>\[if<code class="re-group">[</code>\s\n\r<code class="re-group">]</code><code class="re-op">+</code>.<code class="re-op">*?</code>\]<code class="re-group">[</code>\s\n\r<code class="re-group">]</code><code class="re-op">*</code>>')
574 </pre></td></tr></table>
579 <a name="_find_external_links"></a>
580 <div class="private">
581 <table class="details" border="1" cellpadding="3"
582 cellspacing="0" width="100%" bgcolor="white">
584 <h3 class="epydoc">_find_external_links</h3>
590 <dd><table><tr><td><pre class="variable">
591 descendant-or-self::a [normalize-space(@href) and substring(normalize<span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
592 -space(@href),1,1) != '#'] |descendant-or-self::x:a[normalize-space(@h<span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
593 ref) and substring(normalize-space(@href),1,1) != '#']
594 </pre></td></tr></table>
599 <a name="_link_regexes"></a>
600 <div class="private">
601 <table class="details" border="1" cellpadding="3"
602 cellspacing="0" width="100%" bgcolor="white">
604 <h3 class="epydoc">_link_regexes</h3>
610 <dd><table><tr><td><pre class="variable">
611 <code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">(?P<</code><code class="re-ref">body</code><code class="re-group">></code>https<code class="re-op">?</code>://<code class="re-group">(?P<</code><code class="re-ref">host</code><code class="re-group">></code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9\._-<code class="re-group">]</code><code class="re-op">+</code><code class="re-group">)</code><code class="re-group">(?:</code>/<code class="re-group">[</code>/-_\.,<span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
612 a<code class="re-op">-</code>z0<code class="re-op">-</code>9%&amp;\?;=~<code class="re-group">]</code><code class="re-op">*</code><code class="re-group">)</code><code class="re-op">?</code><code class="re-group">(?:</code>\(<code class="re-group">[</code>/-_\.,a<code class="re-op">-</code>z0<code class="re-op">-</code>9%&amp;\?;=~<code class="re-group">]</code><code class="re-op">*</code>\)<code class="re-group">)</code><code class="re-op">?</code><code class="re-group">)</code>')<code class="variable-op">,</code>
613 re.compile(r'<code class="re-flags">(?i)</code>mailto:<code class="re-group">(?P&lt;</code><code class="re-ref">body</code><code class="re-group">&gt;</code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9\._-<code class="re-group">]</code><code class="re-op">+</code>@<code class="re-group">(?P&lt;</code><code class="re-ref">host</code><code class="re-group">&gt;</code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9_\._<code class="re-group">]</code><code class="re-op">+</code><code class="re-group"></code><span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
614 <code class="re-group">[</code>a<code class="re-op">-</code>z<code class="re-group">]</code><code class="re-group">)</code><code class="re-group">)</code>')<code class="variable-group">]</code>
615 </pre></td></tr></table>
620 <a name="_avoid_elements"></a>
621 <div class="private">
622 <table class="details" border="1" cellpadding="3"
623 cellspacing="0" width="100%" bgcolor="white">
625 <h3 class="epydoc">_avoid_elements</h3>
631 <dd><table><tr><td><pre class="variable">
632 <code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">head</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">select</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">a</code><code class="variable-quote">'</code><code class="variable-group">]</code>
633 </pre></td></tr></table>
638 <a name="_avoid_hosts"></a>
639 <div class="private">
640 <table class="details" border="1" cellpadding="3"
641 cellspacing="0" width="100%" bgcolor="white">
643 <h3 class="epydoc">_avoid_hosts</h3>
649 <dd><table><tr><td><pre class="variable">
650 <code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code>^localhost')<code class="variable-op">,</code>
651 re.compile(r'<code class="re-flags">(?i)</code>\bexample\.<code class="re-group">(?:</code>com<code class="re-op">|</code>org<code class="re-op">|</code>net<code class="re-group">)</code>$')<code class="variable-op">,</code>
652 re.compile(r'^127\.0\.0\.1$')<code class="variable-group">]</code>
653 </pre></td></tr></table>
659 <!-- ==================== NAVIGATION BAR ==================== -->
660 <table class="navbar" border="0" width="100%" cellpadding="0"
661 bgcolor="#a0c0ff" cellspacing="0">
664 <th> <a
665 href="lxml-module.html">Home</a> </th>
668 <th> <a
669 href="module-tree.html">Trees</a> </th>
672 <th> <a
673 href="identifier-index.html">Indices</a> </th>
676 <th> <a
677 href="help.html">Help</a> </th>
679 <!-- Project homepage -->
680 <th class="navbar" align="right" width="100%">
681 <table border="0" cellpadding="0" cellspacing="0">
682 <tr><th class="navbar" align="center"
683 ><a class="navbar" target="_top" href="/">lxml API</a></th>
687 <table border="0" cellpadding="0" cellspacing="0" width="100%">
689 <td align="left" class="footer">
690 Generated by Epydoc 3.0.1 on Tue Jul 31 10:14:17 2012
692 <td align="right" class="footer">
693 <a target="mainFrame" href="http://epydoc.sourceforge.net"
694 >http://epydoc.sourceforge.net</a>
699 <script type="text/javascript">
701 // Private objects are initially displayed (because if
702 // javascript is turned off then we want them to be
703 // visible); but by default, we want to hide them. So hide
704 // them unless we have a cookie that says to show them.