1 <?xml version="1.0" encoding="ascii"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3 "DTD/xhtml1-transitional.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
6 <title>lxml.html.clean</title>
7 <link rel="stylesheet" href="epydoc.css" type="text/css" />
8 <script type="text/javascript" src="epydoc.js"></script>
11 <body bgcolor="white" text="black" link="blue" vlink="#204080"
13 <!-- ==================== NAVIGATION BAR ==================== -->
14 <table class="navbar" border="0" width="100%" cellpadding="0"
15 bgcolor="#a0c0ff" cellspacing="0">
18 <th> <a
19 href="lxml-module.html">Home</a> </th>
22 <th> <a
23 href="module-tree.html">Trees</a> </th>
26 <th> <a
27 href="identifier-index.html">Indices</a> </th>
30 <th> <a
31 href="help.html">Help</a> </th>
33 <!-- Project homepage -->
34 <th class="navbar" align="right" width="100%">
35 <table border="0" cellpadding="0" cellspacing="0">
36 <tr><th class="navbar" align="center"
37 ><a class="navbar" target="_top" href="/">lxml API</a></th>
41 <table width="100%" cellpadding="0" cellspacing="0">
44 <span class="breadcrumbs">
45 <a href="lxml-module.html">Package lxml</a> ::
46 <a href="lxml.html-module.html">Package html</a> ::
51 <table cellpadding="0" cellspacing="0">
52 <!-- hide/show private -->
53 <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
54 onclick="toggle_private();">hide private</a>]</span></td></tr>
55 <tr><td align="right"><span class="options"
56 >[<a href="frames.html" target="_top">frames</a
57 >] | <a href="lxml.html.clean-module.html"
58 target="_top">no frames</a>]</span></td></tr>
63 <!-- ==================== MODULE DESCRIPTION ==================== -->
64 <h1 class="epydoc">Module clean</h1><p class="nomargin-top"><span class="codelink"><a href="lxml.html.clean-pysrc.html">source code</a></span></p>
65 <p>A cleanup tool for HTML.</p>
66 <p>Removes unwanted tags and content. See the <a href="lxml.html.clean.Cleaner-class.html" class="link">Cleaner</a> class for
69 <!-- ==================== CLASSES ==================== -->
70 <a name="section-Classes"></a>
71 <table class="summary" border="1" cellpadding="3"
72 cellspacing="0" width="100%" bgcolor="white">
73 <tr bgcolor="#70b0f0" class="table-header">
74 <td colspan="2" class="table-header">
75 <table border="0" cellpadding="0" cellspacing="0" width="100%">
77 <td align="left"><span class="table-header">Classes</span></td>
78 <td align="right" valign="top"
79 ><span class="options">[<a href="#section-Classes"
80 class="privatelink" onclick="toggle_private();"
81 >hide private</a>]</span></td>
87 <td width="15%" align="right" valign="top" class="summary">
88 <span class="summary-type"> </span>
89 </td><td class="summary">
90 <a href="str-class.html" class="summary-name" onclick="show_private();">unicode</a><br />
91 str(object='') -> string
95 <td width="15%" align="right" valign="top" class="summary">
96 <span class="summary-type"> </span>
97 </td><td class="summary">
98 <a href="lxml.html.clean.Cleaner-class.html" class="summary-name">Cleaner</a><br />
99 Instances clean the document of each of the possible offending
100 elements. The cleaning is controlled by attributes; you can
101 override attributes in a subclass, or set them in the constructor.
105 <!-- ==================== FUNCTIONS ==================== -->
106 <a name="section-Functions"></a>
107 <table class="summary" border="1" cellpadding="3"
108 cellspacing="0" width="100%" bgcolor="white">
109 <tr bgcolor="#70b0f0" class="table-header">
110 <td colspan="2" class="table-header">
111 <table border="0" cellpadding="0" cellspacing="0" width="100%">
113 <td align="left"><span class="table-header">Functions</span></td>
114 <td align="right" valign="top"
115 ><span class="options">[<a href="#section-Functions"
116 class="privatelink" onclick="toggle_private();"
117 >hide private</a>]</span></td>
123 <td width="15%" align="right" valign="top" class="summary">
124 <span class="summary-type">character</span>
125 </td><td class="summary">
126 <table width="100%" cellpadding="0" cellspacing="0" border="0">
128 <td><span class="summary-sig"><a name="unichr"></a><span class="summary-sig-name">unichr</span>(<span class="summary-sig-arg">i</span>)</span><br />
129 Return a string of one character with ordinal i; 0 &lt;= i &lt; 256.</td>
130 <td align="right" valign="top">
140 <td width="15%" align="right" valign="top" class="summary">
141 <span class="summary-type"> </span>
142 </td><td class="summary">
143 <table width="100%" cellpadding="0" cellspacing="0" border="0">
145 <td><span class="summary-sig"><a name="_is_image_dataurl"></a><span class="summary-sig-name">_is_image_dataurl</span>(<span class="summary-sig-arg">...</span>)</span><br />
146 search(string[, pos[, endpos]]) --> match object or None.
147 Scan through string looking for a match, and return a corresponding
148 match object instance. Return None if no position in the string matches.</td>
149 <td align="right" valign="top">
150 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_is_image_dataurl">source code</a></span>
159 <td width="15%" align="right" valign="top" class="summary">
160 <span class="summary-type"> </span>
161 </td><td class="summary">
162 <table width="100%" cellpadding="0" cellspacing="0" border="0">
164 <td><span class="summary-sig"><a name="_is_possibly_malicious_scheme"></a><span class="summary-sig-name">_is_possibly_malicious_scheme</span>(<span class="summary-sig-arg">...</span>)</span><br />
165 search(string[, pos[, endpos]]) --> match object or None.
166 Scan through string looking for a match, and return a corresponding
167 match object instance. Return None if no position in the string matches.</td>
168 <td align="right" valign="top">
169 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_is_possibly_malicious_scheme">source code</a></span>
178 <td width="15%" align="right" valign="top" class="summary">
179 <span class="summary-type"> </span>
180 </td><td class="summary">
181 <table width="100%" cellpadding="0" cellspacing="0" border="0">
183 <td><span class="summary-sig"><a name="_is_javascript_scheme"></a><span class="summary-sig-name">_is_javascript_scheme</span>(<span class="summary-sig-arg">s</span>)</span></td>
184 <td align="right" valign="top">
185 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_is_javascript_scheme">source code</a></span>
194 <td width="15%" align="right" valign="top" class="summary">
195 <span class="summary-type"> </span>
196 </td><td class="summary">
197 <table width="100%" cellpadding="0" cellspacing="0" border="0">
199 <td><span class="summary-sig"><a name="_substitute_whitespace"></a><span class="summary-sig-name">_substitute_whitespace</span>(<span class="summary-sig-arg">...</span>)</span><br />
200 sub(repl, string[, count = 0]) --> newstring
201 Return the string obtained by replacing the leftmost non-overlapping
202 occurrences of pattern in string by the replacement repl.</td>
203 <td align="right" valign="top">
204 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_substitute_whitespace">source code</a></span>
213 <td width="15%" align="right" valign="top" class="summary">
214 <span class="summary-type"> </span>
215 </td><td class="summary">
216 <table width="100%" cellpadding="0" cellspacing="0" border="0">
218 <td><span class="summary-sig"><a name="clean_html"></a><span class="summary-sig-name">clean_html</span>(<span class="summary-sig-arg">...</span>)</span></td>
219 <td align="right" valign="top">
220 <span class="codelink"><a href="lxml.html.clean-pysrc.html#clean_html">source code</a></span>
229 <td width="15%" align="right" valign="top" class="summary">
230 <span class="summary-type"> </span>
231 </td><td class="summary">
232 <table width="100%" cellpadding="0" cellspacing="0" border="0">
234 <td><span class="summary-sig"><a href="lxml.html.clean-module.html#autolink" class="summary-sig-name">autolink</a>(<span class="summary-sig-arg">el</span>,
235 <span class="summary-sig-arg">link_regexes</span>=<span class="summary-sig-default">_link_regexes</span>,
236 <span class="summary-sig-arg">avoid_elements</span>=<span class="summary-sig-default">_avoid_elements</span>,
237 <span class="summary-sig-arg">avoid_hosts</span>=<span class="summary-sig-default">_avoid_hosts</span>,
238 <span class="summary-sig-arg">avoid_classes</span>=<span class="summary-sig-default">_avoid_classes</span>)</span><br />
239 Turn any URLs into links.</td>
240 <td align="right" valign="top">
241 <span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink">source code</a></span>
250 <td width="15%" align="right" valign="top" class="summary">
251 <span class="summary-type"> </span>
252 </td><td class="summary">
253 <table width="100%" cellpadding="0" cellspacing="0" border="0">
255 <td><span class="summary-sig"><a name="_link_text"></a><span class="summary-sig-name">_link_text</span>(<span class="summary-sig-arg">text</span>,
256 <span class="summary-sig-arg">link_regexes</span>,
257 <span class="summary-sig-arg">avoid_hosts</span>,
258 <span class="summary-sig-arg">factory</span>)</span></td>
259 <td align="right" valign="top">
260 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_link_text">source code</a></span>
269 <td width="15%" align="right" valign="top" class="summary">
270 <span class="summary-type"> </span>
271 </td><td class="summary">
272 <table width="100%" cellpadding="0" cellspacing="0" border="0">
274 <td><span class="summary-sig"><a href="lxml.html.clean-module.html#autolink_html" class="summary-sig-name">autolink_html</a>(<span class="summary-sig-arg">html</span>)</span><br />
275 Turn any URLs into links.</td>
276 <td align="right" valign="top">
277 <span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink_html">source code</a></span>
286 <td width="15%" align="right" valign="top" class="summary">
287 <span class="summary-type"> </span>
288 </td><td class="summary">
289 <table width="100%" cellpadding="0" cellspacing="0" border="0">
291 <td><span class="summary-sig"><a href="lxml.html.clean-module.html#word_break" class="summary-sig-name">word_break</a>(<span class="summary-sig-arg">el</span>,
292 <span class="summary-sig-arg">max_width</span>=<span class="summary-sig-default">40</span>,
293 <span class="summary-sig-arg">avoid_elements</span>=<span class="summary-sig-default">_avoid_word_break_elements</span>,
294 <span class="summary-sig-arg">avoid_classes</span>=<span class="summary-sig-default">_avoid_word_break_classes</span>,
295 <span class="summary-sig-arg">break_character</span>=<span class="summary-sig-default">unichr(0x200b)</span>)</span><br />
296 Breaks any long words found in the body of the text (not attributes).</td>
297 <td align="right" valign="top">
298 <span class="codelink"><a href="lxml.html.clean-pysrc.html#word_break">source code</a></span>
307 <td width="15%" align="right" valign="top" class="summary">
308 <span class="summary-type"> </span>
309 </td><td class="summary">
310 <table width="100%" cellpadding="0" cellspacing="0" border="0">
312 <td><span class="summary-sig"><a name="word_break_html"></a><span class="summary-sig-name">word_break_html</span>(<span class="summary-sig-arg">html</span>)</span></td>
313 <td align="right" valign="top">
314 <span class="codelink"><a href="lxml.html.clean-pysrc.html#word_break_html">source code</a></span>
323 <td width="15%" align="right" valign="top" class="summary">
324 <span class="summary-type"> </span>
325 </td><td class="summary">
326 <table width="100%" cellpadding="0" cellspacing="0" border="0">
328 <td><span class="summary-sig"><a name="_break_text"></a><span class="summary-sig-name">_break_text</span>(<span class="summary-sig-arg">text</span>,
329 <span class="summary-sig-arg">max_width</span>,
330 <span class="summary-sig-arg">break_character</span>)</span></td>
331 <td align="right" valign="top">
332 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_break_text">source code</a></span>
341 <td width="15%" align="right" valign="top" class="summary">
342 <span class="summary-type"> </span>
343 </td><td class="summary">
344 <table width="100%" cellpadding="0" cellspacing="0" border="0">
346 <td><span class="summary-sig"><a name="_insert_break"></a><span class="summary-sig-name">_insert_break</span>(<span class="summary-sig-arg">word</span>,
347 <span class="summary-sig-arg">width</span>,
348 <span class="summary-sig-arg">break_character</span>)</span></td>
349 <td align="right" valign="top">
350 <span class="codelink"><a href="lxml.html.clean-pysrc.html#_insert_break">source code</a></span>
359 <!-- ==================== VARIABLES ==================== -->
360 <a name="section-Variables"></a>
361 <table class="summary" border="1" cellpadding="3"
362 cellspacing="0" width="100%" bgcolor="white">
363 <tr bgcolor="#70b0f0" class="table-header">
364 <td colspan="2" class="table-header">
365 <table border="0" cellpadding="0" cellspacing="0" width="100%">
367 <td align="left"><span class="table-header">Variables</span></td>
368 <td align="right" valign="top"
369 ><span class="options">[<a href="#section-Variables"
370 class="privatelink" onclick="toggle_private();"
371 >hide private</a>]</span></td>
377 <td width="15%" align="right" valign="top" class="summary">
378 <span class="summary-type"> </span>
379 </td><td class="summary">
380 <a name="basestring"></a><span class="summary-name">basestring</span> = <code title="str, bytes">str, bytes</code>
384 <td width="15%" align="right" valign="top" class="summary">
385 <span class="summary-type"> </span>
386 </td><td class="summary">
387 <a name="_css_javascript_re"></a><span class="summary-name">_css_javascript_re</span> = <code title="re.compile(r'(?is)expression\s*\(.*?\)')">re.compile(r'<code class="re-flags">(?is)</code>expression\s<code class="re-op">*</code>\(.<code class="re-op">*?</code>\)')</code>
391 <td width="15%" align="right" valign="top" class="summary">
392 <span class="summary-type"> </span>
393 </td><td class="summary">
394 <a name="_css_import_re"></a><span class="summary-name">_css_import_re</span> = <code title="re.compile(r'(?i)@\s*import')">re.compile(r'<code class="re-flags">(?i)</code>@\s<code class="re-op">*</code>import')</code>
398 <td width="15%" align="right" valign="top" class="summary">
399 <span class="summary-type"> </span>
400 </td><td class="summary">
401 <a href="lxml.html.clean-module.html#_conditional_comment_re" class="summary-name" onclick="show_private();">_conditional_comment_re</a> = <code title="re.compile(r'(?is)\[if[\s\n\r]+.*?\][\s\n\r]*>')">re.compile(r'<code class="re-flags">(?is)</code>\[if<code class="re-group">[</code>\s\n\r<code class="re-group">]</code><code class="re-op">+</code>.<code class="re-op">*?</code>\]<code class="variable-ellipsis">...</code></code>
405 <td width="15%" align="right" valign="top" class="summary">
406 <span class="summary-type"> </span>
407 </td><td class="summary">
408 <a name="_find_styled_elements"></a><span class="summary-name">_find_styled_elements</span> = <code title="descendant-or-self::*[@style]">descendant-or-self::*[@style]</code>
412 <td width="15%" align="right" valign="top" class="summary">
413 <span class="summary-type"> </span>
414 </td><td class="summary">
415 <a href="lxml.html.clean-module.html#_find_external_links" class="summary-name" onclick="show_private();">_find_external_links</a> = <code title="descendant-or-self::a [normalize-space(@href) and substring(normalize\
416 -space(@href),1,1) != '#'] |descendant-or-self::x:a[normalize-space(@h\
417 ref) and substring(normalize-space(@href),1,1) != '#']">descendant-or-self::a [normalize-space<code class="variable-ellipsis">...</code></code>
421 <td width="15%" align="right" valign="top" class="summary">
422 <span class="summary-type"> </span>
423 </td><td class="summary">
424 <a name="clean"></a><span class="summary-name">clean</span> = <code title="&lt;lxml.html.clean.Cleaner object&gt;">&lt;lxml.html.clean.Cleaner object&gt;</code>
428 <td width="15%" align="right" valign="top" class="summary">
429 <span class="summary-type"> </span>
430 </td><td class="summary">
431 <a href="lxml.html.clean-module.html#_link_regexes" class="summary-name" onclick="show_private();">_link_regexes</a> = <code title="[re.compile(r'(?i)(?P&lt;body&gt;https?://(?P&lt;host&gt;[a-z0-9\._-]+)(?:/[/-_\.,\
432 a-z0-9%&amp;\?;=~]*)?(?:\([/-_\.,a-z0-9%&amp;\?;=~]*\))?)'),
433 re.compile(r'(?i)mailto:(?P&lt;body&gt;[a-z0-9\._-]+@(?P&lt;host&gt;[a-z0-9_\.-]+\
434 [a-z]))')]"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">(?P&lt;</code><code class="re-ref">body</code><code class="re-group">&gt;</code>https<code class="re-op">?</code>://<code class="re-group">(?P&lt;</code><code class="re-ref">host</code><code class="re-group">&gt;</code><code class="re-group">[</code><code class="variable-ellipsis">...</code></code>
438 <td width="15%" align="right" valign="top" class="summary">
439 <span class="summary-type"> </span>
440 </td><td class="summary">
441 <a href="lxml.html.clean-module.html#_avoid_elements" class="summary-name" onclick="show_private();">_avoid_elements</a> = <code title="['textarea', 'pre', 'code', 'head', 'select', 'a']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">head</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">select</code><code class="variable-quote">'</code><code class="variable-ellipsis">...</code></code>
445 <td width="15%" align="right" valign="top" class="summary">
446 <span class="summary-type"> </span>
447 </td><td class="summary">
448 <a href="lxml.html.clean-module.html#_avoid_hosts" class="summary-name" onclick="show_private();">_avoid_hosts</a> = <code title="[re.compile(r'(?i)^localhost'),
449 re.compile(r'(?i)\bexample\.(?:com|org|net)$'),
450 re.compile(r'^127\.0\.0\.1$')]"><code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code>^localhost')<code class="variable-op">, </code>re.compile(r'<code class="re-flags">(?</code><code class="variable-ellipsis">...</code></code>
454 <td width="15%" align="right" valign="top" class="summary">
455 <span class="summary-type"> </span>
456 </td><td class="summary">
457 <a name="_avoid_classes"></a><span class="summary-name">_avoid_classes</span> = <code title="['nolink']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nolink</code><code class="variable-quote">'</code><code class="variable-group">]</code></code>
461 <td width="15%" align="right" valign="top" class="summary">
462 <span class="summary-type"> </span>
463 </td><td class="summary">
464 <a name="_avoid_word_break_elements"></a><span class="summary-name">_avoid_word_break_elements</span> = <code title="['pre', 'textarea', 'code']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-group">]</code></code>
468 <td width="15%" align="right" valign="top" class="summary">
469 <span class="summary-type"> </span>
470 </td><td class="summary">
471 <a name="_avoid_word_break_classes"></a><span class="summary-name">_avoid_word_break_classes</span> = <code title="['nobreak']"><code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">nobreak</code><code class="variable-quote">'</code><code class="variable-group">]</code></code>
475 <td width="15%" align="right" valign="top" class="summary">
476 <span class="summary-type"> </span>
477 </td><td class="summary">
478 <a name="_break_prefer_re"></a><span class="summary-name">_break_prefer_re</span> = <code title="re.compile(r'(?i)[^a-z]')">re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">[</code><code class="re-op">^</code>a<code class="re-op">-</code>z<code class="re-group">]</code>')</code>
482 <td width="15%" align="right" valign="top" class="summary">
483 <span class="summary-type"> </span>
484 </td><td class="summary">
485 <a name="__package__"></a><span class="summary-name">__package__</span> = <code title="None">None</code><br />
490 <td width="15%" align="right" valign="top" class="summary">
491 <span class="summary-type"> </span>
492 </td><td class="summary">
493 <a name="__test__"></a><span class="summary-name">__test__</span> = <code title="{}"><code class="variable-group">{</code><code class="variable-group">}</code></code>
497 <!-- ==================== FUNCTION DETAILS ==================== -->
498 <a name="section-FunctionDetails"></a>
499 <table class="details" border="1" cellpadding="3"
500 cellspacing="0" width="100%" bgcolor="white">
501 <tr bgcolor="#70b0f0" class="table-header">
502 <td colspan="2" class="table-header">
503 <table border="0" cellpadding="0" cellspacing="0" width="100%">
505 <td align="left"><span class="table-header">Function Details</span></td>
506 <td align="right" valign="top"
507 ><span class="options">[<a href="#section-FunctionDetails"
508 class="privatelink" onclick="toggle_private();"
509 >hide private</a>]</span></td>
515 <a name="autolink"></a>
517 <table class="details" border="1" cellpadding="3"
518 cellspacing="0" width="100%" bgcolor="white">
520 <table width="100%" cellpadding="0" cellspacing="0" border="0">
521 <tr valign="top"><td>
522 <h3 class="epydoc"><span class="sig"><span class="sig-name">autolink</span>(<span class="sig-arg">el</span>,
523 <span class="sig-arg">link_regexes</span>=<span class="sig-default">_link_regexes</span>,
524 <span class="sig-arg">avoid_elements</span>=<span class="sig-default">_avoid_elements</span>,
525 <span class="sig-arg">avoid_hosts</span>=<span class="sig-default">_avoid_hosts</span>,
526 <span class="sig-arg">avoid_classes</span>=<span class="sig-default">_avoid_classes</span>)</span>
528 </td><td align="right" valign="top"
529 ><span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink">source code</a></span>
533 <p>Turn any URLs into links.</p>
534 <p>It will search for links identified by the given regular
535 expressions (by default mailto and http(s) links).</p>
536 <p>It won't link text in an element in avoid_elements, or an element
537 with a class in avoid_classes. It won't link to anything with a
538 host that matches one of the regular expressions in avoid_hosts
539 (default localhost and 127.0.0.1).</p>
540 <p>If you pass in an element, the element's tail will not be
541 substituted, only the contents of the element.</p>
546 <a name="autolink_html"></a>
548 <table class="details" border="1" cellpadding="3"
549 cellspacing="0" width="100%" bgcolor="white">
551 <table width="100%" cellpadding="0" cellspacing="0" border="0">
552 <tr valign="top"><td>
553 <h3 class="epydoc"><span class="sig"><span class="sig-name">autolink_html</span>(<span class="sig-arg">html</span>)</span>
555 </td><td align="right" valign="top"
556 ><span class="codelink"><a href="lxml.html.clean-pysrc.html#autolink_html">source code</a></span>
560 <p>Turn any URLs into links.</p>
561 <p>It will search for links identified by the given regular
562 expressions (by default mailto and http(s) links).</p>
563 <p>It won't link text in an element in avoid_elements, or an element
564 with a class in avoid_classes. It won't link to anything with a
565 host that matches one of the regular expressions in avoid_hosts
566 (default localhost and 127.0.0.1).</p>
567 <p>If you pass in an element, the element's tail will not be
568 substituted, only the contents of the element.</p>
573 <a name="word_break"></a>
575 <table class="details" border="1" cellpadding="3"
576 cellspacing="0" width="100%" bgcolor="white">
578 <table width="100%" cellpadding="0" cellspacing="0" border="0">
579 <tr valign="top"><td>
580 <h3 class="epydoc"><span class="sig"><span class="sig-name">word_break</span>(<span class="sig-arg">el</span>,
581 <span class="sig-arg">max_width</span>=<span class="sig-default">40</span>,
582 <span class="sig-arg">avoid_elements</span>=<span class="sig-default">_avoid_word_break_elements</span>,
583 <span class="sig-arg">avoid_classes</span>=<span class="sig-default">_avoid_word_break_classes</span>,
584 <span class="sig-arg">break_character</span>=<span class="sig-default">unichr(0x200b)</span>)</span>
586 </td><td align="right" valign="top"
587 ><span class="codelink"><a href="lxml.html.clean-pysrc.html#word_break">source code</a></span>
591 <p>Breaks any long words found in the body of the text (not attributes).</p>
592 <p>Doesn't affect any of the tags in avoid_elements, by default
593 <tt class="rst-docutils literal">&lt;textarea&gt;</tt> and <tt class="rst-docutils literal">&lt;pre&gt;</tt></p>
594 <p>Breaks words by inserting &amp;#8203;, which is the character reference
595 for the Unicode Zero Width Space character. This generally takes up no space
596 in rendering, but does copy as a space, and in monospace contexts
597 usually takes up space.</p>
598 <p>See <a class="rst-reference external" href="http://www.cs.tut.fi/~jkorpela/html/nobr.html" target="_top">http://www.cs.tut.fi/~jkorpela/html/nobr.html</a> for a discussion</p>
604 <!-- ==================== VARIABLES DETAILS ==================== -->
605 <a name="section-VariablesDetails"></a>
606 <table class="details" border="1" cellpadding="3"
607 cellspacing="0" width="100%" bgcolor="white">
608 <tr bgcolor="#70b0f0" class="table-header">
609 <td colspan="2" class="table-header">
610 <table border="0" cellpadding="0" cellspacing="0" width="100%">
612 <td align="left"><span class="table-header">Variables Details</span></td>
613 <td align="right" valign="top"
614 ><span class="options">[<a href="#section-VariablesDetails"
615 class="privatelink" onclick="toggle_private();"
616 >hide private</a>]</span></td>
622 <a name="_conditional_comment_re"></a>
623 <div class="private">
624 <table class="details" border="1" cellpadding="3"
625 cellspacing="0" width="100%" bgcolor="white">
627 <h3 class="epydoc">_conditional_comment_re</h3>
633 <dd><table><tr><td><pre class="variable">
634 re.compile(r'<code class="re-flags">(?is)</code>\[if<code class="re-group">[</code>\s\n\r<code class="re-group">]</code><code class="re-op">+</code>.<code class="re-op">*?</code>\]<code class="re-group">[</code>\s\n\r<code class="re-group">]</code><code class="re-op">*</code>>')
635 </pre></td></tr></table>
640 <a name="_find_external_links"></a>
641 <div class="private">
642 <table class="details" border="1" cellpadding="3"
643 cellspacing="0" width="100%" bgcolor="white">
645 <h3 class="epydoc">_find_external_links</h3>
651 <dd><table><tr><td><pre class="variable">
652 descendant-or-self::a [normalize-space(@href) and substring(normalize<span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
653 -space(@href),1,1) != '#'] |descendant-or-self::x:a[normalize-space(@h<span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
654 ref) and substring(normalize-space(@href),1,1) != '#']
655 </pre></td></tr></table>
660 <a name="_link_regexes"></a>
661 <div class="private">
662 <table class="details" border="1" cellpadding="3"
663 cellspacing="0" width="100%" bgcolor="white">
665 <h3 class="epydoc">_link_regexes</h3>
671 <dd><table><tr><td><pre class="variable">
672 <code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code><code class="re-group">(?P&lt;</code><code class="re-ref">body</code><code class="re-group">&gt;</code>https<code class="re-op">?</code>://<code class="re-group">(?P&lt;</code><code class="re-ref">host</code><code class="re-group">&gt;</code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9\._-<code class="re-group">]</code><code class="re-op">+</code><code class="re-group">)</code><code class="re-group">(?:</code>/<code class="re-group">[</code>/-_\.,<span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
673 a<code class="re-op">-</code>z0<code class="re-op">-</code>9%&amp;\?;=~<code class="re-group">]</code><code class="re-op">*</code><code class="re-group">)</code><code class="re-op">?</code><code class="re-group">(?:</code>\(<code class="re-group">[</code>/-_\.,a<code class="re-op">-</code>z0<code class="re-op">-</code>9%&amp;\?;=~<code class="re-group">]</code><code class="re-op">*</code>\)<code class="re-group">)</code><code class="re-op">?</code><code class="re-group">)</code>')<code class="variable-op">,</code>
674 re.compile(r'<code class="re-flags">(?i)</code>mailto:<code class="re-group">(?P&lt;</code><code class="re-ref">body</code><code class="re-group">&gt;</code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9\._-<code class="re-group">]</code><code class="re-op">+</code>@<code class="re-group">(?P&lt;</code><code class="re-ref">host</code><code class="re-group">&gt;</code><code class="re-group">[</code>a<code class="re-op">-</code>z0<code class="re-op">-</code>9_\.-<code class="re-group">]</code><code class="re-op">+</code><code class="re-group"></code><span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
675 <code class="re-group">[</code>a<code class="re-op">-</code>z<code class="re-group">]</code><code class="re-group">)</code><code class="re-group">)</code>')<code class="variable-group">]</code>
676 </pre></td></tr></table>
681 <a name="_avoid_elements"></a>
682 <div class="private">
683 <table class="details" border="1" cellpadding="3"
684 cellspacing="0" width="100%" bgcolor="white">
686 <h3 class="epydoc">_avoid_elements</h3>
692 <dd><table><tr><td><pre class="variable">
693 <code class="variable-group">[</code><code class="variable-quote">'</code><code class="variable-string">textarea</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">pre</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">code</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">head</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">select</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">a</code><code class="variable-quote">'</code><code class="variable-group">]</code>
694 </pre></td></tr></table>
699 <a name="_avoid_hosts"></a>
700 <div class="private">
701 <table class="details" border="1" cellpadding="3"
702 cellspacing="0" width="100%" bgcolor="white">
704 <h3 class="epydoc">_avoid_hosts</h3>
710 <dd><table><tr><td><pre class="variable">
711 <code class="variable-group">[</code>re.compile(r'<code class="re-flags">(?i)</code>^localhost')<code class="variable-op">,</code>
712 re.compile(r'<code class="re-flags">(?i)</code>\bexample\.<code class="re-group">(?:</code>com<code class="re-op">|</code>org<code class="re-op">|</code>net<code class="re-group">)</code>$')<code class="variable-op">,</code>
713 re.compile(r'^127\.0\.0\.1$')<code class="variable-group">]</code>
714 </pre></td></tr></table>
720 <!-- ==================== NAVIGATION BAR ==================== -->
721 <table class="navbar" border="0" width="100%" cellpadding="0"
722 bgcolor="#a0c0ff" cellspacing="0">
725 <th> <a
726 href="lxml-module.html">Home</a> </th>
729 <th> <a
730 href="module-tree.html">Trees</a> </th>
733 <th> <a
734 href="identifier-index.html">Indices</a> </th>
737 <th> <a
738 href="help.html">Help</a> </th>
740 <!-- Project homepage -->
741 <th class="navbar" align="right" width="100%">
742 <table border="0" cellpadding="0" cellspacing="0">
743 <tr><th class="navbar" align="center"
744 ><a class="navbar" target="_top" href="/">lxml API</a></th>
748 <table border="0" cellpadding="0" cellspacing="0" width="100%">
750 <td align="left" class="footer">
751 Generated by Epydoc 3.0.1
752 on Wed Jan 29 12:26:21 2020
754 <td align="right" class="footer">
755 <a target="mainFrame" href="http://epydoc.sourceforge.net"
756 >http://epydoc.sourceforge.net</a>
761 <script type="text/javascript">
763 // Private objects are initially displayed (because if
764 // javascript is turned off then we want them to be
765 // visible); but by default, we want to hide them. So hide
766 // them unless we have a cookie that says to show them.