Upload Tizen:Base source
[toolchains/python-lxml.git] / doc / html / api / lxml.html.diff-module.html
1 <?xml version="1.0" encoding="ascii"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3           "DTD/xhtml1-transitional.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5 <head>
6   <title>lxml.html.diff</title>
7   <link rel="stylesheet" href="epydoc.css" type="text/css" />
8   <script type="text/javascript" src="epydoc.js"></script>
9 </head>
10
11 <body bgcolor="white" text="black" link="blue" vlink="#204080"
12       alink="#204080">
13 <!-- ==================== NAVIGATION BAR ==================== -->
14 <table class="navbar" border="0" width="100%" cellpadding="0"
15        bgcolor="#a0c0ff" cellspacing="0">
16   <tr valign="middle">
17   <!-- Home link -->
18       <th>&nbsp;&nbsp;&nbsp;<a
19         href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
20
21   <!-- Tree link -->
22       <th>&nbsp;&nbsp;&nbsp;<a
23         href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
24
25   <!-- Index link -->
26       <th>&nbsp;&nbsp;&nbsp;<a
27         href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
28
29   <!-- Help link -->
30       <th>&nbsp;&nbsp;&nbsp;<a
31         href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
32
33   <!-- Project homepage -->
34       <th class="navbar" align="right" width="100%">
35         <table border="0" cellpadding="0" cellspacing="0">
36           <tr><th class="navbar" align="center"
37             ><a class="navbar" target="_top" href="http://codespeak.net/lxml/">lxml API</a></th>
38           </tr></table></th>
39   </tr>
40 </table>
41 <table width="100%" cellpadding="0" cellspacing="0">
42   <tr valign="top">
43     <td width="100%">
44       <span class="breadcrumbs">
45         <a href="lxml-module.html">Package&nbsp;lxml</a> ::
46         <a href="lxml.html-module.html">Package&nbsp;html</a> ::
47         Module&nbsp;diff
48       </span>
49     </td>
50     <td>
51       <table cellpadding="0" cellspacing="0">
52         <!-- hide/show private -->
53         <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
54     onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
55         <tr><td align="right"><span class="options"
56             >[<a href="frames.html" target="_top">frames</a
57             >]&nbsp;|&nbsp;<a href="lxml.html.diff-module.html"
58             target="_top">no&nbsp;frames</a>]</span></td></tr>
59       </table>
60     </td>
61   </tr>
62 </table>
63 <!-- ==================== MODULE DESCRIPTION ==================== -->
64 <h1 class="epydoc">Module diff</h1><p class="nomargin-top"><span class="codelink"><a href="lxml.html.diff-pysrc.html">source&nbsp;code</a></span></p>
65 <!-- ==================== CLASSES ==================== -->
66 <a name="section-Classes"></a>
67 <table class="summary" border="1" cellpadding="3"
68        cellspacing="0" width="100%" bgcolor="white">
69 <tr bgcolor="#70b0f0" class="table-header">
70   <td colspan="2" class="table-header">
71     <table border="0" cellpadding="0" cellspacing="0" width="100%">
72       <tr valign="top">
73         <td align="left"><span class="table-header">Classes</span></td>
74         <td align="right" valign="top"
75          ><span class="options">[<a href="#section-Classes"
76          class="privatelink" onclick="toggle_private();"
77          >hide private</a>]</span></td>
78       </tr>
79     </table>
80   </td>
81 </tr>
82 <tr class="private">
83     <td width="15%" align="right" valign="top" class="summary">
84       <span class="summary-type">&nbsp;</span>
85     </td><td class="summary">
86         <a href="lxml.html.diff.DEL_START-class.html" class="summary-name" onclick="show_private();">DEL_START</a>
87     </td>
88   </tr>
89 <tr class="private">
90     <td width="15%" align="right" valign="top" class="summary">
91       <span class="summary-type">&nbsp;</span>
92     </td><td class="summary">
93         <a href="lxml.html.diff.DEL_END-class.html" class="summary-name" onclick="show_private();">DEL_END</a>
94     </td>
95   </tr>
96 <tr class="private">
97     <td width="15%" align="right" valign="top" class="summary">
98       <span class="summary-type">&nbsp;</span>
99     </td><td class="summary">
100         <a href="lxml.html.diff.NoDeletes-class.html" class="summary-name" onclick="show_private();">NoDeletes</a><br />
101       Raised when the document no longer contains any pending deletes
102 (DEL_START/DEL_END)
103     </td>
104   </tr>
105 <tr class="private">
106     <td width="15%" align="right" valign="top" class="summary">
107       <span class="summary-type">&nbsp;</span>
108     </td><td class="summary">
109         <a href="lxml.html.diff.token-class.html" class="summary-name" onclick="show_private();">token</a><br />
110       Represents a diffable token, generally a word that is displayed to
111 the user.
112     </td>
113   </tr>
114 <tr class="private">
115     <td width="15%" align="right" valign="top" class="summary">
116       <span class="summary-type">&nbsp;</span>
117     </td><td class="summary">
118         <a href="lxml.html.diff.tag_token-class.html" class="summary-name" onclick="show_private();">tag_token</a><br />
119       Represents a token that is actually a tag.
120     </td>
121   </tr>
122 <tr class="private">
123     <td width="15%" align="right" valign="top" class="summary">
124       <span class="summary-type">&nbsp;</span>
125     </td><td class="summary">
126         <a href="lxml.html.diff.href_token-class.html" class="summary-name" onclick="show_private();">href_token</a><br />
127       Represents the href in an anchor tag.
128     </td>
129   </tr>
130 <tr class="private">
131     <td width="15%" align="right" valign="top" class="summary">
132       <span class="summary-type">&nbsp;</span>
133     </td><td class="summary">
134         <a href="lxml.html.diff.InsensitiveSequenceMatcher-class.html" class="summary-name" onclick="show_private();">InsensitiveSequenceMatcher</a><br />
135       Acts like SequenceMatcher, but tries not to find very small equal
136 blocks amidst large spans of changes
137     </td>
138   </tr>
139 </table>
140 <!-- ==================== FUNCTIONS ==================== -->
141 <a name="section-Functions"></a>
142 <table class="summary" border="1" cellpadding="3"
143        cellspacing="0" width="100%" bgcolor="white">
144 <tr bgcolor="#70b0f0" class="table-header">
145   <td colspan="2" class="table-header">
146     <table border="0" cellpadding="0" cellspacing="0" width="100%">
147       <tr valign="top">
148         <td align="left"><span class="table-header">Functions</span></td>
149         <td align="right" valign="top"
150          ><span class="options">[<a href="#section-Functions"
151          class="privatelink" onclick="toggle_private();"
152          >hide private</a>]</span></td>
153       </tr>
154     </table>
155   </td>
156 </tr>
157 <tr class="private">
158     <td width="15%" align="right" valign="top" class="summary">
159       <span class="summary-type">&nbsp;</span>
160     </td><td class="summary">
161       <table width="100%" cellpadding="0" cellspacing="0" border="0">
162         <tr>
163           <td><span class="summary-sig"><a name="default_markup"></a><span class="summary-sig-name">default_markup</span>(<span class="summary-sig-arg">text</span>,
164         <span class="summary-sig-arg">version</span>)</span></td>
165           <td align="right" valign="top">
166             <span class="codelink"><a href="lxml.html.diff-pysrc.html#default_markup">source&nbsp;code</a></span>
167             
168           </td>
169         </tr>
170       </table>
171       
172     </td>
173   </tr>
174 <tr>
175     <td width="15%" align="right" valign="top" class="summary">
176       <span class="summary-type">&nbsp;</span>
177     </td><td class="summary">
178       <table width="100%" cellpadding="0" cellspacing="0" border="0">
179         <tr>
180           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#html_annotate" class="summary-sig-name">html_annotate</a>(<span class="summary-sig-arg">doclist</span>,
181         <span class="summary-sig-arg">markup</span>=<span class="summary-sig-default">default_markup</span>)</span><br />
182       doclist should be ordered from oldest to newest, like:</td>
183           <td align="right" valign="top">
184             <span class="codelink"><a href="lxml.html.diff-pysrc.html#html_annotate">source&nbsp;code</a></span>
185             
186           </td>
187         </tr>
188       </table>
189       
190     </td>
191   </tr>
192 <tr class="private">
193     <td width="15%" align="right" valign="top" class="summary">
194       <span class="summary-type">&nbsp;</span>
195     </td><td class="summary">
196       <table width="100%" cellpadding="0" cellspacing="0" border="0">
197         <tr>
198           <td><span class="summary-sig"><a name="tokenize_annotated"></a><span class="summary-sig-name">tokenize_annotated</span>(<span class="summary-sig-arg">doc</span>,
199         <span class="summary-sig-arg">annotation</span>)</span><br />
200       Tokenize a document and add an annotation attribute to each token</td>
201           <td align="right" valign="top">
202             <span class="codelink"><a href="lxml.html.diff-pysrc.html#tokenize_annotated">source&nbsp;code</a></span>
203             
204           </td>
205         </tr>
206       </table>
207       
208     </td>
209   </tr>
210 <tr class="private">
211     <td width="15%" align="right" valign="top" class="summary">
212       <span class="summary-type">&nbsp;</span>
213     </td><td class="summary">
214       <table width="100%" cellpadding="0" cellspacing="0" border="0">
215         <tr>
216           <td><span class="summary-sig"><a name="html_annotate_merge_annotations"></a><span class="summary-sig-name">html_annotate_merge_annotations</span>(<span class="summary-sig-arg">tokens_old</span>,
217         <span class="summary-sig-arg">tokens_new</span>)</span><br />
218       Merge the annotations from tokens_old into tokens_new, when the
219 tokens in the new document already existed in the old document.</td>
220           <td align="right" valign="top">
221             <span class="codelink"><a href="lxml.html.diff-pysrc.html#html_annotate_merge_annotations">source&nbsp;code</a></span>
222             
223           </td>
224         </tr>
225       </table>
226       
227     </td>
228   </tr>
229 <tr class="private">
230     <td width="15%" align="right" valign="top" class="summary">
231       <span class="summary-type">&nbsp;</span>
232     </td><td class="summary">
233       <table width="100%" cellpadding="0" cellspacing="0" border="0">
234         <tr>
235           <td><span class="summary-sig"><a name="copy_annotations"></a><span class="summary-sig-name">copy_annotations</span>(<span class="summary-sig-arg">src</span>,
236         <span class="summary-sig-arg">dest</span>)</span><br />
237       Copy annotations from the tokens listed in src to the tokens in dest</td>
238           <td align="right" valign="top">
239             <span class="codelink"><a href="lxml.html.diff-pysrc.html#copy_annotations">source&nbsp;code</a></span>
240             
241           </td>
242         </tr>
243       </table>
244       
245     </td>
246   </tr>
247 <tr class="private">
248     <td width="15%" align="right" valign="top" class="summary">
249       <span class="summary-type">&nbsp;</span>
250     </td><td class="summary">
251       <table width="100%" cellpadding="0" cellspacing="0" border="0">
252         <tr>
253           <td><span class="summary-sig"><a name="compress_tokens"></a><span class="summary-sig-name">compress_tokens</span>(<span class="summary-sig-arg">tokens</span>)</span><br />
254       Combine adjacent tokens when there is no HTML between the tokens,
255 and they share an annotation</td>
256           <td align="right" valign="top">
257             <span class="codelink"><a href="lxml.html.diff-pysrc.html#compress_tokens">source&nbsp;code</a></span>
258             
259           </td>
260         </tr>
261       </table>
262       
263     </td>
264   </tr>
265 <tr class="private">
266     <td width="15%" align="right" valign="top" class="summary">
267       <span class="summary-type">&nbsp;</span>
268     </td><td class="summary">
269       <table width="100%" cellpadding="0" cellspacing="0" border="0">
270         <tr>
271           <td><span class="summary-sig"><a name="compress_merge_back"></a><span class="summary-sig-name">compress_merge_back</span>(<span class="summary-sig-arg">tokens</span>,
272         <span class="summary-sig-arg">tok</span>)</span><br />
273       Merge tok into the last element of tokens (modifying the list of
274 tokens in-place).</td>
275           <td align="right" valign="top">
276             <span class="codelink"><a href="lxml.html.diff-pysrc.html#compress_merge_back">source&nbsp;code</a></span>
277             
278           </td>
279         </tr>
280       </table>
281       
282     </td>
283   </tr>
284 <tr class="private">
285     <td width="15%" align="right" valign="top" class="summary">
286       <span class="summary-type">&nbsp;</span>
287     </td><td class="summary">
288       <table width="100%" cellpadding="0" cellspacing="0" border="0">
289         <tr>
290           <td><span class="summary-sig"><a name="markup_serialize_tokens"></a><span class="summary-sig-name">markup_serialize_tokens</span>(<span class="summary-sig-arg">tokens</span>,
291         <span class="summary-sig-arg">markup_func</span>)</span><br />
292       Serialize the list of tokens into a list of text chunks, calling
293 markup_func around text to add annotations.</td>
294           <td align="right" valign="top">
295             <span class="codelink"><a href="lxml.html.diff-pysrc.html#markup_serialize_tokens">source&nbsp;code</a></span>
296             
297           </td>
298         </tr>
299       </table>
300       
301     </td>
302   </tr>
303 <tr>
304     <td width="15%" align="right" valign="top" class="summary">
305       <span class="summary-type">&nbsp;</span>
306     </td><td class="summary">
307       <table width="100%" cellpadding="0" cellspacing="0" border="0">
308         <tr>
309           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#htmldiff" class="summary-sig-name">htmldiff</a>(<span class="summary-sig-arg">old_html</span>,
310         <span class="summary-sig-arg">new_html</span>)</span><br />
311       Do a diff of the old and new document.</td>
312           <td align="right" valign="top">
313             <span class="codelink"><a href="lxml.html.diff-pysrc.html#htmldiff">source&nbsp;code</a></span>
314             
315           </td>
316         </tr>
317       </table>
318       
319     </td>
320   </tr>
321 <tr class="private">
322     <td width="15%" align="right" valign="top" class="summary">
323       <span class="summary-type">&nbsp;</span>
324     </td><td class="summary">
325       <table width="100%" cellpadding="0" cellspacing="0" border="0">
326         <tr>
327           <td><span class="summary-sig"><a name="htmldiff_tokens"></a><span class="summary-sig-name">htmldiff_tokens</span>(<span class="summary-sig-arg">html1_tokens</span>,
328         <span class="summary-sig-arg">html2_tokens</span>)</span><br />
329       Does a diff on the tokens themselves, returning a list of text
330 chunks (not tokens).</td>
331           <td align="right" valign="top">
332             <span class="codelink"><a href="lxml.html.diff-pysrc.html#htmldiff_tokens">source&nbsp;code</a></span>
333             
334           </td>
335         </tr>
336       </table>
337       
338     </td>
339   </tr>
340 <tr class="private">
341     <td width="15%" align="right" valign="top" class="summary">
342       <span class="summary-type">&nbsp;</span>
343     </td><td class="summary">
344       <table width="100%" cellpadding="0" cellspacing="0" border="0">
345         <tr>
346           <td><span class="summary-sig"><a name="expand_tokens"></a><span class="summary-sig-name">expand_tokens</span>(<span class="summary-sig-arg">tokens</span>,
347         <span class="summary-sig-arg">equal</span>=<span class="summary-sig-default">False</span>)</span><br />
348       Given a list of tokens, return a generator of the chunks of
349 text for the data in the tokens.</td>
350           <td align="right" valign="top">
351             <span class="codelink"><a href="lxml.html.diff-pysrc.html#expand_tokens">source&nbsp;code</a></span>
352             
353           </td>
354         </tr>
355       </table>
356       
357     </td>
358   </tr>
359 <tr class="private">
360     <td width="15%" align="right" valign="top" class="summary">
361       <span class="summary-type">&nbsp;</span>
362     </td><td class="summary">
363       <table width="100%" cellpadding="0" cellspacing="0" border="0">
364         <tr>
365           <td><span class="summary-sig"><a name="merge_insert"></a><span class="summary-sig-name">merge_insert</span>(<span class="summary-sig-arg">ins_chunks</span>,
366         <span class="summary-sig-arg">doc</span>)</span><br />
367       doc is the already-handled document (as a list of text chunks);
368 here we add &lt;ins&gt;ins_chunks&lt;/ins&gt; to the end of that.</td>
369           <td align="right" valign="top">
370             <span class="codelink"><a href="lxml.html.diff-pysrc.html#merge_insert">source&nbsp;code</a></span>
371             
372           </td>
373         </tr>
374       </table>
375       
376     </td>
377   </tr>
378 <tr class="private">
379     <td width="15%" align="right" valign="top" class="summary">
380       <span class="summary-type">&nbsp;</span>
381     </td><td class="summary">
382       <table width="100%" cellpadding="0" cellspacing="0" border="0">
383         <tr>
384           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#merge_delete" class="summary-sig-name" onclick="show_private();">merge_delete</a>(<span class="summary-sig-arg">del_chunks</span>,
385         <span class="summary-sig-arg">doc</span>)</span><br />
386       Adds the text chunks in del_chunks to the document doc (another
387 list of text chunks) with marker to show it is a delete.</td>
388           <td align="right" valign="top">
389             <span class="codelink"><a href="lxml.html.diff-pysrc.html#merge_delete">source&nbsp;code</a></span>
390             
391           </td>
392         </tr>
393       </table>
394       
395     </td>
396   </tr>
397 <tr class="private">
398     <td width="15%" align="right" valign="top" class="summary">
399       <span class="summary-type">&nbsp;</span>
400     </td><td class="summary">
401       <table width="100%" cellpadding="0" cellspacing="0" border="0">
402         <tr>
403           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#cleanup_delete" class="summary-sig-name" onclick="show_private();">cleanup_delete</a>(<span class="summary-sig-arg">chunks</span>)</span><br />
404       Cleans up any DEL_START/DEL_END markers in the document, replacing
405 them with &lt;del&gt;&lt;/del&gt;.</td>
406           <td align="right" valign="top">
407             <span class="codelink"><a href="lxml.html.diff-pysrc.html#cleanup_delete">source&nbsp;code</a></span>
408             
409           </td>
410         </tr>
411       </table>
412       
413     </td>
414   </tr>
415 <tr class="private">
416     <td width="15%" align="right" valign="top" class="summary">
417       <span class="summary-type">&nbsp;</span>
418     </td><td class="summary">
419       <table width="100%" cellpadding="0" cellspacing="0" border="0">
420         <tr>
421           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#split_unbalanced" class="summary-sig-name" onclick="show_private();">split_unbalanced</a>(<span class="summary-sig-arg">chunks</span>)</span><br />
422       Return (unbalanced_start, balanced, unbalanced_end), where each is
423 a list of text and tag chunks.</td>
424           <td align="right" valign="top">
425             <span class="codelink"><a href="lxml.html.diff-pysrc.html#split_unbalanced">source&nbsp;code</a></span>
426             
427           </td>
428         </tr>
429       </table>
430       
431     </td>
432   </tr>
433 <tr class="private">
434     <td width="15%" align="right" valign="top" class="summary">
435       <span class="summary-type">&nbsp;</span>
436     </td><td class="summary">
437       <table width="100%" cellpadding="0" cellspacing="0" border="0">
438         <tr>
439           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#split_delete" class="summary-sig-name" onclick="show_private();">split_delete</a>(<span class="summary-sig-arg">chunks</span>)</span><br />
440       Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
441 stuff_after_DEL_END).</td>
442           <td align="right" valign="top">
443             <span class="codelink"><a href="lxml.html.diff-pysrc.html#split_delete">source&nbsp;code</a></span>
444             
445           </td>
446         </tr>
447       </table>
448       
449     </td>
450   </tr>
451 <tr class="private">
452     <td width="15%" align="right" valign="top" class="summary">
453       <span class="summary-type">&nbsp;</span>
454     </td><td class="summary">
455       <table width="100%" cellpadding="0" cellspacing="0" border="0">
456         <tr>
457           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#locate_unbalanced_start" class="summary-sig-name" onclick="show_private();">locate_unbalanced_start</a>(<span class="summary-sig-arg">unbalanced_start</span>,
458         <span class="summary-sig-arg">pre_delete</span>,
459         <span class="summary-sig-arg">post_delete</span>)</span><br />
460       pre_delete and post_delete implicitly point to a place in the
461 document (where the two were split).</td>
462           <td align="right" valign="top">
463             <span class="codelink"><a href="lxml.html.diff-pysrc.html#locate_unbalanced_start">source&nbsp;code</a></span>
464             
465           </td>
466         </tr>
467       </table>
468       
469     </td>
470   </tr>
471 <tr class="private">
472     <td width="15%" align="right" valign="top" class="summary">
473       <span class="summary-type">&nbsp;</span>
474     </td><td class="summary">
475       <table width="100%" cellpadding="0" cellspacing="0" border="0">
476         <tr>
477           <td><span class="summary-sig"><a name="locate_unbalanced_end"></a><span class="summary-sig-name">locate_unbalanced_end</span>(<span class="summary-sig-arg">unbalanced_end</span>,
478         <span class="summary-sig-arg">pre_delete</span>,
479         <span class="summary-sig-arg">post_delete</span>)</span><br />
480       like locate_unbalanced_start, except handling end tags and
481 possibly moving the point earlier in the document.</td>
482           <td align="right" valign="top">
483             <span class="codelink"><a href="lxml.html.diff-pysrc.html#locate_unbalanced_end">source&nbsp;code</a></span>
484             
485           </td>
486         </tr>
487       </table>
488       
489     </td>
490   </tr>
491 <tr class="private">
492     <td width="15%" align="right" valign="top" class="summary">
493       <span class="summary-type">&nbsp;</span>
494     </td><td class="summary">
495       <table width="100%" cellpadding="0" cellspacing="0" border="0">
496         <tr>
497           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#tokenize" class="summary-sig-name" onclick="show_private();">tokenize</a>(<span class="summary-sig-arg">html</span>,
498         <span class="summary-sig-arg">include_hrefs</span>=<span class="summary-sig-default">True</span>)</span><br />
499       Parses the given HTML and returns token objects (words with attached tags).</td>
500           <td align="right" valign="top">
501             <span class="codelink"><a href="lxml.html.diff-pysrc.html#tokenize">source&nbsp;code</a></span>
502             
503           </td>
504         </tr>
505       </table>
506       
507     </td>
508   </tr>
509 <tr class="private">
510     <td width="15%" align="right" valign="top" class="summary">
511       <span class="summary-type">&nbsp;</span>
512     </td><td class="summary">
513       <table width="100%" cellpadding="0" cellspacing="0" border="0">
514         <tr>
515           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#parse_html" class="summary-sig-name" onclick="show_private();">parse_html</a>(<span class="summary-sig-arg">html</span>,
516         <span class="summary-sig-arg">cleanup</span>=<span class="summary-sig-default">True</span>)</span><br />
517       Parses an HTML fragment, returning an lxml element.</td>
518           <td align="right" valign="top">
519             <span class="codelink"><a href="lxml.html.diff-pysrc.html#parse_html">source&nbsp;code</a></span>
520             
521           </td>
522         </tr>
523       </table>
524       
525     </td>
526   </tr>
527 <tr class="private">
528     <td width="15%" align="right" valign="top" class="summary">
529       <span class="summary-type">&nbsp;</span>
530     </td><td class="summary">
531       <table width="100%" cellpadding="0" cellspacing="0" border="0">
532         <tr>
533           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#cleanup_html" class="summary-sig-name" onclick="show_private();">cleanup_html</a>(<span class="summary-sig-arg">html</span>)</span><br />
534       This 'cleans' the HTML, meaning that any page structure is removed
535 (only the contents of &lt;body&gt; are used, if there is any &lt;body&gt;).</td>
536           <td align="right" valign="top">
537             <span class="codelink"><a href="lxml.html.diff-pysrc.html#cleanup_html">source&nbsp;code</a></span>
538             
539           </td>
540         </tr>
541       </table>
542       
543     </td>
544   </tr>
545 <tr class="private">
546     <td width="15%" align="right" valign="top" class="summary">
547       <span class="summary-type">&nbsp;</span>
548     </td><td class="summary">
549       <table width="100%" cellpadding="0" cellspacing="0" border="0">
550         <tr>
551           <td><span class="summary-sig"><a name="fixup_chunks"></a><span class="summary-sig-name">fixup_chunks</span>(<span class="summary-sig-arg">chunks</span>)</span><br />
552       This function takes a list of chunks and produces a list of tokens.</td>
553           <td align="right" valign="top">
554             <span class="codelink"><a href="lxml.html.diff-pysrc.html#fixup_chunks">source&nbsp;code</a></span>
555             
556           </td>
557         </tr>
558       </table>
559       
560     </td>
561   </tr>
562 <tr class="private">
563     <td width="15%" align="right" valign="top" class="summary">
564       <span class="summary-type">&nbsp;</span>
565     </td><td class="summary">
566       <table width="100%" cellpadding="0" cellspacing="0" border="0">
567         <tr>
568           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#flatten_el" class="summary-sig-name" onclick="show_private();">flatten_el</a>(<span class="summary-sig-arg">el</span>,
569         <span class="summary-sig-arg">include_hrefs</span>,
570         <span class="summary-sig-arg">skip_tag</span>=<span class="summary-sig-default">False</span>)</span><br />
571       Takes an lxml element el, and generates all the text chunks for
572 that tag.</td>
573           <td align="right" valign="top">
574             <span class="codelink"><a href="lxml.html.diff-pysrc.html#flatten_el">source&nbsp;code</a></span>
575             
576           </td>
577         </tr>
578       </table>
579       
580     </td>
581   </tr>
582 <tr class="private">
583     <td width="15%" align="right" valign="top" class="summary">
584       <span class="summary-type">&nbsp;</span>
585     </td><td class="summary">
586       <table width="100%" cellpadding="0" cellspacing="0" border="0">
587         <tr>
588           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#split_words" class="summary-sig-name" onclick="show_private();">split_words</a>(<span class="summary-sig-arg">text</span>)</span><br />
589       Splits some text into words.</td>
590           <td align="right" valign="top">
591             <span class="codelink"><a href="lxml.html.diff-pysrc.html#split_words">source&nbsp;code</a></span>
592             
593           </td>
594         </tr>
595       </table>
596       
597     </td>
598   </tr>
599 <tr class="private">
600     <td width="15%" align="right" valign="top" class="summary">
601       <span class="summary-type">&nbsp;</span>
602     </td><td class="summary">
603       <table width="100%" cellpadding="0" cellspacing="0" border="0">
604         <tr>
605           <td><span class="summary-sig"><a name="start_tag"></a><span class="summary-sig-name">start_tag</span>(<span class="summary-sig-arg">el</span>)</span><br />
606       The text representation of the start tag for a tag.</td>
607           <td align="right" valign="top">
608             <span class="codelink"><a href="lxml.html.diff-pysrc.html#start_tag">source&nbsp;code</a></span>
609             
610           </td>
611         </tr>
612       </table>
613       
614     </td>
615   </tr>
616 <tr class="private">
617     <td width="15%" align="right" valign="top" class="summary">
618       <span class="summary-type">&nbsp;</span>
619     </td><td class="summary">
620       <table width="100%" cellpadding="0" cellspacing="0" border="0">
621         <tr>
622           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#end_tag" class="summary-sig-name" onclick="show_private();">end_tag</a>(<span class="summary-sig-arg">el</span>)</span><br />
623       The text representation of an end tag for a tag.</td>
624           <td align="right" valign="top">
625             <span class="codelink"><a href="lxml.html.diff-pysrc.html#end_tag">source&nbsp;code</a></span>
626             
627           </td>
628         </tr>
629       </table>
630       
631     </td>
632   </tr>
633 <tr class="private">
634     <td width="15%" align="right" valign="top" class="summary">
635       <span class="summary-type">&nbsp;</span>
636     </td><td class="summary">
637       <table width="100%" cellpadding="0" cellspacing="0" border="0">
638         <tr>
639           <td><span class="summary-sig"><a name="is_word"></a><span class="summary-sig-name">is_word</span>(<span class="summary-sig-arg">tok</span>)</span></td>
640           <td align="right" valign="top">
641             <span class="codelink"><a href="lxml.html.diff-pysrc.html#is_word">source&nbsp;code</a></span>
642             
643           </td>
644         </tr>
645       </table>
646       
647     </td>
648   </tr>
649 <tr class="private">
650     <td width="15%" align="right" valign="top" class="summary">
651       <span class="summary-type">&nbsp;</span>
652     </td><td class="summary">
653       <table width="100%" cellpadding="0" cellspacing="0" border="0">
654         <tr>
655           <td><span class="summary-sig"><a name="is_end_tag"></a><span class="summary-sig-name">is_end_tag</span>(<span class="summary-sig-arg">tok</span>)</span></td>
656           <td align="right" valign="top">
657             <span class="codelink"><a href="lxml.html.diff-pysrc.html#is_end_tag">source&nbsp;code</a></span>
658             
659           </td>
660         </tr>
661       </table>
662       
663     </td>
664   </tr>
665 <tr class="private">
666     <td width="15%" align="right" valign="top" class="summary">
667       <span class="summary-type">&nbsp;</span>
668     </td><td class="summary">
669       <table width="100%" cellpadding="0" cellspacing="0" border="0">
670         <tr>
671           <td><span class="summary-sig"><a name="is_start_tag"></a><span class="summary-sig-name">is_start_tag</span>(<span class="summary-sig-arg">tok</span>)</span></td>
672           <td align="right" valign="top">
673             <span class="codelink"><a href="lxml.html.diff-pysrc.html#is_start_tag">source&nbsp;code</a></span>
674             
675           </td>
676         </tr>
677       </table>
678       
679     </td>
680   </tr>
681 <tr class="private">
682     <td width="15%" align="right" valign="top" class="summary">
683       <span class="summary-type">&nbsp;</span>
684     </td><td class="summary">
685       <table width="100%" cellpadding="0" cellspacing="0" border="0">
686         <tr>
687           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#fixup_ins_del_tags" class="summary-sig-name" onclick="show_private();">fixup_ins_del_tags</a>(<span class="summary-sig-arg">html</span>)</span><br />
688       Given an html string, move any &lt;ins&gt; or &lt;del&gt; tags inside of any
689 block-level elements, e.g.</td>
690           <td align="right" valign="top">
691             <span class="codelink"><a href="lxml.html.diff-pysrc.html#fixup_ins_del_tags">source&nbsp;code</a></span>
692             
693           </td>
694         </tr>
695       </table>
696       
697     </td>
698   </tr>
699 <tr class="private">
700     <td width="15%" align="right" valign="top" class="summary">
701       <span class="summary-type">&nbsp;</span>
702     </td><td class="summary">
703       <table width="100%" cellpadding="0" cellspacing="0" border="0">
704         <tr>
705           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#serialize_html_fragment" class="summary-sig-name" onclick="show_private();">serialize_html_fragment</a>(<span class="summary-sig-arg">el</span>,
706         <span class="summary-sig-arg">skip_outer</span>=<span class="summary-sig-default">False</span>)</span><br />
707       Serialize a single lxml element as HTML.</td>
708           <td align="right" valign="top">
709             <span class="codelink"><a href="lxml.html.diff-pysrc.html#serialize_html_fragment">source&nbsp;code</a></span>
710             
711           </td>
712         </tr>
713       </table>
714       
715     </td>
716   </tr>
717 <tr class="private">
718     <td width="15%" align="right" valign="top" class="summary">
719       <span class="summary-type">&nbsp;</span>
720     </td><td class="summary">
721       <table width="100%" cellpadding="0" cellspacing="0" border="0">
722         <tr>
723           <td><span class="summary-sig"><a name="_fixup_ins_del_tags"></a><span class="summary-sig-name">_fixup_ins_del_tags</span>(<span class="summary-sig-arg">doc</span>)</span><br />
724       fixup_ins_del_tags that works on an lxml document in-place</td>
725           <td align="right" valign="top">
726             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_fixup_ins_del_tags">source&nbsp;code</a></span>
727             
728           </td>
729         </tr>
730       </table>
731       
732     </td>
733   </tr>
734 <tr class="private">
735     <td width="15%" align="right" valign="top" class="summary">
736       <span class="summary-type">&nbsp;</span>
737     </td><td class="summary">
738       <table width="100%" cellpadding="0" cellspacing="0" border="0">
739         <tr>
740           <td><span class="summary-sig"><a name="_contains_block_level_tag"></a><span class="summary-sig-name">_contains_block_level_tag</span>(<span class="summary-sig-arg">el</span>)</span><br />
741       True if the element contains any block-level elements, like &lt;p&gt;, &lt;td&gt;, etc.</td>
742           <td align="right" valign="top">
743             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_contains_block_level_tag">source&nbsp;code</a></span>
744             
745           </td>
746         </tr>
747       </table>
748       
749     </td>
750   </tr>
751 <tr class="private">
752     <td width="15%" align="right" valign="top" class="summary">
753       <span class="summary-type">&nbsp;</span>
754     </td><td class="summary">
755       <table width="100%" cellpadding="0" cellspacing="0" border="0">
756         <tr>
757           <td><span class="summary-sig"><a name="_move_el_inside_block"></a><span class="summary-sig-name">_move_el_inside_block</span>(<span class="summary-sig-arg">el</span>,
758         <span class="summary-sig-arg">tag</span>)</span><br />
759       helper for _fixup_ins_del_tags; actually takes the &lt;ins&gt; etc tags
760 and moves them inside any block-level tags.</td>
761           <td align="right" valign="top">
762             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_move_el_inside_block">source&nbsp;code</a></span>
763             
764           </td>
765         </tr>
766       </table>
767       
768     </td>
769   </tr>
770 <tr class="private">
771     <td width="15%" align="right" valign="top" class="summary">
772       <span class="summary-type">&nbsp;</span>
773     </td><td class="summary">
774       <table width="100%" cellpadding="0" cellspacing="0" border="0">
775         <tr>
776           <td><span class="summary-sig"><a name="_merge_element_contents"></a><span class="summary-sig-name">_merge_element_contents</span>(<span class="summary-sig-arg">el</span>)</span><br />
777       Removes an element, but merges its contents into its place, e.g.,
778 given &lt;p&gt;Hi &lt;i&gt;there!&lt;/i&gt;&lt;/p&gt;, if you remove the &lt;i&gt; element you get
779 &lt;p&gt;Hi there!&lt;/p&gt;</td>
780           <td align="right" valign="top">
781             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_merge_element_contents">source&nbsp;code</a></span>
782             
783           </td>
784         </tr>
785       </table>
786       
787     </td>
788   </tr>
789 </table>
790 <!-- ==================== VARIABLES ==================== -->
791 <a name="section-Variables"></a>
792 <table class="summary" border="1" cellpadding="3"
793        cellspacing="0" width="100%" bgcolor="white">
794 <tr bgcolor="#70b0f0" class="table-header">
795   <td colspan="2" class="table-header">
796     <table border="0" cellpadding="0" cellspacing="0" width="100%">
797       <tr valign="top">
798         <td align="left"><span class="table-header">Variables</span></td>
799         <td align="right" valign="top"
800          ><span class="options">[<a href="#section-Variables"
801          class="privatelink" onclick="toggle_private();"
802          >hide private</a>]</span></td>
803       </tr>
804     </table>
805   </td>
806 </tr>
807 <tr class="private">
808     <td width="15%" align="right" valign="top" class="summary">
809       <span class="summary-type">&nbsp;</span>
810     </td><td class="summary">
811         <a name="_body_re"></a><span class="summary-name">_body_re</span> = <code title="re.compile(r'(?is)&lt;body.*?&gt;')">re.compile(r'<code class="re-flags">(?is)</code>&lt;body.<code class="re-op">*?</code>&gt;')</code>
812     </td>
813   </tr>
814 <tr class="private">
815     <td width="15%" align="right" valign="top" class="summary">
816       <span class="summary-type">&nbsp;</span>
817     </td><td class="summary">
818         <a name="_end_body_re"></a><span class="summary-name">_end_body_re</span> = <code title="re.compile(r'(?is)&lt;/body.*?&gt;')">re.compile(r'<code class="re-flags">(?is)</code>&lt;/body.<code class="re-op">*?</code>&gt;')</code>
819     </td>
820   </tr>
821 <tr class="private">
822     <td width="15%" align="right" valign="top" class="summary">
823       <span class="summary-type">&nbsp;</span>
824     </td><td class="summary">
825         <a name="_ins_del_re"></a><span class="summary-name">_ins_del_re</span> = <code title="re.compile(r'(?is)&lt;/?(ins|del).*?&gt;')">re.compile(r'<code class="re-flags">(?is)</code>&lt;/<code class="re-op">?</code><code class="re-group">(</code>ins<code class="re-op">|</code>del<code class="re-group">)</code>.<code class="re-op">*?</code>&gt;')</code>
826     </td>
827   </tr>
828 <tr class="private">
829     <td width="15%" align="right" valign="top" class="summary">
830       <span class="summary-type">&nbsp;</span>
831     </td><td class="summary">
832         <a name="end_whitespace_re"></a><span class="summary-name">end_whitespace_re</span> = <code title="re.compile(r'[ \t\n\r]$')">re.compile(r'<code class="re-group">[</code> \t\n\r<code class="re-group">]</code>$')</code>
833     </td>
834   </tr>
835 <tr class="private">
836     <td width="15%" align="right" valign="top" class="summary">
837       <span class="summary-type">&nbsp;</span>
838     </td><td class="summary">
839         <a href="lxml.html.diff-module.html#empty_tags" class="summary-name" onclick="show_private();">empty_tags</a> = <code title="('param',
840  'img',
841  'area',
842  'br',
843  'basefont',
844  'input',
845  'base',
846  'meta',
847 ..."><code class="variable-group">(</code><code class="variable-quote">'</code><code class="variable-string">param</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">img</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">area</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">br</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">basefont</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">input</code><code class="variable-ellipsis">...</code></code>
848     </td>
849   </tr>
850 <tr class="private">
851     <td width="15%" align="right" valign="top" class="summary">
852       <span class="summary-type">&nbsp;</span>
853     </td><td class="summary">
854         <a href="lxml.html.diff-module.html#block_level_tags" class="summary-name" onclick="show_private();">block_level_tags</a> = <code title="('address',
855  'blockquote',
856  'center',
857  'dir',
858  'div',
859  'dl',
860  'fieldset',
861  'form',
862 ..."><code class="variable-group">(</code><code class="variable-quote">'</code><code class="variable-string">address</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">blockquote</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">center</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">dir</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-ellipsis">...</code></code>
863     </td>
864   </tr>
865 <tr class="private">
866     <td width="15%" align="right" valign="top" class="summary">
867       <span class="summary-type">&nbsp;</span>
868     </td><td class="summary">
869         <a href="lxml.html.diff-module.html#block_level_container_tags" class="summary-name" onclick="show_private();">block_level_container_tags</a> = <code title="('dd',
870  'dt',
871  'frameset',
872  'li',
873  'tbody',
874  'td',
875  'tfoot',
876  'th',
877 ..."><code class="variable-group">(</code><code class="variable-quote">'</code><code class="variable-string">dd</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">dt</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">frameset</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">li</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">'</code><code class="variable-string">t</code><code class="variable-ellipsis">...</code></code>
878     </td>
879   </tr>
880 <tr class="private">
881     <td width="15%" align="right" valign="top" class="summary">
882       <span class="summary-type">&nbsp;</span>
883     </td><td class="summary">
884         <a name="start_whitespace_re"></a><span class="summary-name">start_whitespace_re</span> = <code title="re.compile(r'^[ \t\n\r]')">re.compile(r'^<code class="re-group">[</code> \t\n\r<code class="re-group">]</code>')</code>
885     </td>
886   </tr>
887 </table>
888 <!-- ==================== FUNCTION DETAILS ==================== -->
889 <a name="section-FunctionDetails"></a>
890 <table class="details" border="1" cellpadding="3"
891        cellspacing="0" width="100%" bgcolor="white">
892 <tr bgcolor="#70b0f0" class="table-header">
893   <td colspan="2" class="table-header">
894     <table border="0" cellpadding="0" cellspacing="0" width="100%">
895       <tr valign="top">
896         <td align="left"><span class="table-header">Function Details</span></td>
897         <td align="right" valign="top"
898          ><span class="options">[<a href="#section-FunctionDetails"
899          class="privatelink" onclick="toggle_private();"
900          >hide private</a>]</span></td>
901       </tr>
902     </table>
903   </td>
904 </tr>
905 </table>
906 <a name="html_annotate"></a>
907 <div>
908 <table class="details" border="1" cellpadding="3"
909        cellspacing="0" width="100%" bgcolor="white">
910 <tr><td>
911   <table width="100%" cellpadding="0" cellspacing="0" border="0">
912   <tr valign="top"><td>
913   <h3 class="epydoc"><span class="sig"><span class="sig-name">html_annotate</span>(<span class="sig-arg">doclist</span>,
914         <span class="sig-arg">markup</span>=<span class="sig-default">&lt;function default_markup at 0x9800d4c&gt;</span>)</span>
915   </h3>
916   </td><td align="right" valign="top"
917     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#html_annotate">source&nbsp;code</a></span>&nbsp;
918     </td>
919   </tr></table>
920   
921   <p>doclist should be ordered from oldest to newest, like:</p>
922 <pre class="rst-literal-block">
923 &gt;&gt;&gt; version1 = 'Hello World'
924 &gt;&gt;&gt; version2 = 'Goodbye World'
925 &gt;&gt;&gt; print(html_annotate([(version1, 'version 1'),
926 ...                      (version2, 'version 2')]))
927 &lt;span title=&quot;version 2&quot;&gt;Goodbye&lt;/span&gt; &lt;span title=&quot;version 1&quot;&gt;World&lt;/span&gt;
928 </pre>
929 <p>The documents must be <em>fragments</em> (str/UTF8 or unicode), not
930 complete documents.</p>
931 <p>The markup argument is a function to markup the spans of words.
932 This function is called like markup('Hello', 'version 2'), and
933 returns HTML.  The first argument is text and never includes any
934 markup.  The default uses a span with a title:</p>
935 <blockquote>
936 <pre class="py-doctest">
937 <span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">print</span>(default_markup(<span class="py-string">'Some Text'</span>, <span class="py-string">'by Joe'</span>))
938 <span class="py-output">&lt;span title=&quot;by Joe&quot;&gt;Some Text&lt;/span&gt;</span></pre>
939 </blockquote>
940   <dl class="fields">
941   </dl>
942 </td></tr></table>
943 </div>
944 <a name="htmldiff"></a>
945 <div>
946 <table class="details" border="1" cellpadding="3"
947        cellspacing="0" width="100%" bgcolor="white">
948 <tr><td>
949   <table width="100%" cellpadding="0" cellspacing="0" border="0">
950   <tr valign="top"><td>
951   <h3 class="epydoc"><span class="sig"><span class="sig-name">htmldiff</span>(<span class="sig-arg">old_html</span>,
952         <span class="sig-arg">new_html</span>)</span>
953   </h3>
954   </td><td align="right" valign="top"
955     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#htmldiff">source&nbsp;code</a></span>&nbsp;
956     </td>
957   </tr></table>
958   
959   <p>Do a diff of the old and new document.  The documents are HTML
960 <em>fragments</em> (str/UTF8 or unicode); they are not complete documents
961 (i.e., no &lt;html&gt; tag).</p>
962 <p>Returns HTML with &lt;ins&gt; and &lt;del&gt; tags added around the
963 appropriate text.</p>
964 <p>Markup is generally ignored, with the markup from new_html
965 preserved, and possibly some markup from old_html (though it is
966 considered acceptable to lose some of the old markup).  Only the
967 words in the HTML are diffed.  The exception is &lt;img&gt; tags, which
968 are treated like words, and the href attribute of &lt;a&gt; tags, which
969 are noted inside the tag itself when there are changes.</p>
970   <dl class="fields">
971   </dl>
972 </td></tr></table>
973 </div>
974 <a name="merge_delete"></a>
975 <div class="private">
976 <table class="details" border="1" cellpadding="3"
977        cellspacing="0" width="100%" bgcolor="white">
978 <tr><td>
979   <table width="100%" cellpadding="0" cellspacing="0" border="0">
980   <tr valign="top"><td>
981   <h3 class="epydoc"><span class="sig"><span class="sig-name">merge_delete</span>(<span class="sig-arg">del_chunks</span>,
982         <span class="sig-arg">doc</span>)</span>
983   </h3>
984   </td><td align="right" valign="top"
985     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#merge_delete">source&nbsp;code</a></span>&nbsp;
986     </td>
987   </tr></table>
988   
989   Adds the text chunks in del_chunks to the document doc (another
990 list of text chunks) with a marker to show it is a delete.
991 cleanup_delete later resolves these markers into &lt;del&gt; tags.
992   <dl class="fields">
993   </dl>
994 </td></tr></table>
995 </div>
996 <a name="cleanup_delete"></a>
997 <div class="private">
998 <table class="details" border="1" cellpadding="3"
999        cellspacing="0" width="100%" bgcolor="white">
1000 <tr><td>
1001   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1002   <tr valign="top"><td>
1003   <h3 class="epydoc"><span class="sig"><span class="sig-name">cleanup_delete</span>(<span class="sig-arg">chunks</span>)</span>
1004   </h3>
1005   </td><td align="right" valign="top"
1006     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#cleanup_delete">source&nbsp;code</a></span>&nbsp;
1007     </td>
1008   </tr></table>
1009   
1010   <p>Cleans up any DEL_START/DEL_END markers in the document, replacing
1011 them with &lt;del&gt;&lt;/del&gt;.  To do this while keeping the document
1012 valid, it may need to drop some tags (either start or end tags).</p>
1013 <p>It may also move the del into adjacent tags to try to move it to a
1014 similar location where it was originally located (e.g., moving a
1015 delete into the preceding &lt;div&gt; tag, if the del looks like (DEL_START,
1016 'Text&lt;/div&gt;', DEL_END)).</p>
1017   <dl class="fields">
1018   </dl>
1019 </td></tr></table>
1020 </div>
1021 <a name="split_unbalanced"></a>
1022 <div class="private">
1023 <table class="details" border="1" cellpadding="3"
1024        cellspacing="0" width="100%" bgcolor="white">
1025 <tr><td>
1026   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1027   <tr valign="top"><td>
1028   <h3 class="epydoc"><span class="sig"><span class="sig-name">split_unbalanced</span>(<span class="sig-arg">chunks</span>)</span>
1029   </h3>
1030   </td><td align="right" valign="top"
1031     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#split_unbalanced">source&nbsp;code</a></span>&nbsp;
1032     </td>
1033   </tr></table>
1034   
1035   <p>Return (unbalanced_start, balanced, unbalanced_end), where each is
1036 a list of text and tag chunks.</p>
1037 <p>unbalanced_start is a list of all the tags that are opened, but
1038 not closed in this span.  Similarly, unbalanced_end is a list of
1039 tags that are closed but were not opened.  Extracting these might
1040 mean some reordering of the chunks.</p>
1041   <dl class="fields">
1042   </dl>
1043 </td></tr></table>
1044 </div>
1045 <a name="split_delete"></a>
1046 <div class="private">
1047 <table class="details" border="1" cellpadding="3"
1048        cellspacing="0" width="100%" bgcolor="white">
1049 <tr><td>
1050   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1051   <tr valign="top"><td>
1052   <h3 class="epydoc"><span class="sig"><span class="sig-name">split_delete</span>(<span class="sig-arg">chunks</span>)</span>
1053   </h3>
1054   </td><td align="right" valign="top"
1055     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#split_delete">source&nbsp;code</a></span>&nbsp;
1056     </td>
1057   </tr></table>
1058   
1059   Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
1060 stuff_after_DEL_END).  Returns the first case found (there may be
1061 more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
1062 there's no DEL_START found.
1063   <dl class="fields">
1064   </dl>
1065 </td></tr></table>
1066 </div>
1067 <a name="locate_unbalanced_start"></a>
1068 <div class="private">
1069 <table class="details" border="1" cellpadding="3"
1070        cellspacing="0" width="100%" bgcolor="white">
1071 <tr><td>
1072   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1073   <tr valign="top"><td>
1074   <h3 class="epydoc"><span class="sig"><span class="sig-name">locate_unbalanced_start</span>(<span class="sig-arg">unbalanced_start</span>,
1075         <span class="sig-arg">pre_delete</span>,
1076         <span class="sig-arg">post_delete</span>)</span>
1077   </h3>
1078   </td><td align="right" valign="top"
1079     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#locate_unbalanced_start">source&nbsp;code</a></span>&nbsp;
1080     </td>
1081   </tr></table>
1082   
1083   <p>pre_delete and post_delete implicitly point to a place in the
1084 document (where the two were split).  This moves that point (by
1085 popping items from one and pushing them onto the other).  It moves
1086 the point to try to find a place where unbalanced_start applies.</p>
1087 <p>As an example:</p>
1088 <pre class="rst-literal-block">
1089 &gt;&gt;&gt; unbalanced_start = ['&lt;div&gt;']
1090 &gt;&gt;&gt; doc = ['&lt;p&gt;', 'Text', '&lt;/p&gt;', '&lt;div&gt;', 'More Text', '&lt;/div&gt;']
1091 &gt;&gt;&gt; pre, post = doc[:3], doc[3:]
1092 &gt;&gt;&gt; pre, post
1093 (['&lt;p&gt;', 'Text', '&lt;/p&gt;'], ['&lt;div&gt;', 'More Text', '&lt;/div&gt;'])
1094 &gt;&gt;&gt; locate_unbalanced_start(unbalanced_start, pre, post)
1095 &gt;&gt;&gt; pre, post
1096 (['&lt;p&gt;', 'Text', '&lt;/p&gt;', '&lt;div&gt;'], ['More Text', '&lt;/div&gt;'])
1097 </pre>
1098 <p>As you can see, we moved the point so that the dangling &lt;div&gt; that
1099 we found will be effectively replaced by the div in the original
1100 document.  If this doesn't work out, we just throw away
1101 unbalanced_start without doing anything.</p>
1102   <dl class="fields">
1103   </dl>
1104 </td></tr></table>
1105 </div>
1106 <a name="tokenize"></a>
1107 <div class="private">
1108 <table class="details" border="1" cellpadding="3"
1109        cellspacing="0" width="100%" bgcolor="white">
1110 <tr><td>
1111   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1112   <tr valign="top"><td>
1113   <h3 class="epydoc"><span class="sig"><span class="sig-name">tokenize</span>(<span class="sig-arg">html</span>,
1114         <span class="sig-arg">include_hrefs</span>=<span class="sig-default">True</span>)</span>
1115   </h3>
1116   </td><td align="right" valign="top"
1117     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#tokenize">source&nbsp;code</a></span>&nbsp;
1118     </td>
1119   </tr></table>
1120   
1121   <p>Parses the given HTML and returns token objects (words with attached tags).</p>
1122 <p>This parses only the content of a page; anything in the head is
1123 ignored, and the &lt;head&gt; and &lt;body&gt; elements are themselves
1124 optional.  The content is then parsed by lxml, which ensures the
1125 validity of the resulting parsed document (though lxml may make
1126 incorrect guesses when the markup is particularly bad).</p>
1127 <p>&lt;ins&gt; and &lt;del&gt; tags are also eliminated from the document, as
1128 that gets confusing.</p>
1129 <p>If include_hrefs is true, then the href attribute of &lt;a&gt; tags is
1130 included as a special kind of diffable token.</p>
1131   <dl class="fields">
1132   </dl>
1133 </td></tr></table>
1134 </div>
1135 <a name="parse_html"></a>
1136 <div class="private">
1137 <table class="details" border="1" cellpadding="3"
1138        cellspacing="0" width="100%" bgcolor="white">
1139 <tr><td>
1140   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1141   <tr valign="top"><td>
1142   <h3 class="epydoc"><span class="sig"><span class="sig-name">parse_html</span>(<span class="sig-arg">html</span>,
1143         <span class="sig-arg">cleanup</span>=<span class="sig-default">True</span>)</span>
1144   </h3>
1145   </td><td align="right" valign="top"
1146     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#parse_html">source&nbsp;code</a></span>&nbsp;
1147     </td>
1148   </tr></table>
1149   
1150   <p>Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
1151 wrapped in a &lt;div&gt; tag that was not in the original document.</p>
1152 <p>If cleanup is true, make sure there's no &lt;head&gt; or &lt;body&gt;, and get
1153 rid of any &lt;ins&gt; and &lt;del&gt; tags.</p>
1154   <dl class="fields">
1155   </dl>
1156 </td></tr></table>
1157 </div>
1158 <a name="cleanup_html"></a>
1159 <div class="private">
1160 <table class="details" border="1" cellpadding="3"
1161        cellspacing="0" width="100%" bgcolor="white">
1162 <tr><td>
1163   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1164   <tr valign="top"><td>
1165   <h3 class="epydoc"><span class="sig"><span class="sig-name">cleanup_html</span>(<span class="sig-arg">html</span>)</span>
1166   </h3>
1167   </td><td align="right" valign="top"
1168     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#cleanup_html">source&nbsp;code</a></span>&nbsp;
1169     </td>
1170   </tr></table>
1171   
1172   This 'cleans' the HTML, meaning that any page structure is removed
1173 (only the contents of &lt;body&gt; are used, if there is any &lt;body&gt;).
1174 Also &lt;ins&gt; and &lt;del&gt; tags are removed.
1175   <dl class="fields">
1176   </dl>
1177 </td></tr></table>
1178 </div>
1179 <a name="flatten_el"></a>
1180 <div class="private">
1181 <table class="details" border="1" cellpadding="3"
1182        cellspacing="0" width="100%" bgcolor="white">
1183 <tr><td>
1184   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1185   <tr valign="top"><td>
1186   <h3 class="epydoc"><span class="sig"><span class="sig-name">flatten_el</span>(<span class="sig-arg">el</span>,
1187         <span class="sig-arg">include_hrefs</span>,
1188         <span class="sig-arg">skip_tag</span>=<span class="sig-default">False</span>)</span>
1189   </h3>
1190   </td><td align="right" valign="top"
1191     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#flatten_el">source&nbsp;code</a></span>&nbsp;
1192     </td>
1193   </tr></table>
1194   
1195   <p>Takes an lxml element el, and generates all the text chunks for
1196 that tag.  Each start tag is a chunk, each word is a chunk, and each
1197 end tag is a chunk.</p>
1198 <p>If skip_tag is true, then the outermost container tag is
1199 not returned (just its contents).</p>
1200   <dl class="fields">
1201   </dl>
1202 </td></tr></table>
1203 </div>
1204 <a name="split_words"></a>
1205 <div class="private">
1206 <table class="details" border="1" cellpadding="3"
1207        cellspacing="0" width="100%" bgcolor="white">
1208 <tr><td>
1209   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1210   <tr valign="top"><td>
1211   <h3 class="epydoc"><span class="sig"><span class="sig-name">split_words</span>(<span class="sig-arg">text</span>)</span>
1212   </h3>
1213   </td><td align="right" valign="top"
1214     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#split_words">source&nbsp;code</a></span>&nbsp;
1215     </td>
1216   </tr></table>
1217   
1218   Splits some text into words. Includes trailing whitespace (one
1219 space) on each word when appropriate.
1220   <dl class="fields">
1221   </dl>
1222 </td></tr></table>
1223 </div>
1224 <a name="end_tag"></a>
1225 <div class="private">
1226 <table class="details" border="1" cellpadding="3"
1227        cellspacing="0" width="100%" bgcolor="white">
1228 <tr><td>
1229   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1230   <tr valign="top"><td>
1231   <h3 class="epydoc"><span class="sig"><span class="sig-name">end_tag</span>(<span class="sig-arg">el</span>)</span>
1232   </h3>
1233   </td><td align="right" valign="top"
1234     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#end_tag">source&nbsp;code</a></span>&nbsp;
1235     </td>
1236   </tr></table>
1237   
1238   The text representation of an end tag for a tag.  Includes
1239 trailing whitespace when appropriate.
1240   <dl class="fields">
1241   </dl>
1242 </td></tr></table>
1243 </div>
1244 <a name="fixup_ins_del_tags"></a>
1245 <div class="private">
1246 <table class="details" border="1" cellpadding="3"
1247        cellspacing="0" width="100%" bgcolor="white">
1248 <tr><td>
1249   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1250   <tr valign="top"><td>
1251   <h3 class="epydoc"><span class="sig"><span class="sig-name">fixup_ins_del_tags</span>(<span class="sig-arg">html</span>)</span>
1252   </h3>
1253   </td><td align="right" valign="top"
1254     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#fixup_ins_del_tags">source&nbsp;code</a></span>&nbsp;
1255     </td>
1256   </tr></table>
1257   
1258   Given an html string, move any &lt;ins&gt; or &lt;del&gt; tags inside of any
1259 block-level elements, e.g. transform &lt;ins&gt;&lt;p&gt;word&lt;/p&gt;&lt;/ins&gt; to
1260 &lt;p&gt;&lt;ins&gt;word&lt;/ins&gt;&lt;/p&gt;
1261   <dl class="fields">
1262   </dl>
1263 </td></tr></table>
1264 </div>
1265 <a name="serialize_html_fragment"></a>
1266 <div class="private">
1267 <table class="details" border="1" cellpadding="3"
1268        cellspacing="0" width="100%" bgcolor="white">
1269 <tr><td>
1270   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1271   <tr valign="top"><td>
1272   <h3 class="epydoc"><span class="sig"><span class="sig-name">serialize_html_fragment</span>(<span class="sig-arg">el</span>,
1273         <span class="sig-arg">skip_outer</span>=<span class="sig-default">False</span>)</span>
1274   </h3>
1275   </td><td align="right" valign="top"
1276     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#serialize_html_fragment">source&nbsp;code</a></span>&nbsp;
1277     </td>
1278   </tr></table>
1279   
1280   <p>Serialize a single lxml element as HTML.  The serialized form
1281 includes the element's tail.</p>
1282 <p>If skip_outer is true, then don't serialize the outermost tag.</p>
1283   <dl class="fields">
1284   </dl>
1285 </td></tr></table>
1286 </div>
1287 <br />
1288 <!-- ==================== VARIABLES DETAILS ==================== -->
1289 <a name="section-VariablesDetails"></a>
1290 <table class="details" border="1" cellpadding="3"
1291        cellspacing="0" width="100%" bgcolor="white">
1292 <tr bgcolor="#70b0f0" class="table-header">
1293   <td colspan="2" class="table-header">
1294     <table border="0" cellpadding="0" cellspacing="0" width="100%">
1295       <tr valign="top">
1296         <td align="left"><span class="table-header">Variables Details</span></td>
1297         <td align="right" valign="top"
1298          ><span class="options">[<a href="#section-VariablesDetails"
1299          class="privatelink" onclick="toggle_private();"
1300          >hide private</a>]</span></td>
1301       </tr>
1302     </table>
1303   </td>
1304 </tr>
1305 </table>
1306 <a name="empty_tags"></a>
1307 <div class="private">
1308 <table class="details" border="1" cellpadding="3"
1309        cellspacing="0" width="100%" bgcolor="white">
1310 <tr><td>
1311   <h3 class="epydoc">empty_tags</h3>
1312   
1313   <dl class="fields">
1314   </dl>
1315   <dl class="fields">
1316     <dt>Value:</dt>
1317       <dd><table><tr><td><pre class="variable">
1318 <code class="variable-group">(</code><code class="variable-quote">'</code><code class="variable-string">param</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1319  <code class="variable-quote">'</code><code class="variable-string">img</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1320  <code class="variable-quote">'</code><code class="variable-string">area</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1321  <code class="variable-quote">'</code><code class="variable-string">br</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1322  <code class="variable-quote">'</code><code class="variable-string">basefont</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1323  <code class="variable-quote">'</code><code class="variable-string">input</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1324  <code class="variable-quote">'</code><code class="variable-string">base</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1325  <code class="variable-quote">'</code><code class="variable-string">meta</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1326 <code class="variable-ellipsis">...</code>
1327 </pre></td></tr></table>
1328 </dd>
1329   </dl>
1330 </td></tr></table>
1331 </div>
1332 <a name="block_level_tags"></a>
1333 <div class="private">
1334 <table class="details" border="1" cellpadding="3"
1335        cellspacing="0" width="100%" bgcolor="white">
1336 <tr><td>
1337   <h3 class="epydoc">block_level_tags</h3>
1338   
1339   <dl class="fields">
1340   </dl>
1341   <dl class="fields">
1342     <dt>Value:</dt>
1343       <dd><table><tr><td><pre class="variable">
1344 <code class="variable-group">(</code><code class="variable-quote">'</code><code class="variable-string">address</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1345  <code class="variable-quote">'</code><code class="variable-string">blockquote</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1346  <code class="variable-quote">'</code><code class="variable-string">center</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1347  <code class="variable-quote">'</code><code class="variable-string">dir</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1348  <code class="variable-quote">'</code><code class="variable-string">div</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1349  <code class="variable-quote">'</code><code class="variable-string">dl</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1350  <code class="variable-quote">'</code><code class="variable-string">fieldset</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1351  <code class="variable-quote">'</code><code class="variable-string">form</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1352 <code class="variable-ellipsis">...</code>
1353 </pre></td></tr></table>
1354 </dd>
1355   </dl>
1356 </td></tr></table>
1357 </div>
1358 <a name="block_level_container_tags"></a>
1359 <div class="private">
1360 <table class="details" border="1" cellpadding="3"
1361        cellspacing="0" width="100%" bgcolor="white">
1362 <tr><td>
1363   <h3 class="epydoc">block_level_container_tags</h3>
1364   
1365   <dl class="fields">
1366   </dl>
1367   <dl class="fields">
1368     <dt>Value:</dt>
1369       <dd><table><tr><td><pre class="variable">
1370 <code class="variable-group">(</code><code class="variable-quote">'</code><code class="variable-string">dd</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1371  <code class="variable-quote">'</code><code class="variable-string">dt</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1372  <code class="variable-quote">'</code><code class="variable-string">frameset</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1373  <code class="variable-quote">'</code><code class="variable-string">li</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1374  <code class="variable-quote">'</code><code class="variable-string">tbody</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1375  <code class="variable-quote">'</code><code class="variable-string">td</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1376  <code class="variable-quote">'</code><code class="variable-string">tfoot</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1377  <code class="variable-quote">'</code><code class="variable-string">th</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1378 <code class="variable-ellipsis">...</code>
1379 </pre></td></tr></table>
1380 </dd>
1381   </dl>
1382 </td></tr></table>
1383 </div>
1384 <br />
1385 <!-- ==================== NAVIGATION BAR ==================== -->
1386 <table class="navbar" border="0" width="100%" cellpadding="0"
1387        bgcolor="#a0c0ff" cellspacing="0">
1388   <tr valign="middle">
1389   <!-- Home link -->
1390       <th>&nbsp;&nbsp;&nbsp;<a
1391         href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
1392
1393   <!-- Tree link -->
1394       <th>&nbsp;&nbsp;&nbsp;<a
1395         href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
1396
1397   <!-- Index link -->
1398       <th>&nbsp;&nbsp;&nbsp;<a
1399         href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
1400
1401   <!-- Help link -->
1402       <th>&nbsp;&nbsp;&nbsp;<a
1403         href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
1404
1405   <!-- Project homepage -->
1406       <th class="navbar" align="right" width="100%">
1407         <table border="0" cellpadding="0" cellspacing="0">
1408           <tr><th class="navbar" align="center"
1409             ><a class="navbar" target="_top" href="http://codespeak.net/lxml/">lxml API</a></th>
1410           </tr></table></th>
1411   </tr>
1412 </table>
1413 <table border="0" cellpadding="0" cellspacing="0" width="100%">
1414   <tr>
1415     <td align="left" class="footer">
1416     Generated by Epydoc 3.0 on Fri Oct 30 14:51:44 2009
1417     </td>
1418     <td align="right" class="footer">
1419       <a target="mainFrame" href="http://epydoc.sourceforge.net"
1420         >http://epydoc.sourceforge.net</a>
1421     </td>
1422   </tr>
1423 </table>
1424
1425 <script type="text/javascript">
1426   <!--
1427   // Private objects are initially displayed (because if
1428   // javascript is turned off then we want them to be
1429   // visible); but by default, we want to hide them.  So hide
1430   // them unless we have a cookie that says to show them.
1431   checkCookie();
1432   // -->
1433 </script>
1434 </body>
1435 </html>