Imported Upstream version 4.5.2
[platform/upstream/python-lxml.git] / doc / html / api / lxml.html.diff-module.html
1 <?xml version="1.0" encoding="ascii"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3           "DTD/xhtml1-transitional.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5 <head>
6   <title>lxml.html.diff</title>
7   <link rel="stylesheet" href="epydoc.css" type="text/css" />
8   <script type="text/javascript" src="epydoc.js"></script>
9 </head>
10
11 <body bgcolor="white" text="black" link="blue" vlink="#204080"
12       alink="#204080">
13 <!-- ==================== NAVIGATION BAR ==================== -->
14 <table class="navbar" border="0" width="100%" cellpadding="0"
15        bgcolor="#a0c0ff" cellspacing="0">
16   <tr valign="middle">
17   <!-- Home link -->
18       <th>&nbsp;&nbsp;&nbsp;<a
19         href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
20
21   <!-- Tree link -->
22       <th>&nbsp;&nbsp;&nbsp;<a
23         href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
24
25   <!-- Index link -->
26       <th>&nbsp;&nbsp;&nbsp;<a
27         href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
28
29   <!-- Help link -->
30       <th>&nbsp;&nbsp;&nbsp;<a
31         href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
32
33   <!-- Project homepage -->
34       <th class="navbar" align="right" width="100%">
35         <table border="0" cellpadding="0" cellspacing="0">
36           <tr><th class="navbar" align="center"
37             ><a class="navbar" target="_top" href="/">lxml API</a></th>
38           </tr></table></th>
39   </tr>
40 </table>
41 <table width="100%" cellpadding="0" cellspacing="0">
42   <tr valign="top">
43     <td width="100%">
44       <span class="breadcrumbs">
45         <a href="lxml-module.html">Package&nbsp;lxml</a> ::
46         <a href="lxml.html-module.html">Package&nbsp;html</a> ::
47         Module&nbsp;diff
48       </span>
49     </td>
50     <td>
51       <table cellpadding="0" cellspacing="0">
52         <!-- hide/show private -->
53         <tr><td align="right"><span class="options">[<a href="javascript:void(0);" class="privatelink"
54     onclick="toggle_private();">hide&nbsp;private</a>]</span></td></tr>
55         <tr><td align="right"><span class="options"
56             >[<a href="frames.html" target="_top">frames</a
57             >]&nbsp;|&nbsp;<a href="lxml.html.diff-module.html"
58             target="_top">no&nbsp;frames</a>]</span></td></tr>
59       </table>
60     </td>
61   </tr>
62 </table>
63 <!-- ==================== MODULE DESCRIPTION ==================== -->
64 <h1 class="epydoc">Module diff</h1><p class="nomargin-top"><span class="codelink"><a href="lxml.html.diff-pysrc.html">source&nbsp;code</a></span></p>
65 <!-- ==================== CLASSES ==================== -->
66 <a name="section-Classes"></a>
67 <table class="summary" border="1" cellpadding="3"
68        cellspacing="0" width="100%" bgcolor="white">
69 <tr bgcolor="#70b0f0" class="table-header">
70   <td colspan="2" class="table-header">
71     <table border="0" cellpadding="0" cellspacing="0" width="100%">
72       <tr valign="top">
73         <td align="left"><span class="table-header">Classes</span></td>
74         <td align="right" valign="top"
75          ><span class="options">[<a href="#section-Classes"
76          class="privatelink" onclick="toggle_private();"
77          >hide private</a>]</span></td>
78       </tr>
79     </table>
80   </td>
81 </tr>
82 <tr class="private">
83     <td width="15%" align="right" valign="top" class="summary">
84       <span class="summary-type">&nbsp;</span>
85     </td><td class="summary">
86         <a href="str-class.html" class="summary-name" onclick="show_private();">basestring</a><br />
87       str(object='') -&gt; string
88     </td>
89   </tr>
90 <tr class="private">
91     <td width="15%" align="right" valign="top" class="summary">
92       <span class="summary-type">&nbsp;</span>
93     </td><td class="summary">
94         <a href="lxml.html.diff.DEL_START-class.html" class="summary-name" onclick="show_private();">DEL_START</a>
95     </td>
96   </tr>
97 <tr class="private">
98     <td width="15%" align="right" valign="top" class="summary">
99       <span class="summary-type">&nbsp;</span>
100     </td><td class="summary">
101         <a href="lxml.html.diff.DEL_END-class.html" class="summary-name" onclick="show_private();">DEL_END</a>
102     </td>
103   </tr>
104 <tr class="private">
105     <td width="15%" align="right" valign="top" class="summary">
106       <span class="summary-type">&nbsp;</span>
107     </td><td class="summary">
108         <a href="lxml.html.diff.NoDeletes-class.html" class="summary-name" onclick="show_private();">NoDeletes</a><br />
109       Raised when the document no longer contains any pending deletes
110 (DEL_START/DEL_END)
111     </td>
112   </tr>
113 <tr class="private">
114     <td width="15%" align="right" valign="top" class="summary">
115       <span class="summary-type">&nbsp;</span>
116     </td><td class="summary">
117         <a href="lxml.html.diff.token-class.html" class="summary-name" onclick="show_private();">token</a><br />
118       Represents a diffable token, generally a word that is displayed to
119 the user.  Opening tags are attached to this token when they are
120 adjacent (pre_tags), as are closing tags that follow the word
121 (post_tags).  Some exceptions occur when there are empty tags
122 adjacent to a word, so there may be close tags in pre_tags, or
123 open tags in post_tags.
124     </td>
125   </tr>
126 <tr class="private">
127     <td width="15%" align="right" valign="top" class="summary">
128       <span class="summary-type">&nbsp;</span>
129     </td><td class="summary">
130         <a href="lxml.html.diff.tag_token-class.html" class="summary-name" onclick="show_private();">tag_token</a><br />
131       Represents a token that is actually a tag.  Currently this is just
132 the &lt;img&gt; tag, which takes up visible space just like a word but
133 is only represented in a document by a tag.
134     </td>
135   </tr>
136 <tr class="private">
137     <td width="15%" align="right" valign="top" class="summary">
138       <span class="summary-type">&nbsp;</span>
139     </td><td class="summary">
140         <a href="lxml.html.diff.href_token-class.html" class="summary-name" onclick="show_private();">href_token</a><br />
141       Represents the href in an anchor tag.  Unlike other words, we only
142 show the href when it changes.
143     </td>
144   </tr>
145 <tr class="private">
146     <td width="15%" align="right" valign="top" class="summary">
147       <span class="summary-type">&nbsp;</span>
148     </td><td class="summary">
149         <a href="lxml.html.diff.InsensitiveSequenceMatcher-class.html" class="summary-name" onclick="show_private();">InsensitiveSequenceMatcher</a><br />
150       Acts like SequenceMatcher, but tries not to find very small equal
151 blocks amidst large spans of changes
152     </td>
153   </tr>
154 </table>
155 <!-- ==================== FUNCTIONS ==================== -->
156 <a name="section-Functions"></a>
157 <table class="summary" border="1" cellpadding="3"
158        cellspacing="0" width="100%" bgcolor="white">
159 <tr bgcolor="#70b0f0" class="table-header">
160   <td colspan="2" class="table-header">
161     <table border="0" cellpadding="0" cellspacing="0" width="100%">
162       <tr valign="top">
163         <td align="left"><span class="table-header">Functions</span></td>
164         <td align="right" valign="top"
165          ><span class="options">[<a href="#section-Functions"
166          class="privatelink" onclick="toggle_private();"
167          >hide private</a>]</span></td>
168       </tr>
169     </table>
170   </td>
171 </tr>
172 <tr class="private">
173     <td width="15%" align="right" valign="top" class="summary">
174       <span class="summary-type">&nbsp;</span>
175     </td><td class="summary">
176       <table width="100%" cellpadding="0" cellspacing="0" border="0">
177         <tr>
178           <td><span class="summary-sig"><a name="default_markup"></a><span class="summary-sig-name">default_markup</span>(<span class="summary-sig-arg">text</span>,
179         <span class="summary-sig-arg">version</span>)</span></td>
180           <td align="right" valign="top">
181             <span class="codelink"><a href="lxml.html.diff-pysrc.html#default_markup">source&nbsp;code</a></span>
182             
183           </td>
184         </tr>
185       </table>
186       
187     </td>
188   </tr>
189 <tr>
190     <td width="15%" align="right" valign="top" class="summary">
191       <span class="summary-type">&nbsp;</span>
192     </td><td class="summary">
193       <table width="100%" cellpadding="0" cellspacing="0" border="0">
194         <tr>
195           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#html_annotate" class="summary-sig-name">html_annotate</a>(<span class="summary-sig-arg">doclist</span>,
196         <span class="summary-sig-arg">markup</span>=<span class="summary-sig-default">default_markup</span>)</span><br />
197       doclist should be ordered from oldest to newest, like:</td>
198           <td align="right" valign="top">
199             <span class="codelink"><a href="lxml.html.diff-pysrc.html#html_annotate">source&nbsp;code</a></span>
200             
201           </td>
202         </tr>
203       </table>
204       
205     </td>
206   </tr>
207 <tr class="private">
208     <td width="15%" align="right" valign="top" class="summary">
209       <span class="summary-type">&nbsp;</span>
210     </td><td class="summary">
211       <table width="100%" cellpadding="0" cellspacing="0" border="0">
212         <tr>
213           <td><span class="summary-sig"><a name="tokenize_annotated"></a><span class="summary-sig-name">tokenize_annotated</span>(<span class="summary-sig-arg">doc</span>,
214         <span class="summary-sig-arg">annotation</span>)</span><br />
215       Tokenize a document and add an annotation attribute to each token</td>
216           <td align="right" valign="top">
217             <span class="codelink"><a href="lxml.html.diff-pysrc.html#tokenize_annotated">source&nbsp;code</a></span>
218             
219           </td>
220         </tr>
221       </table>
222       
223     </td>
224   </tr>
225 <tr class="private">
226     <td width="15%" align="right" valign="top" class="summary">
227       <span class="summary-type">&nbsp;</span>
228     </td><td class="summary">
229       <table width="100%" cellpadding="0" cellspacing="0" border="0">
230         <tr>
231           <td><span class="summary-sig"><a name="html_annotate_merge_annotations"></a><span class="summary-sig-name">html_annotate_merge_annotations</span>(<span class="summary-sig-arg">tokens_old</span>,
232         <span class="summary-sig-arg">tokens_new</span>)</span><br />
233       Merge the annotations from tokens_old into tokens_new, when the
234 tokens in the new document already existed in the old document.</td>
235           <td align="right" valign="top">
236             <span class="codelink"><a href="lxml.html.diff-pysrc.html#html_annotate_merge_annotations">source&nbsp;code</a></span>
237             
238           </td>
239         </tr>
240       </table>
241       
242     </td>
243   </tr>
244 <tr class="private">
245     <td width="15%" align="right" valign="top" class="summary">
246       <span class="summary-type">&nbsp;</span>
247     </td><td class="summary">
248       <table width="100%" cellpadding="0" cellspacing="0" border="0">
249         <tr>
250           <td><span class="summary-sig"><a name="copy_annotations"></a><span class="summary-sig-name">copy_annotations</span>(<span class="summary-sig-arg">src</span>,
251         <span class="summary-sig-arg">dest</span>)</span><br />
252       Copy annotations from the tokens listed in src to the tokens in dest</td>
253           <td align="right" valign="top">
254             <span class="codelink"><a href="lxml.html.diff-pysrc.html#copy_annotations">source&nbsp;code</a></span>
255             
256           </td>
257         </tr>
258       </table>
259       
260     </td>
261   </tr>
262 <tr class="private">
263     <td width="15%" align="right" valign="top" class="summary">
264       <span class="summary-type">&nbsp;</span>
265     </td><td class="summary">
266       <table width="100%" cellpadding="0" cellspacing="0" border="0">
267         <tr>
268           <td><span class="summary-sig"><a name="compress_tokens"></a><span class="summary-sig-name">compress_tokens</span>(<span class="summary-sig-arg">tokens</span>)</span><br />
269       Combine adjacent tokens when there is no HTML between the tokens,
270 and they share an annotation</td>
271           <td align="right" valign="top">
272             <span class="codelink"><a href="lxml.html.diff-pysrc.html#compress_tokens">source&nbsp;code</a></span>
273             
274           </td>
275         </tr>
276       </table>
277       
278     </td>
279   </tr>
280 <tr class="private">
281     <td width="15%" align="right" valign="top" class="summary">
282       <span class="summary-type">&nbsp;</span>
283     </td><td class="summary">
284       <table width="100%" cellpadding="0" cellspacing="0" border="0">
285         <tr>
286           <td><span class="summary-sig"><a name="compress_merge_back"></a><span class="summary-sig-name">compress_merge_back</span>(<span class="summary-sig-arg">tokens</span>,
287         <span class="summary-sig-arg">tok</span>)</span><br />
288       Merge tok into the last element of tokens (modifying the list of
289 tokens in-place).</td>
290           <td align="right" valign="top">
291             <span class="codelink"><a href="lxml.html.diff-pysrc.html#compress_merge_back">source&nbsp;code</a></span>
292             
293           </td>
294         </tr>
295       </table>
296       
297     </td>
298   </tr>
299 <tr class="private">
300     <td width="15%" align="right" valign="top" class="summary">
301       <span class="summary-type">&nbsp;</span>
302     </td><td class="summary">
303       <table width="100%" cellpadding="0" cellspacing="0" border="0">
304         <tr>
305           <td><span class="summary-sig"><a name="markup_serialize_tokens"></a><span class="summary-sig-name">markup_serialize_tokens</span>(<span class="summary-sig-arg">tokens</span>,
306         <span class="summary-sig-arg">markup_func</span>)</span><br />
307       Serialize the list of tokens into a list of text chunks, calling
308 markup_func around text to add annotations.</td>
309           <td align="right" valign="top">
310             <span class="codelink"><a href="lxml.html.diff-pysrc.html#markup_serialize_tokens">source&nbsp;code</a></span>
311             
312           </td>
313         </tr>
314       </table>
315       
316     </td>
317   </tr>
318 <tr>
319     <td width="15%" align="right" valign="top" class="summary">
320       <span class="summary-type">&nbsp;</span>
321     </td><td class="summary">
322       <table width="100%" cellpadding="0" cellspacing="0" border="0">
323         <tr>
324           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#htmldiff" class="summary-sig-name">htmldiff</a>(<span class="summary-sig-arg">old_html</span>,
325         <span class="summary-sig-arg">new_html</span>)</span><br />
326       Do a diff of the old and new document.  The documents are HTML
327 <em>fragments</em> (str/UTF8 or unicode), they are not complete documents
328 (i.e., no &lt;html&gt; tag).</td>
329           <td align="right" valign="top">
330             <span class="codelink"><a href="lxml.html.diff-pysrc.html#htmldiff">source&nbsp;code</a></span>
331             
332           </td>
333         </tr>
334       </table>
335       
336     </td>
337   </tr>
338 <tr class="private">
339     <td width="15%" align="right" valign="top" class="summary">
340       <span class="summary-type">&nbsp;</span>
341     </td><td class="summary">
342       <table width="100%" cellpadding="0" cellspacing="0" border="0">
343         <tr>
344           <td><span class="summary-sig"><a name="htmldiff_tokens"></a><span class="summary-sig-name">htmldiff_tokens</span>(<span class="summary-sig-arg">html1_tokens</span>,
345         <span class="summary-sig-arg">html2_tokens</span>)</span><br />
346       Does a diff on the tokens themselves, returning a list of text
347 chunks (not tokens).</td>
348           <td align="right" valign="top">
349             <span class="codelink"><a href="lxml.html.diff-pysrc.html#htmldiff_tokens">source&nbsp;code</a></span>
350             
351           </td>
352         </tr>
353       </table>
354       
355     </td>
356   </tr>
357 <tr class="private">
358     <td width="15%" align="right" valign="top" class="summary">
359       <span class="summary-type">&nbsp;</span>
360     </td><td class="summary">
361       <table width="100%" cellpadding="0" cellspacing="0" border="0">
362         <tr>
363           <td><span class="summary-sig"><a name="expand_tokens"></a><span class="summary-sig-name">expand_tokens</span>(<span class="summary-sig-arg">tokens</span>,
364         <span class="summary-sig-arg">equal</span>=<span class="summary-sig-default">False</span>)</span><br />
365       Given a list of tokens, return a generator of the chunks of
366 text for the data in the tokens.</td>
367           <td align="right" valign="top">
368             <span class="codelink"><a href="lxml.html.diff-pysrc.html#expand_tokens">source&nbsp;code</a></span>
369             
370           </td>
371         </tr>
372       </table>
373       
374     </td>
375   </tr>
376 <tr class="private">
377     <td width="15%" align="right" valign="top" class="summary">
378       <span class="summary-type">&nbsp;</span>
379     </td><td class="summary">
380       <table width="100%" cellpadding="0" cellspacing="0" border="0">
381         <tr>
382           <td><span class="summary-sig"><a name="merge_insert"></a><span class="summary-sig-name">merge_insert</span>(<span class="summary-sig-arg">ins_chunks</span>,
383         <span class="summary-sig-arg">doc</span>)</span><br />
384       doc is the already-handled document (as a list of text chunks);
385 here we add &lt;ins&gt;ins_chunks&lt;/ins&gt; to the end of that.</td>
386           <td align="right" valign="top">
387             <span class="codelink"><a href="lxml.html.diff-pysrc.html#merge_insert">source&nbsp;code</a></span>
388             
389           </td>
390         </tr>
391       </table>
392       
393     </td>
394   </tr>
395 <tr class="private">
396     <td width="15%" align="right" valign="top" class="summary">
397       <span class="summary-type">&nbsp;</span>
398     </td><td class="summary">
399       <table width="100%" cellpadding="0" cellspacing="0" border="0">
400         <tr>
401           <td><span class="summary-sig"><a name="merge_delete"></a><span class="summary-sig-name">merge_delete</span>(<span class="summary-sig-arg">del_chunks</span>,
402         <span class="summary-sig-arg">doc</span>)</span><br />
403       Adds the text chunks in del_chunks to the document doc (another
404 list of text chunks) with markers to show it is a delete.
405 cleanup_delete later resolves these markers into &lt;del&gt; tags.</td>
406           <td align="right" valign="top">
407             <span class="codelink"><a href="lxml.html.diff-pysrc.html#merge_delete">source&nbsp;code</a></span>
408             
409           </td>
410         </tr>
411       </table>
412       
413     </td>
414   </tr>
415 <tr class="private">
416     <td width="15%" align="right" valign="top" class="summary">
417       <span class="summary-type">&nbsp;</span>
418     </td><td class="summary">
419       <table width="100%" cellpadding="0" cellspacing="0" border="0">
420         <tr>
421           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#cleanup_delete" class="summary-sig-name" onclick="show_private();">cleanup_delete</a>(<span class="summary-sig-arg">chunks</span>)</span><br />
422       Cleans up any DEL_START/DEL_END markers in the document, replacing
423 them with &lt;del&gt;&lt;/del&gt;.  To do this while keeping the document
424 valid, it may need to drop some tags (either start or end tags).</td>
425           <td align="right" valign="top">
426             <span class="codelink"><a href="lxml.html.diff-pysrc.html#cleanup_delete">source&nbsp;code</a></span>
427             
428           </td>
429         </tr>
430       </table>
431       
432     </td>
433   </tr>
434 <tr class="private">
435     <td width="15%" align="right" valign="top" class="summary">
436       <span class="summary-type">&nbsp;</span>
437     </td><td class="summary">
438       <table width="100%" cellpadding="0" cellspacing="0" border="0">
439         <tr>
440           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#split_unbalanced" class="summary-sig-name" onclick="show_private();">split_unbalanced</a>(<span class="summary-sig-arg">chunks</span>)</span><br />
441       Return (unbalanced_start, balanced, unbalanced_end), where each is
442 a list of text and tag chunks.</td>
443           <td align="right" valign="top">
444             <span class="codelink"><a href="lxml.html.diff-pysrc.html#split_unbalanced">source&nbsp;code</a></span>
445             
446           </td>
447         </tr>
448       </table>
449       
450     </td>
451   </tr>
452 <tr class="private">
453     <td width="15%" align="right" valign="top" class="summary">
454       <span class="summary-type">&nbsp;</span>
455     </td><td class="summary">
456       <table width="100%" cellpadding="0" cellspacing="0" border="0">
457         <tr>
458           <td><span class="summary-sig"><a name="split_delete"></a><span class="summary-sig-name">split_delete</span>(<span class="summary-sig-arg">chunks</span>)</span><br />
459       Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
460 stuff_after_DEL_END).  Returns the first case found (there may be
461 more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
462 there's no DEL_START found.</td>
463           <td align="right" valign="top">
464             <span class="codelink"><a href="lxml.html.diff-pysrc.html#split_delete">source&nbsp;code</a></span>
465             
466           </td>
467         </tr>
468       </table>
469       
470     </td>
471   </tr>
472 <tr class="private">
473     <td width="15%" align="right" valign="top" class="summary">
474       <span class="summary-type">&nbsp;</span>
475     </td><td class="summary">
476       <table width="100%" cellpadding="0" cellspacing="0" border="0">
477         <tr>
478           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#locate_unbalanced_start" class="summary-sig-name" onclick="show_private();">locate_unbalanced_start</a>(<span class="summary-sig-arg">unbalanced_start</span>,
479         <span class="summary-sig-arg">pre_delete</span>,
480         <span class="summary-sig-arg">post_delete</span>)</span><br />
481       pre_delete and post_delete implicitly point to a place in the
482 document (where the two were split).  This moves that point (by
483 popping items from one and pushing them onto the other).  It moves
484 the point to try to find a place where unbalanced_start applies.</td>
485           <td align="right" valign="top">
486             <span class="codelink"><a href="lxml.html.diff-pysrc.html#locate_unbalanced_start">source&nbsp;code</a></span>
487             
488           </td>
489         </tr>
490       </table>
491       
492     </td>
493   </tr>
494 <tr class="private">
495     <td width="15%" align="right" valign="top" class="summary">
496       <span class="summary-type">&nbsp;</span>
497     </td><td class="summary">
498       <table width="100%" cellpadding="0" cellspacing="0" border="0">
499         <tr>
500           <td><span class="summary-sig"><a name="locate_unbalanced_end"></a><span class="summary-sig-name">locate_unbalanced_end</span>(<span class="summary-sig-arg">unbalanced_end</span>,
501         <span class="summary-sig-arg">pre_delete</span>,
502         <span class="summary-sig-arg">post_delete</span>)</span><br />
503       Like locate_unbalanced_start, except handling end tags and
504 possibly moving the point earlier in the document.</td>
505           <td align="right" valign="top">
506             <span class="codelink"><a href="lxml.html.diff-pysrc.html#locate_unbalanced_end">source&nbsp;code</a></span>
507             
508           </td>
509         </tr>
510       </table>
511       
512     </td>
513   </tr>
514 <tr class="private">
515     <td width="15%" align="right" valign="top" class="summary">
516       <span class="summary-type">&nbsp;</span>
517     </td><td class="summary">
518       <table width="100%" cellpadding="0" cellspacing="0" border="0">
519         <tr>
520           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#tokenize" class="summary-sig-name" onclick="show_private();">tokenize</a>(<span class="summary-sig-arg">html</span>,
521         <span class="summary-sig-arg">include_hrefs</span>=<span class="summary-sig-default">True</span>)</span><br />
522       Parses the given HTML and returns token objects (words with attached tags).</td>
523           <td align="right" valign="top">
524             <span class="codelink"><a href="lxml.html.diff-pysrc.html#tokenize">source&nbsp;code</a></span>
525             
526           </td>
527         </tr>
528       </table>
529       
530     </td>
531   </tr>
532 <tr class="private">
533     <td width="15%" align="right" valign="top" class="summary">
534       <span class="summary-type">&nbsp;</span>
535     </td><td class="summary">
536       <table width="100%" cellpadding="0" cellspacing="0" border="0">
537         <tr>
538           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#parse_html" class="summary-sig-name" onclick="show_private();">parse_html</a>(<span class="summary-sig-arg">html</span>,
539         <span class="summary-sig-arg">cleanup</span>=<span class="summary-sig-default">True</span>)</span><br />
540       Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
541 wrapped in a &lt;div&gt; tag that was not in the original document.</td>
542           <td align="right" valign="top">
543             <span class="codelink"><a href="lxml.html.diff-pysrc.html#parse_html">source&nbsp;code</a></span>
544             
545           </td>
546         </tr>
547       </table>
548       
549     </td>
550   </tr>
551 <tr class="private">
552     <td width="15%" align="right" valign="top" class="summary">
553       <span class="summary-type">&nbsp;</span>
554     </td><td class="summary">
555       <table width="100%" cellpadding="0" cellspacing="0" border="0">
556         <tr>
557           <td><span class="summary-sig"><a name="cleanup_html"></a><span class="summary-sig-name">cleanup_html</span>(<span class="summary-sig-arg">html</span>)</span><br />
558       This 'cleans' the HTML, meaning that any page structure is removed
559 (only the contents of &lt;body&gt; are used, if there is any &lt;body&gt;).
560 Also &lt;ins&gt; and &lt;del&gt; tags are removed.</td>
561           <td align="right" valign="top">
562             <span class="codelink"><a href="lxml.html.diff-pysrc.html#cleanup_html">source&nbsp;code</a></span>
563             
564           </td>
565         </tr>
566       </table>
567       
568     </td>
569   </tr>
570 <tr class="private">
571     <td width="15%" align="right" valign="top" class="summary">
572       <span class="summary-type">&nbsp;</span>
573     </td><td class="summary">
574       <table width="100%" cellpadding="0" cellspacing="0" border="0">
575         <tr>
576           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#split_trailing_whitespace" class="summary-sig-name" onclick="show_private();">split_trailing_whitespace</a>(<span class="summary-sig-arg">word</span>)</span><br />
577       This function takes a word, such as 'test\n\n', and returns ('test', '\n\n').</td>
578           <td align="right" valign="top">
579             <span class="codelink"><a href="lxml.html.diff-pysrc.html#split_trailing_whitespace">source&nbsp;code</a></span>
580             
581           </td>
582         </tr>
583       </table>
584       
585     </td>
586   </tr>
587 <tr class="private">
588     <td width="15%" align="right" valign="top" class="summary">
589       <span class="summary-type">&nbsp;</span>
590     </td><td class="summary">
591       <table width="100%" cellpadding="0" cellspacing="0" border="0">
592         <tr>
593           <td><span class="summary-sig"><a name="fixup_chunks"></a><span class="summary-sig-name">fixup_chunks</span>(<span class="summary-sig-arg">chunks</span>)</span><br />
594       This function takes a list of chunks and produces a list of tokens.</td>
595           <td align="right" valign="top">
596             <span class="codelink"><a href="lxml.html.diff-pysrc.html#fixup_chunks">source&nbsp;code</a></span>
597             
598           </td>
599         </tr>
600       </table>
601       
602     </td>
603   </tr>
604 <tr class="private">
605     <td width="15%" align="right" valign="top" class="summary">
606       <span class="summary-type">&nbsp;</span>
607     </td><td class="summary">
608       <table width="100%" cellpadding="0" cellspacing="0" border="0">
609         <tr>
610           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#flatten_el" class="summary-sig-name" onclick="show_private();">flatten_el</a>(<span class="summary-sig-arg">el</span>,
611         <span class="summary-sig-arg">include_hrefs</span>,
612         <span class="summary-sig-arg">skip_tag</span>=<span class="summary-sig-default">False</span>)</span><br />
613       Takes an lxml element el, and generates all the text chunks for
614 that tag.  Each start tag is a chunk, each word is a chunk, and each
615 end tag is a chunk.</td>
616           <td align="right" valign="top">
617             <span class="codelink"><a href="lxml.html.diff-pysrc.html#flatten_el">source&nbsp;code</a></span>
618             
619           </td>
620         </tr>
621       </table>
622       
623     </td>
624   </tr>
625 <tr class="private">
626     <td width="15%" align="right" valign="top" class="summary">
627       <span class="summary-type">&nbsp;</span>
628     </td><td class="summary">
629       <table width="100%" cellpadding="0" cellspacing="0" border="0">
630         <tr>
631           <td><span class="summary-sig"><a name="split_words"></a><span class="summary-sig-name">split_words</span>(<span class="summary-sig-arg">text</span>)</span><br />
632       Splits some text into words. Includes trailing whitespace
633 on each word when appropriate.</td>
634           <td align="right" valign="top">
635             <span class="codelink"><a href="lxml.html.diff-pysrc.html#split_words">source&nbsp;code</a></span>
636             
637           </td>
638         </tr>
639       </table>
640       
641     </td>
642   </tr>
643 <tr class="private">
644     <td width="15%" align="right" valign="top" class="summary">
645       <span class="summary-type">&nbsp;</span>
646     </td><td class="summary">
647       <table width="100%" cellpadding="0" cellspacing="0" border="0">
648         <tr>
649           <td><span class="summary-sig"><a name="start_tag"></a><span class="summary-sig-name">start_tag</span>(<span class="summary-sig-arg">el</span>)</span><br />
650       The text representation of the start tag for a tag.</td>
651           <td align="right" valign="top">
652             <span class="codelink"><a href="lxml.html.diff-pysrc.html#start_tag">source&nbsp;code</a></span>
653             
654           </td>
655         </tr>
656       </table>
657       
658     </td>
659   </tr>
660 <tr class="private">
661     <td width="15%" align="right" valign="top" class="summary">
662       <span class="summary-type">&nbsp;</span>
663     </td><td class="summary">
664       <table width="100%" cellpadding="0" cellspacing="0" border="0">
665         <tr>
666           <td><span class="summary-sig"><a name="end_tag"></a><span class="summary-sig-name">end_tag</span>(<span class="summary-sig-arg">el</span>)</span><br />
667       The text representation of an end tag for a tag.  Includes
668 trailing whitespace when appropriate.</td>
669           <td align="right" valign="top">
670             <span class="codelink"><a href="lxml.html.diff-pysrc.html#end_tag">source&nbsp;code</a></span>
671             
672           </td>
673         </tr>
674       </table>
675       
676     </td>
677   </tr>
678 <tr class="private">
679     <td width="15%" align="right" valign="top" class="summary">
680       <span class="summary-type">&nbsp;</span>
681     </td><td class="summary">
682       <table width="100%" cellpadding="0" cellspacing="0" border="0">
683         <tr>
684           <td><span class="summary-sig"><a name="is_word"></a><span class="summary-sig-name">is_word</span>(<span class="summary-sig-arg">tok</span>)</span></td>
685           <td align="right" valign="top">
686             <span class="codelink"><a href="lxml.html.diff-pysrc.html#is_word">source&nbsp;code</a></span>
687             
688           </td>
689         </tr>
690       </table>
691       
692     </td>
693   </tr>
694 <tr class="private">
695     <td width="15%" align="right" valign="top" class="summary">
696       <span class="summary-type">&nbsp;</span>
697     </td><td class="summary">
698       <table width="100%" cellpadding="0" cellspacing="0" border="0">
699         <tr>
700           <td><span class="summary-sig"><a name="is_end_tag"></a><span class="summary-sig-name">is_end_tag</span>(<span class="summary-sig-arg">tok</span>)</span></td>
701           <td align="right" valign="top">
702             <span class="codelink"><a href="lxml.html.diff-pysrc.html#is_end_tag">source&nbsp;code</a></span>
703             
704           </td>
705         </tr>
706       </table>
707       
708     </td>
709   </tr>
710 <tr class="private">
711     <td width="15%" align="right" valign="top" class="summary">
712       <span class="summary-type">&nbsp;</span>
713     </td><td class="summary">
714       <table width="100%" cellpadding="0" cellspacing="0" border="0">
715         <tr>
716           <td><span class="summary-sig"><a name="is_start_tag"></a><span class="summary-sig-name">is_start_tag</span>(<span class="summary-sig-arg">tok</span>)</span></td>
717           <td align="right" valign="top">
718             <span class="codelink"><a href="lxml.html.diff-pysrc.html#is_start_tag">source&nbsp;code</a></span>
719             
720           </td>
721         </tr>
722       </table>
723       
724     </td>
725   </tr>
726 <tr class="private">
727     <td width="15%" align="right" valign="top" class="summary">
728       <span class="summary-type">&nbsp;</span>
729     </td><td class="summary">
730       <table width="100%" cellpadding="0" cellspacing="0" border="0">
731         <tr>
732           <td><span class="summary-sig"><a name="fixup_ins_del_tags"></a><span class="summary-sig-name">fixup_ins_del_tags</span>(<span class="summary-sig-arg">html</span>)</span><br />
733       Given an html string, move any &lt;ins&gt; or &lt;del&gt; tags inside of any
734 block-level elements, e.g. transform &lt;ins&gt;&lt;p&gt;word&lt;/p&gt;&lt;/ins&gt; to
735 &lt;p&gt;&lt;ins&gt;word&lt;/ins&gt;&lt;/p&gt;</td>
736           <td align="right" valign="top">
737             <span class="codelink"><a href="lxml.html.diff-pysrc.html#fixup_ins_del_tags">source&nbsp;code</a></span>
738             
739           </td>
740         </tr>
741       </table>
742       
743     </td>
744   </tr>
745 <tr class="private">
746     <td width="15%" align="right" valign="top" class="summary">
747       <span class="summary-type">&nbsp;</span>
748     </td><td class="summary">
749       <table width="100%" cellpadding="0" cellspacing="0" border="0">
750         <tr>
751           <td><span class="summary-sig"><a href="lxml.html.diff-module.html#serialize_html_fragment" class="summary-sig-name" onclick="show_private();">serialize_html_fragment</a>(<span class="summary-sig-arg">el</span>,
752         <span class="summary-sig-arg">skip_outer</span>=<span class="summary-sig-default">False</span>)</span><br />
753       Serialize a single lxml element as HTML.  The serialized form
754 includes the element's tail.</td>
755           <td align="right" valign="top">
756             <span class="codelink"><a href="lxml.html.diff-pysrc.html#serialize_html_fragment">source&nbsp;code</a></span>
757             
758           </td>
759         </tr>
760       </table>
761       
762     </td>
763   </tr>
764 <tr class="private">
765     <td width="15%" align="right" valign="top" class="summary">
766       <span class="summary-type">&nbsp;</span>
767     </td><td class="summary">
768       <table width="100%" cellpadding="0" cellspacing="0" border="0">
769         <tr>
770           <td><span class="summary-sig"><a name="_fixup_ins_del_tags"></a><span class="summary-sig-name">_fixup_ins_del_tags</span>(<span class="summary-sig-arg">doc</span>)</span><br />
771       fixup_ins_del_tags that works on an lxml document in-place</td>
772           <td align="right" valign="top">
773             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_fixup_ins_del_tags">source&nbsp;code</a></span>
774             
775           </td>
776         </tr>
777       </table>
778       
779     </td>
780   </tr>
781 <tr class="private">
782     <td width="15%" align="right" valign="top" class="summary">
783       <span class="summary-type">&nbsp;</span>
784     </td><td class="summary">
785       <table width="100%" cellpadding="0" cellspacing="0" border="0">
786         <tr>
787           <td><span class="summary-sig"><a name="_contains_block_level_tag"></a><span class="summary-sig-name">_contains_block_level_tag</span>(<span class="summary-sig-arg">el</span>)</span><br />
788       True if the element contains any block-level elements, like &lt;p&gt;, &lt;td&gt;, etc.</td>
789           <td align="right" valign="top">
790             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_contains_block_level_tag">source&nbsp;code</a></span>
791             
792           </td>
793         </tr>
794       </table>
795       
796     </td>
797   </tr>
798 <tr class="private">
799     <td width="15%" align="right" valign="top" class="summary">
800       <span class="summary-type">&nbsp;</span>
801     </td><td class="summary">
802       <table width="100%" cellpadding="0" cellspacing="0" border="0">
803         <tr>
804           <td><span class="summary-sig"><a name="_move_el_inside_block"></a><span class="summary-sig-name">_move_el_inside_block</span>(<span class="summary-sig-arg">el</span>,
805         <span class="summary-sig-arg">tag</span>)</span><br />
806       helper for _fixup_ins_del_tags; actually takes the &lt;ins&gt; etc tags
807 and moves them inside any block-level tags.</td>
808           <td align="right" valign="top">
809             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_move_el_inside_block">source&nbsp;code</a></span>
810             
811           </td>
812         </tr>
813       </table>
814       
815     </td>
816   </tr>
817 <tr class="private">
818     <td width="15%" align="right" valign="top" class="summary">
819       <span class="summary-type">&nbsp;</span>
820     </td><td class="summary">
821       <table width="100%" cellpadding="0" cellspacing="0" border="0">
822         <tr>
823           <td><span class="summary-sig"><a name="_merge_element_contents"></a><span class="summary-sig-name">_merge_element_contents</span>(<span class="summary-sig-arg">el</span>)</span><br />
824       Removes an element, but merges its contents into its place, e.g.,
825 given &lt;p&gt;Hi &lt;i&gt;there!&lt;/i&gt;&lt;/p&gt;, if you remove the &lt;i&gt; element you get
826 &lt;p&gt;Hi there!&lt;/p&gt;</td>
827           <td align="right" valign="top">
828             <span class="codelink"><a href="lxml.html.diff-pysrc.html#_merge_element_contents">source&nbsp;code</a></span>
829             
830           </td>
831         </tr>
832       </table>
833       
834     </td>
835   </tr>
836 </table>
837 <!-- ==================== VARIABLES ==================== -->
838 <a name="section-Variables"></a>
839 <table class="summary" border="1" cellpadding="3"
840        cellspacing="0" width="100%" bgcolor="white">
841 <tr bgcolor="#70b0f0" class="table-header">
842   <td colspan="2" class="table-header">
843     <table border="0" cellpadding="0" cellspacing="0" width="100%">
844       <tr valign="top">
845         <td align="left"><span class="table-header">Variables</span></td>
846         <td align="right" valign="top"
847          ><span class="options">[<a href="#section-Variables"
848          class="privatelink" onclick="toggle_private();"
849          >hide private</a>]</span></td>
850       </tr>
851     </table>
852   </td>
853 </tr>
854 <tr class="private">
855     <td width="15%" align="right" valign="top" class="summary">
856       <span class="summary-type">&nbsp;</span>
857     </td><td class="summary">
858         <a name="_body_re"></a><span class="summary-name">_body_re</span> = <code title="re.compile(r'(?is)&lt;body.*?&gt;')">re.compile(r'<code class="re-flags">(?is)</code>&lt;body.<code class="re-op">*?</code>&gt;')</code>
859     </td>
860   </tr>
861 <tr class="private">
862     <td width="15%" align="right" valign="top" class="summary">
863       <span class="summary-type">&nbsp;</span>
864     </td><td class="summary">
865         <a name="_end_body_re"></a><span class="summary-name">_end_body_re</span> = <code title="re.compile(r'(?is)&lt;/body.*?&gt;')">re.compile(r'<code class="re-flags">(?is)</code>&lt;/body.<code class="re-op">*?</code>&gt;')</code>
866     </td>
867   </tr>
868 <tr class="private">
869     <td width="15%" align="right" valign="top" class="summary">
870       <span class="summary-type">&nbsp;</span>
871     </td><td class="summary">
872         <a name="_ins_del_re"></a><span class="summary-name">_ins_del_re</span> = <code title="re.compile(r'(?is)&lt;/?(ins|del).*?&gt;')">re.compile(r'<code class="re-flags">(?is)</code>&lt;/<code class="re-op">?</code><code class="re-group">(</code>ins<code class="re-op">|</code>del<code class="re-group">)</code>.<code class="re-op">*?</code>&gt;')</code>
873     </td>
874   </tr>
875 <tr class="private">
876     <td width="15%" align="right" valign="top" class="summary">
877       <span class="summary-type">&nbsp;</span>
878     </td><td class="summary">
879         <a name="end_whitespace_re"></a><span class="summary-name">end_whitespace_re</span> = <code title="re.compile(r'[ \t\n\r]$')">re.compile(r'<code class="re-group">[</code> \t\n\r<code class="re-group">]</code>$')</code>
880     </td>
881   </tr>
882 <tr class="private">
883     <td width="15%" align="right" valign="top" class="summary">
884       <span class="summary-type">&nbsp;</span>
885     </td><td class="summary">
886         <a href="lxml.html.diff-module.html#empty_tags" class="summary-name" onclick="show_private();">empty_tags</a> = <code title="(u'param',
887  u'img',
888  u'area',
889  u'br',
890  u'basefont',
891  u'input',
892  u'base',
893  u'meta',
894 ..."><code class="variable-group">(</code><code class="variable-quote">u'</code><code class="variable-string">param</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">img</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">area</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">br</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">basefont</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u</code><code class="variable-ellipsis">...</code></code>
895     </td>
896   </tr>
897 <tr class="private">
898     <td width="15%" align="right" valign="top" class="summary">
899       <span class="summary-type">&nbsp;</span>
900     </td><td class="summary">
901         <a href="lxml.html.diff-module.html#block_level_tags" class="summary-name" onclick="show_private();">block_level_tags</a> = <code title="(u'address',
902  u'blockquote',
903  u'center',
904  u'dir',
905  u'div',
906  u'dl',
907  u'fieldset',
908  u'form',
909 ..."><code class="variable-group">(</code><code class="variable-quote">u'</code><code class="variable-string">address</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">blockquote</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">center</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">di</code><code class="variable-ellipsis">...</code></code>
910     </td>
911   </tr>
912 <tr class="private">
913     <td width="15%" align="right" valign="top" class="summary">
914       <span class="summary-type">&nbsp;</span>
915     </td><td class="summary">
916         <a href="lxml.html.diff-module.html#block_level_container_tags" class="summary-name" onclick="show_private();">block_level_container_tags</a> = <code title="(u'dd',
917  u'dt',
918  u'frameset',
919  u'li',
920  u'tbody',
921  u'td',
922  u'tfoot',
923  u'th',
924 ..."><code class="variable-group">(</code><code class="variable-quote">u'</code><code class="variable-string">dd</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">dt</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">frameset</code><code class="variable-quote">'</code><code class="variable-op">, </code><code class="variable-quote">u'</code><code class="variable-string">li</code><code class="variable-quote">'</code><code class="variable-ellipsis">...</code></code>
925     </td>
926   </tr>
927 <tr class="private">
928     <td width="15%" align="right" valign="top" class="summary">
929       <span class="summary-type">&nbsp;</span>
930     </td><td class="summary">
931         <a name="split_words_re"></a><span class="summary-name">split_words_re</span> = <code title="re.compile(r'(?u)\S+(?:\s+|$)')">re.compile(r'<code class="re-flags">(?u)</code>\S<code class="re-op">+</code><code class="re-group">(?:</code>\s<code class="re-op">+</code><code class="re-op">|</code>$<code class="re-group">)</code>')</code>
932     </td>
933   </tr>
934 <tr class="private">
935     <td width="15%" align="right" valign="top" class="summary">
936       <span class="summary-type">&nbsp;</span>
937     </td><td class="summary">
938         <a name="start_whitespace_re"></a><span class="summary-name">start_whitespace_re</span> = <code title="re.compile(r'^[ \t\n\r]')">re.compile(r'^<code class="re-group">[</code> \t\n\r<code class="re-group">]</code>')</code>
939     </td>
940   </tr>
941 <tr class="private">
942     <td width="15%" align="right" valign="top" class="summary">
943       <span class="summary-type">&nbsp;</span>
944     </td><td class="summary">
945         <a name="__package__"></a><span class="summary-name">__package__</span> = <code title="None">None</code><br />
946       hash(x)
947     </td>
948   </tr>
949 <tr class="private">
950     <td width="15%" align="right" valign="top" class="summary">
951       <span class="summary-type">&nbsp;</span>
952     </td><td class="summary">
953         <a href="lxml.html.diff-module.html#__test__" class="summary-name" onclick="show_private();">__test__</a> = <code title="{u'html_annotate (line 35)': u'''
954     doclist should be ordered from oldest to newest, like::
955
956         &gt;&gt;&gt; version1 = 'Hello World'
957         &gt;&gt;&gt; version2 = 'Goodbye World'
958         &gt;&gt;&gt; print(html_annotate([(version1, 'version 1'),
959         ...                      (version2, 'version 2')]))
960         &lt;span title=&quot;version 2&quot;&gt;Goodbye&lt;/span&gt; &lt;span title=&quot;version 1&quot;\
961 ..."><code class="variable-group">{</code><code class="variable-quote">u'</code><code class="variable-string">html_annotate (line 35)</code><code class="variable-quote">'</code><code class="variable-op">:</code><code class="variable-ellipsis">...</code></code>
962     </td>
963   </tr>
964 </table>
965 <!-- ==================== FUNCTION DETAILS ==================== -->
966 <a name="section-FunctionDetails"></a>
967 <table class="details" border="1" cellpadding="3"
968        cellspacing="0" width="100%" bgcolor="white">
969 <tr bgcolor="#70b0f0" class="table-header">
970   <td colspan="2" class="table-header">
971     <table border="0" cellpadding="0" cellspacing="0" width="100%">
972       <tr valign="top">
973         <td align="left"><span class="table-header">Function Details</span></td>
974         <td align="right" valign="top"
975          ><span class="options">[<a href="#section-FunctionDetails"
976          class="privatelink" onclick="toggle_private();"
977          >hide private</a>]</span></td>
978       </tr>
979     </table>
980   </td>
981 </tr>
982 </table>
983 <a name="html_annotate"></a>
984 <div>
985 <table class="details" border="1" cellpadding="3"
986        cellspacing="0" width="100%" bgcolor="white">
987 <tr><td>
988   <table width="100%" cellpadding="0" cellspacing="0" border="0">
989   <tr valign="top"><td>
990   <h3 class="epydoc"><span class="sig"><span class="sig-name">html_annotate</span>(<span class="sig-arg">doclist</span>,
991         <span class="sig-arg">markup</span>=<span class="sig-default">default_markup</span>)</span>
992   </h3>
993   </td><td align="right" valign="top"
994     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#html_annotate">source&nbsp;code</a></span>&nbsp;
995     </td>
996   </tr></table>
997   
998   <p>doclist should be ordered from oldest to newest, like:</p>
999 <pre class="rst-literal-block">
1000 &gt;&gt;&gt; version1 = 'Hello World'
1001 &gt;&gt;&gt; version2 = 'Goodbye World'
1002 &gt;&gt;&gt; print(html_annotate([(version1, 'version 1'),
1003 ...                      (version2, 'version 2')]))
1004 &lt;span title=&quot;version 2&quot;&gt;Goodbye&lt;/span&gt; &lt;span title=&quot;version 1&quot;&gt;World&lt;/span&gt;
1005 </pre>
1006 <p>The documents must be <em>fragments</em> (str/UTF8 or unicode), not
1007 complete documents.</p>
1008 <p>The markup argument is a function to markup the spans of words.
1009 This function is called like markup('Hello', 'version 2'), and
1010 returns HTML.  The first argument is text and never includes any
1011 markup.  The default uses a span with a title:</p>
1012 <blockquote>
1013 <pre class="py-doctest">
1014 <span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">print</span>(default_markup(<span class="py-string">'Some Text'</span>, <span class="py-string">'by Joe'</span>))
1015 <span class="py-output">&lt;span title=&quot;by Joe&quot;&gt;Some Text&lt;/span&gt;</span></pre>
1016 </blockquote>
1017   <dl class="fields">
1018   </dl>
1019 </td></tr></table>
1020 </div>
1021 <a name="htmldiff"></a>
1022 <div>
1023 <table class="details" border="1" cellpadding="3"
1024        cellspacing="0" width="100%" bgcolor="white">
1025 <tr><td>
1026   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1027   <tr valign="top"><td>
1028   <h3 class="epydoc"><span class="sig"><span class="sig-name">htmldiff</span>(<span class="sig-arg">old_html</span>,
1029         <span class="sig-arg">new_html</span>)</span>
1030   </h3>
1031   </td><td align="right" valign="top"
1032     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#htmldiff">source&nbsp;code</a></span>&nbsp;
1033     </td>
1034   </tr></table>
1035   
1036   <p>Do a diff of the old and new document.  The documents are HTML
1037 <em>fragments</em> (str/UTF8 or unicode); they are not complete documents
1038 (i.e., no &lt;html&gt; tag).</p>
1039 <p>Returns HTML with &lt;ins&gt; and &lt;del&gt; tags added around the
1040 appropriate text.</p>
1041 <p>Markup is generally ignored, with the markup from new_html
1042 preserved, and possibly some markup from old_html (though it is
1043 considered acceptable to lose some of the old markup).  Only the
1044 words in the HTML are diffed.  The exception is &lt;img&gt; tags, which
1045 are treated like words, and the href attribute of &lt;a&gt; tags, which
1046 are noted inside the tag itself when there are changes.</p>
1047   <dl class="fields">
1048   </dl>
1049 </td></tr></table>
1050 </div>
1051 <a name="cleanup_delete"></a>
1052 <div class="private">
1053 <table class="details" border="1" cellpadding="3"
1054        cellspacing="0" width="100%" bgcolor="white">
1055 <tr><td>
1056   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1057   <tr valign="top"><td>
1058   <h3 class="epydoc"><span class="sig"><span class="sig-name">cleanup_delete</span>(<span class="sig-arg">chunks</span>)</span>
1059   </h3>
1060   </td><td align="right" valign="top"
1061     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#cleanup_delete">source&nbsp;code</a></span>&nbsp;
1062     </td>
1063   </tr></table>
1064   
1065   <p>Cleans up any DEL_START/DEL_END markers in the document, replacing
1066 them with &lt;del&gt;&lt;/del&gt;.  To do this while keeping the document
1067 valid, it may need to drop some tags (either start or end tags).</p>
1068 <p>It may also move the del into adjacent tags to try to move it to a
1069 similar location where it was originally located (e.g., moving a
1070 delete into preceding &lt;div&gt; tag, if the del looks like (DEL_START,
1071 'Text&lt;/div&gt;', DEL_END)).</p>
1072   <dl class="fields">
1073   </dl>
1074 </td></tr></table>
1075 </div>
1076 <a name="split_unbalanced"></a>
1077 <div class="private">
1078 <table class="details" border="1" cellpadding="3"
1079        cellspacing="0" width="100%" bgcolor="white">
1080 <tr><td>
1081   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1082   <tr valign="top"><td>
1083   <h3 class="epydoc"><span class="sig"><span class="sig-name">split_unbalanced</span>(<span class="sig-arg">chunks</span>)</span>
1084   </h3>
1085   </td><td align="right" valign="top"
1086     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#split_unbalanced">source&nbsp;code</a></span>&nbsp;
1087     </td>
1088   </tr></table>
1089   
1090   <p>Return (unbalanced_start, balanced, unbalanced_end), where each is
1091 a list of text and tag chunks.</p>
1092 <p>unbalanced_start is a list of all the tags that are opened, but
1093 not closed in this span.  Similarly, unbalanced_end is a list of
1094 tags that are closed but were not opened.  Extracting these might
1095 mean some reordering of the chunks.</p>
1096   <dl class="fields">
1097   </dl>
1098 </td></tr></table>
1099 </div>
1100 <a name="locate_unbalanced_start"></a>
1101 <div class="private">
1102 <table class="details" border="1" cellpadding="3"
1103        cellspacing="0" width="100%" bgcolor="white">
1104 <tr><td>
1105   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1106   <tr valign="top"><td>
1107   <h3 class="epydoc"><span class="sig"><span class="sig-name">locate_unbalanced_start</span>(<span class="sig-arg">unbalanced_start</span>,
1108         <span class="sig-arg">pre_delete</span>,
1109         <span class="sig-arg">post_delete</span>)</span>
1110   </h3>
1111   </td><td align="right" valign="top"
1112     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#locate_unbalanced_start">source&nbsp;code</a></span>&nbsp;
1113     </td>
1114   </tr></table>
1115   
1116   <p>pre_delete and post_delete implicitly point to a place in the
1117 document (where the two were split).  This moves that point (by
1118 popping items from one and pushing them onto the other).  It moves
1119 the point to try to find a place where unbalanced_start applies.</p>
1120 <p>As an example:</p>
1121 <pre class="rst-literal-block">
1122 &gt;&gt;&gt; unbalanced_start = ['&lt;div&gt;']
1123 &gt;&gt;&gt; doc = ['&lt;p&gt;', 'Text', '&lt;/p&gt;', '&lt;div&gt;', 'More Text', '&lt;/div&gt;']
1124 &gt;&gt;&gt; pre, post = doc[:3], doc[3:]
1125 &gt;&gt;&gt; pre, post
1126 (['&lt;p&gt;', 'Text', '&lt;/p&gt;'], ['&lt;div&gt;', 'More Text', '&lt;/div&gt;'])
1127 &gt;&gt;&gt; locate_unbalanced_start(unbalanced_start, pre, post)
1128 &gt;&gt;&gt; pre, post
1129 (['&lt;p&gt;', 'Text', '&lt;/p&gt;', '&lt;div&gt;'], ['More Text', '&lt;/div&gt;'])
1130 </pre>
1131 <p>As you can see, we moved the point so that the dangling &lt;div&gt; that
1132 we found will be effectively replaced by the div in the original
1133 document.  If this doesn't work out, we just throw away
1134 unbalanced_start without doing anything.</p>
1135   <dl class="fields">
1136   </dl>
1137 </td></tr></table>
1138 </div>
1139 <a name="tokenize"></a>
1140 <div class="private">
1141 <table class="details" border="1" cellpadding="3"
1142        cellspacing="0" width="100%" bgcolor="white">
1143 <tr><td>
1144   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1145   <tr valign="top"><td>
1146   <h3 class="epydoc"><span class="sig"><span class="sig-name">tokenize</span>(<span class="sig-arg">html</span>,
1147         <span class="sig-arg">include_hrefs</span>=<span class="sig-default">True</span>)</span>
1148   </h3>
1149   </td><td align="right" valign="top"
1150     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#tokenize">source&nbsp;code</a></span>&nbsp;
1151     </td>
1152   </tr></table>
1153   
1154   <p>Parses the given HTML and returns token objects (words with attached tags).</p>
1155 <p>This parses only the content of a page; anything in the head is
1156 ignored, and the &lt;head&gt; and &lt;body&gt; elements are themselves
1157 optional.  The content is then parsed by lxml, which ensures the
1158 validity of the resulting parsed document (though lxml may make
1159 incorrect guesses when the markup is particularly bad).</p>
1160 <p>&lt;ins&gt; and &lt;del&gt; tags are also eliminated from the document, as
1161 that gets confusing.</p>
1162 <p>If include_hrefs is true, then the href attribute of &lt;a&gt; tags is
1163 included as a special kind of diffable token.</p>
1164   <dl class="fields">
1165   </dl>
1166 </td></tr></table>
1167 </div>
1168 <a name="parse_html"></a>
1169 <div class="private">
1170 <table class="details" border="1" cellpadding="3"
1171        cellspacing="0" width="100%" bgcolor="white">
1172 <tr><td>
1173   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1174   <tr valign="top"><td>
1175   <h3 class="epydoc"><span class="sig"><span class="sig-name">parse_html</span>(<span class="sig-arg">html</span>,
1176         <span class="sig-arg">cleanup</span>=<span class="sig-default">True</span>)</span>
1177   </h3>
1178   </td><td align="right" valign="top"
1179     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#parse_html">source&nbsp;code</a></span>&nbsp;
1180     </td>
1181   </tr></table>
1182   
1183   <p>Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
1184 wrapped in a &lt;div&gt; tag that was not in the original document.</p>
1185 <p>If cleanup is true, make sure there's no &lt;head&gt; or &lt;body&gt;, and get
1186 rid of any &lt;ins&gt; and &lt;del&gt; tags.</p>
1187   <dl class="fields">
1188   </dl>
1189 </td></tr></table>
1190 </div>
1191 <a name="split_trailing_whitespace"></a>
1192 <div class="private">
1193 <table class="details" border="1" cellpadding="3"
1194        cellspacing="0" width="100%" bgcolor="white">
1195 <tr><td>
1196   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1197   <tr valign="top"><td>
1198   <h3 class="epydoc"><span class="sig"><span class="sig-name">split_trailing_whitespace</span>(<span class="sig-arg">word</span>)</span>
1199   </h3>
1200   </td><td align="right" valign="top"
1201     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#split_trailing_whitespace">source&nbsp;code</a></span>&nbsp;
1202     </td>
1203   </tr></table>
1204   
1205   <p>
1206 This function takes a word, such as 'test\n\n'
1207 and returns ('test','\n\n')
1208 </p>
1209   <dl class="fields">
1210   </dl>
1211 </td></tr></table>
1212 </div>
1213 <a name="flatten_el"></a>
1214 <div class="private">
1215 <table class="details" border="1" cellpadding="3"
1216        cellspacing="0" width="100%" bgcolor="white">
1217 <tr><td>
1218   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1219   <tr valign="top"><td>
1220   <h3 class="epydoc"><span class="sig"><span class="sig-name">flatten_el</span>(<span class="sig-arg">el</span>,
1221         <span class="sig-arg">include_hrefs</span>,
1222         <span class="sig-arg">skip_tag</span>=<span class="sig-default">False</span>)</span>
1223   </h3>
1224   </td><td align="right" valign="top"
1225     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#flatten_el">source&nbsp;code</a></span>&nbsp;
1226     </td>
1227   </tr></table>
1228   
1229   <p>Takes an lxml element el, and generates all the text chunks for
1230 that tag.  Each start tag is a chunk, each word is a chunk, and each
1231 end tag is a chunk.</p>
1232 <p>If skip_tag is true, then the outermost container tag is
1233 not returned (just its contents).</p>
1234   <dl class="fields">
1235   </dl>
1236 </td></tr></table>
1237 </div>
1238 <a name="serialize_html_fragment"></a>
1239 <div class="private">
1240 <table class="details" border="1" cellpadding="3"
1241        cellspacing="0" width="100%" bgcolor="white">
1242 <tr><td>
1243   <table width="100%" cellpadding="0" cellspacing="0" border="0">
1244   <tr valign="top"><td>
1245   <h3 class="epydoc"><span class="sig"><span class="sig-name">serialize_html_fragment</span>(<span class="sig-arg">el</span>,
1246         <span class="sig-arg">skip_outer</span>=<span class="sig-default">False</span>)</span>
1247   </h3>
1248   </td><td align="right" valign="top"
1249     ><span class="codelink"><a href="lxml.html.diff-pysrc.html#serialize_html_fragment">source&nbsp;code</a></span>&nbsp;
1250     </td>
1251   </tr></table>
1252   
1253   <p>Serialize a single lxml element as HTML.  The serialized form
1254 includes the element's tail.</p>
1255 <p>If skip_outer is true, then don't serialize the outermost tag.</p>
1256   <dl class="fields">
1257   </dl>
1258 </td></tr></table>
1259 </div>
1260 <br />
1261 <!-- ==================== VARIABLES DETAILS ==================== -->
1262 <a name="section-VariablesDetails"></a>
1263 <table class="details" border="1" cellpadding="3"
1264        cellspacing="0" width="100%" bgcolor="white">
1265 <tr bgcolor="#70b0f0" class="table-header">
1266   <td colspan="2" class="table-header">
1267     <table border="0" cellpadding="0" cellspacing="0" width="100%">
1268       <tr valign="top">
1269         <td align="left"><span class="table-header">Variables Details</span></td>
1270         <td align="right" valign="top"
1271          ><span class="options">[<a href="#section-VariablesDetails"
1272          class="privatelink" onclick="toggle_private();"
1273          >hide private</a>]</span></td>
1274       </tr>
1275     </table>
1276   </td>
1277 </tr>
1278 </table>
1279 <a name="empty_tags"></a>
1280 <div class="private">
1281 <table class="details" border="1" cellpadding="3"
1282        cellspacing="0" width="100%" bgcolor="white">
1283 <tr><td>
1284   <h3 class="epydoc">empty_tags</h3>
1285   
1286   <dl class="fields">
1287   </dl>
1288   <dl class="fields">
1289     <dt>Value:</dt>
1290       <dd><table><tr><td><pre class="variable">
1291 <code class="variable-group">(</code><code class="variable-quote">u'</code><code class="variable-string">param</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1292  <code class="variable-quote">u'</code><code class="variable-string">img</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1293  <code class="variable-quote">u'</code><code class="variable-string">area</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1294  <code class="variable-quote">u'</code><code class="variable-string">br</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1295  <code class="variable-quote">u'</code><code class="variable-string">basefont</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1296  <code class="variable-quote">u'</code><code class="variable-string">input</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1297  <code class="variable-quote">u'</code><code class="variable-string">base</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1298  <code class="variable-quote">u'</code><code class="variable-string">meta</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1299 <code class="variable-ellipsis">...</code>
1300 </pre></td></tr></table>
1301 </dd>
1302   </dl>
1303 </td></tr></table>
1304 </div>
1305 <a name="block_level_tags"></a>
1306 <div class="private">
1307 <table class="details" border="1" cellpadding="3"
1308        cellspacing="0" width="100%" bgcolor="white">
1309 <tr><td>
1310   <h3 class="epydoc">block_level_tags</h3>
1311   
1312   <dl class="fields">
1313   </dl>
1314   <dl class="fields">
1315     <dt>Value:</dt>
1316       <dd><table><tr><td><pre class="variable">
1317 <code class="variable-group">(</code><code class="variable-quote">u'</code><code class="variable-string">address</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1318  <code class="variable-quote">u'</code><code class="variable-string">blockquote</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1319  <code class="variable-quote">u'</code><code class="variable-string">center</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1320  <code class="variable-quote">u'</code><code class="variable-string">dir</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1321  <code class="variable-quote">u'</code><code class="variable-string">div</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1322  <code class="variable-quote">u'</code><code class="variable-string">dl</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1323  <code class="variable-quote">u'</code><code class="variable-string">fieldset</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1324  <code class="variable-quote">u'</code><code class="variable-string">form</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1325 <code class="variable-ellipsis">...</code>
1326 </pre></td></tr></table>
1327 </dd>
1328   </dl>
1329 </td></tr></table>
1330 </div>
1331 <a name="block_level_container_tags"></a>
1332 <div class="private">
1333 <table class="details" border="1" cellpadding="3"
1334        cellspacing="0" width="100%" bgcolor="white">
1335 <tr><td>
1336   <h3 class="epydoc">block_level_container_tags</h3>
1337   
1338   <dl class="fields">
1339   </dl>
1340   <dl class="fields">
1341     <dt>Value:</dt>
1342       <dd><table><tr><td><pre class="variable">
1343 <code class="variable-group">(</code><code class="variable-quote">u'</code><code class="variable-string">dd</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1344  <code class="variable-quote">u'</code><code class="variable-string">dt</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1345  <code class="variable-quote">u'</code><code class="variable-string">frameset</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1346  <code class="variable-quote">u'</code><code class="variable-string">li</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1347  <code class="variable-quote">u'</code><code class="variable-string">tbody</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1348  <code class="variable-quote">u'</code><code class="variable-string">td</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1349  <code class="variable-quote">u'</code><code class="variable-string">tfoot</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1350  <code class="variable-quote">u'</code><code class="variable-string">th</code><code class="variable-quote">'</code><code class="variable-op">,</code>
1351 <code class="variable-ellipsis">...</code>
1352 </pre></td></tr></table>
1353 </dd>
1354   </dl>
1355 </td></tr></table>
1356 </div>
1357 <a name="__test__"></a>
1358 <div class="private">
1359 <table class="details" border="1" cellpadding="3"
1360        cellspacing="0" width="100%" bgcolor="white">
1361 <tr><td>
1362   <h3 class="epydoc">__test__</h3>
1363   
1364   <dl class="fields">
1365   </dl>
1366   <dl class="fields">
1367     <dt>Value:</dt>
1368       <dd><table><tr><td><pre class="variable">
1369 <code class="variable-group">{</code><code class="variable-quote">u'</code><code class="variable-string">html_annotate (line 35)</code><code class="variable-quote">'</code><code class="variable-op">: </code><code class="variable-quote">u'''</code><code class="variable-string"></code>
1370 <code class="variable-string">    doclist should be ordered from oldest to newest, like::</code>
1371 <code class="variable-string"></code>
1372 <code class="variable-string">        &gt;&gt;&gt; version1 = 'Hello World'</code>
1373 <code class="variable-string">        &gt;&gt;&gt; version2 = 'Goodbye World'</code>
1374 <code class="variable-string">        &gt;&gt;&gt; print(html_annotate([(version1, 'version 1'),</code>
1375 <code class="variable-string">        ...                      (version2, 'version 2')]))</code>
1376 <code class="variable-string">        &lt;span title=&quot;version 2&quot;&gt;Goodbye&lt;/span&gt; &lt;span title=&quot;version 1&quot;</code><span class="variable-linewrap"><img src="crarr.png" alt="\" /></span>
1377 <code class="variable-ellipsis">...</code>
1378 </pre></td></tr></table>
1379 </dd>
1380   </dl>
1381 </td></tr></table>
1382 </div>
1383 <br />
1384 <!-- ==================== NAVIGATION BAR ==================== -->
1385 <table class="navbar" border="0" width="100%" cellpadding="0"
1386        bgcolor="#a0c0ff" cellspacing="0">
1387   <tr valign="middle">
1388   <!-- Home link -->
1389       <th>&nbsp;&nbsp;&nbsp;<a
1390         href="lxml-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
1391
1392   <!-- Tree link -->
1393       <th>&nbsp;&nbsp;&nbsp;<a
1394         href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
1395
1396   <!-- Index link -->
1397       <th>&nbsp;&nbsp;&nbsp;<a
1398         href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
1399
1400   <!-- Help link -->
1401       <th>&nbsp;&nbsp;&nbsp;<a
1402         href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
1403
1404   <!-- Project homepage -->
1405       <th class="navbar" align="right" width="100%">
1406         <table border="0" cellpadding="0" cellspacing="0">
1407           <tr><th class="navbar" align="center"
1408             ><a class="navbar" target="_top" href="/">lxml API</a></th>
1409           </tr></table></th>
1410   </tr>
1411 </table>
1412 <table border="0" cellpadding="0" cellspacing="0" width="100%">
1413   <tr>
1414     <td align="left" class="footer">
1415     Generated by Epydoc 3.0.1
1416     on Thu Jul  9 18:29:53 2020
1417     </td>
1418     <td align="right" class="footer">
1419       <a target="mainFrame" href="http://epydoc.sourceforge.net"
1420         >http://epydoc.sourceforge.net</a>
1421     </td>
1422   </tr>
1423 </table>
1424
1425 <script type="text/javascript">
1426   <!--
1427   // Private objects are initially displayed (because if
1428   // javascript is turned off then we want them to be
1429   // visible); but by default, we want to hide them.  So hide
1430   // them unless we have a cookie that says to show them.
1431   checkCookie();
1432   // -->
1433 </script>
1434 </body>
1435 </html>