Imported Upstream version 2.3.5
[platform/upstream/python-lxml.git] / doc / s5 / lxml-ep2008.html
1 <?xml version="1.0" encoding="utf-8" ?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
6 <meta name="generator" content="Docutils 0.8.1: http://docutils.sourceforge.net/" />
7 <meta name="version" content="S5 1.1" />
8 <title>Implementing XML languages with lxml</title>
9 <style type="text/css">
10
11 /*
12 :Author: David Goodger (goodger@python.org)
13 :Id: $Id: html4css1.css 7056 2011-06-17 10:50:48Z milde $
14 :Copyright: This stylesheet has been placed in the public domain.
15
16 Default cascading style sheet for the HTML output of Docutils.
17
18 See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
19 customize this style sheet.
20 */
21
22 /* used to remove borders from tables and images */
23 .borderless, table.borderless td, table.borderless th {
24   border: 0 }
25
26 table.borderless td, table.borderless th {
27   /* Override padding for "table.docutils td" with "! important".
28      The right padding separates the table cells. */
29   padding: 0 0.5em 0 0 ! important }
30
31 .first {
32   /* Override more specific margin styles with "! important". */
33   margin-top: 0 ! important }
34
35 .last, .with-subtitle {
36   margin-bottom: 0 ! important }
37
38 .hidden {
39   display: none }
40
41 a.toc-backref {
42   text-decoration: none ;
43   color: black }
44
45 blockquote.epigraph {
46   margin: 2em 5em ; }
47
48 dl.docutils dd {
49   margin-bottom: 0.5em }
50
51 object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] {
52   overflow: hidden;
53 }
54
55 /* Uncomment (and remove this text!) to get bold-faced definition list terms
56 dl.docutils dt {
57   font-weight: bold }
58 */
59
60 div.abstract {
61   margin: 2em 5em }
62
63 div.abstract p.topic-title {
64   font-weight: bold ;
65   text-align: center }
66
67 div.admonition, div.attention, div.caution, div.danger, div.error,
68 div.hint, div.important, div.note, div.tip, div.warning {
69   margin: 2em ;
70   border: medium outset ;
71   padding: 1em }
72
73 div.admonition p.admonition-title, div.hint p.admonition-title,
74 div.important p.admonition-title, div.note p.admonition-title,
75 div.tip p.admonition-title {
76   font-weight: bold ;
77   font-family: sans-serif }
78
79 div.attention p.admonition-title, div.caution p.admonition-title,
80 div.danger p.admonition-title, div.error p.admonition-title,
81 div.warning p.admonition-title {
82   color: red ;
83   font-weight: bold ;
84   font-family: sans-serif }
85
86 /* Uncomment (and remove this text!) to get reduced vertical space in
87    compound paragraphs.
88 div.compound .compound-first, div.compound .compound-middle {
89   margin-bottom: 0.5em }
90
91 div.compound .compound-last, div.compound .compound-middle {
92   margin-top: 0.5em }
93 */
94
95 div.dedication {
96   margin: 2em 5em ;
97   text-align: center ;
98   font-style: italic }
99
100 div.dedication p.topic-title {
101   font-weight: bold ;
102   font-style: normal }
103
104 div.figure {
105   margin-left: 2em ;
106   margin-right: 2em }
107
108 div.footer, div.header {
109   clear: both;
110   font-size: smaller }
111
112 div.line-block {
113   display: block ;
114   margin-top: 1em ;
115   margin-bottom: 1em }
116
117 div.line-block div.line-block {
118   margin-top: 0 ;
119   margin-bottom: 0 ;
120   margin-left: 1.5em }
121
122 div.sidebar {
123   margin: 0 0 0.5em 1em ;
124   border: medium outset ;
125   padding: 1em ;
126   background-color: #ffffee ;
127   width: 40% ;
128   float: right ;
129   clear: right }
130
131 div.sidebar p.rubric {
132   font-family: sans-serif ;
133   font-size: medium }
134
135 div.system-messages {
136   margin: 5em }
137
138 div.system-messages h1 {
139   color: red }
140
141 div.system-message {
142   border: medium outset ;
143   padding: 1em }
144
145 div.system-message p.system-message-title {
146   color: red ;
147   font-weight: bold }
148
149 div.topic {
150   margin: 2em }
151
152 h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
153 h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
154   margin-top: 0.4em }
155
156 h1.title {
157   text-align: center }
158
159 h2.subtitle {
160   text-align: center }
161
162 hr.docutils {
163   width: 75% }
164
165 img.align-left, .figure.align-left, object.align-left {
166   clear: left ;
167   float: left ;
168   margin-right: 1em }
169
170 img.align-right, .figure.align-right, object.align-right {
171   clear: right ;
172   float: right ;
173   margin-left: 1em }
174
175 img.align-center, .figure.align-center, object.align-center {
176   display: block;
177   margin-left: auto;
178   margin-right: auto;
179 }
180
181 .align-left {
182   text-align: left }
183
184 .align-center {
185   clear: both ;
186   text-align: center }
187
188 .align-right {
189   text-align: right }
190
191 /* reset inner alignment in figures */
192 div.align-right {
193   text-align: inherit }
194
195 /* div.align-center * { */
196 /*   text-align: left } */
197
198 ol.simple, ul.simple {
199   margin-bottom: 1em }
200
201 ol.arabic {
202   list-style: decimal }
203
204 ol.loweralpha {
205   list-style: lower-alpha }
206
207 ol.upperalpha {
208   list-style: upper-alpha }
209
210 ol.lowerroman {
211   list-style: lower-roman }
212
213 ol.upperroman {
214   list-style: upper-roman }
215
216 p.attribution {
217   text-align: right ;
218   margin-left: 50% }
219
220 p.caption {
221   font-style: italic }
222
223 p.credits {
224   font-style: italic ;
225   font-size: smaller }
226
227 p.label {
228   white-space: nowrap }
229
230 p.rubric {
231   font-weight: bold ;
232   font-size: larger ;
233   color: maroon ;
234   text-align: center }
235
236 p.sidebar-title {
237   font-family: sans-serif ;
238   font-weight: bold ;
239   font-size: larger }
240
241 p.sidebar-subtitle {
242   font-family: sans-serif ;
243   font-weight: bold }
244
245 p.topic-title {
246   font-weight: bold }
247
248 pre.address {
249   margin-bottom: 0 ;
250   margin-top: 0 ;
251   font: inherit }
252
253 pre.literal-block, pre.doctest-block, pre.math {
254   margin-left: 2em ;
255   margin-right: 2em }
256
257 span.classifier {
258   font-family: sans-serif ;
259   font-style: oblique }
260
261 span.classifier-delimiter {
262   font-family: sans-serif ;
263   font-weight: bold }
264
265 span.interpreted {
266   font-family: sans-serif }
267
268 span.option {
269   white-space: nowrap }
270
271 span.pre {
272   white-space: pre }
273
274 span.problematic {
275   color: red }
276
277 span.section-subtitle {
278   /* font-size relative to parent (h1..h6 element) */
279   font-size: 80% }
280
281 table.citation {
282   border-left: solid 1px gray;
283   margin-left: 1px }
284
285 table.docinfo {
286   margin: 2em 4em }
287
288 table.docutils {
289   margin-top: 0.5em ;
290   margin-bottom: 0.5em }
291
292 table.footnote {
293   border-left: solid 1px black;
294   margin-left: 1px }
295
296 table.docutils td, table.docutils th,
297 table.docinfo td, table.docinfo th {
298   padding-left: 0.5em ;
299   padding-right: 0.5em ;
300   vertical-align: top }
301
302 table.docutils th.field-name, table.docinfo th.docinfo-name {
303   font-weight: bold ;
304   text-align: left ;
305   white-space: nowrap ;
306   padding-left: 0 }
307
308 h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
309 h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
310   font-size: 100% }
311
312 ul.auto-toc {
313   list-style-type: none }
314
315 </style>
316 <!-- configuration parameters -->
317 <meta name="defaultView" content="slideshow" />
318 <meta name="controlVis" content="hidden" />
319 <!-- style sheet links -->
320 <script src="ui/default/slides.js" type="text/javascript"></script>
321 <link rel="stylesheet" href="ui/default/slides.css"
322       type="text/css" media="projection" id="slideProj" />
323 <link rel="stylesheet" href="ui/default/outline.css"
324       type="text/css" media="screen" id="outlineStyle" />
325 <link rel="stylesheet" href="ui/default/print.css"
326       type="text/css" media="print" id="slidePrint" />
327 <link rel="stylesheet" href="ui/default/opera.css"
328       type="text/css" media="projection" id="operaFix" />
329 </head>
330 <body>
331 <div class="layout">
332 <div id="controls"></div>
333 <div id="currentSlide"></div>
334 <div id="header">
335
336 </div>
337 <div id="footer">
338 <h1>Implementing XML languages with lxml</h1>
339 <h2>Dr. Stefan Behnel, EuroPython 2008, Vilnius/Lietuva</h2>
340 </div>
341 </div>
342 <div class="presentation">
343 <div class="slide" id="slide0">
344 <h1 class="title">Implementing XML languages with lxml</h1>
345 <h2 class="subtitle" id="dr-stefan-behnel">Dr. Stefan Behnel</h2>
346
347 <p class="center"><a class="reference external" href="http://codespeak.net/lxml/">http://codespeak.net/lxml/</a></p>
348 <p class="center"><a class="reference external" href="mailto:lxml-dev&#64;codespeak.net">lxml-dev&#64;codespeak.net</a></p>
349 <img alt="tagpython.png" class="center" src="tagpython.png" />
350 <!-- Definitions of interpreted text roles (classes) for S5/HTML data. -->
351 <!-- This data file has been placed in the public domain. -->
352 <!-- Colours
353 ======= -->
354 <!-- Text Sizes
355 ========== -->
356 <!-- Display in Slides (Presentation Mode) Only
357 ========================================== -->
358 <!-- Display in Outline Mode Only
359 ============================ -->
360 <!-- Display in Print Only
361 ===================== -->
362 <!-- Display in Handout Mode Only
363 ============================ -->
364 <!-- Incremental Display
365 =================== -->
366
367 </div>
368 <div class="slide" id="what-is-an-xml-language">
369 <h1>What is an »XML language«?</h1>
370 <ul class="simple">
371 <li>a language in XML notation</li>
372 <li>aka »XML dialect«<ul>
373 <li>except that it's not a dialect</li>
374 </ul>
375 </li>
376 <li>Examples:<ul>
377 <li>XML Schema</li>
378 <li>Atom/RSS</li>
379 <li>(X)HTML</li>
380 <li>Open Document Format</li>
381 <li>SOAP</li>
382 <li>... add your own one here</li>
383 </ul>
384 </li>
385 </ul>
386 </div>
387 <div class="slide" id="popular-mistakes-to-avoid-1">
388 <h1>Popular mistakes to avoid (1)</h1>
389 <p>&quot;That's easy, I can use regular expressions!&quot;</p>
390 <p class="incremental center">No, you can't.</p>
391 </div>
392 <div class="slide" id="popular-mistakes-to-avoid-2">
393 <h1>Popular mistakes to avoid (2)</h1>
394 <p>&quot;This is tree data, I'll take the DOM!&quot;</p>
395 </div>
396 <div class="slide" id="id1">
397 <h1>Popular mistakes to avoid (2)</h1>
398 <p>&quot;This is tree data, I'll take the DOM!&quot;</p>
399 <ul class="simple">
400 <li>DOM is ubiquitous, but it's as complicated as Java</li>
401 <li>uglify your application with tons of DOM code to<ul>
402 <li>walk over non-element nodes to find the data you need</li>
403 <li>convert text content to other data types</li>
404 <li>modify the XML tree in memory</li>
405 </ul>
406 </li>
407 </ul>
408 <p>=&gt; write verbose, redundant, hard-to-maintain code</p>
409 </div>
410 <div class="slide" id="popular-mistakes-to-avoid-3">
411 <h1>Popular mistakes to avoid (3)</h1>
412 <p>&quot;SAX is <em>so</em> fast and consumes <em>no</em> memory!&quot;</p>
413 </div>
414 <div class="slide" id="id2">
415 <h1>Popular mistakes to avoid (3)</h1>
416 <p>&quot;SAX is <em>so</em> fast and consumes <em>no</em> memory!&quot;</p>
417 <ul class="simple">
418 <li>but <em>writing</em> SAX code is <em>not</em> fast!</li>
419 <li>write error-prone, state-keeping SAX code to<ul>
420 <li>figure out where you are</li>
421 <li>find the sections you need</li>
422 <li>convert text content to other data types</li>
423 <li>copy the XML data into custom data classes</li>
424 <li>... and don't forget the way back into XML!</li>
425 </ul>
426 </li>
427 </ul>
428 <p>=&gt; write confusing state-machine code</p>
429 <p>=&gt; debugging into existence</p>
430 </div>
431 <div class="slide" id="working-with-xml">
432 <h1>Working with XML</h1>
433 <blockquote>
434 <p><strong>Getting XML work done</strong></p>
435 <p>(instead of getting time wasted)</p>
436 </blockquote>
437 </div>
438 <div class="slide" id="how-can-you-work-with-xml">
439 <h1>How can you work with XML?</h1>
440 <ul class="simple">
441 <li>Preparation:<ul>
442 <li>Implement usable data classes as an abstraction layer</li>
443 <li>Implement a mapping from XML to the data classes</li>
444 <li>Implement a mapping from the data classes to XML</li>
445 </ul>
446 </li>
447 <li>Workflow:<ul>
448 <li>parse XML data</li>
449 <li>map XML data to data classes</li>
450 <li>work with data classes</li>
451 <li>map data classes to XML</li>
452 <li>serialise XML</li>
453 </ul>
454 </li>
455 </ul>
456 <ul class="incremental simple">
457 <li>Approach:<ul>
458 <li>get rid of XML and do everything in your own code</li>
459 </ul>
460 </li>
461 </ul>
462 </div>
463 <div class="slide" id="what-if-you-could-simplify-this">
464 <h1>What if you could simplify this?</h1>
465 <ul class="simple">
466 <li>Preparation:<ul>
467 <li>Extend usable XML API classes into an abstraction layer</li>
468 </ul>
469 </li>
470 <li>Workflow:<ul>
471 <li>parse XML data into XML API classes</li>
472 <li>work with XML API classes</li>
473 <li>serialise XML</li>
474 </ul>
475 </li>
476 </ul>
477 <ul class="incremental simple">
478 <li>Approach:<ul>
479 <li>cover only the quirks of XML and make it work <em>for</em> you</li>
480 </ul>
481 </li>
482 </ul>
483 </div>
484 <div class="slide" id="id3">
485 <h1>What if you could simplify this ...</h1>
486 <ul class="simple">
487 <li>... without sacrificing usability or flexibility?</li>
488 <li>... using a high-speed, full-featured, pythonic XML toolkit?</li>
489 <li>... with the power of XPath, XSLT and XML validation?</li>
490 </ul>
491 <p class="incremental center">... then »lxml« is your friend!</p>
492 </div>
493 <div class="slide" id="overview">
494 <h1>Overview</h1>
495 <ul class="simple">
496 <li>What is lxml?<ul>
497 <li>what &amp; who</li>
498 </ul>
499 </li>
500 <li>How do you use it?<ul>
501 <li>Lesson 0: quick API overview<ul>
502 <li>ElementTree concepts and lxml features</li>
503 </ul>
504 </li>
505 <li>Lesson 1: parse XML<ul>
506 <li>how to get XML data into memory</li>
507 </ul>
508 </li>
509 <li>Lesson 2: generate XML<ul>
510 <li>how to write an XML generator for a language</li>
511 </ul>
512 </li>
513 <li>Lesson 3: working with XML trees made easy<ul>
514 <li>how to write an XML API for a language</li>
515 </ul>
516 </li>
517 </ul>
518 </li>
519 </ul>
520 </div>
521 <div class="slide" id="what-is-lxml">
522 <h1>What is lxml?</h1>
523 <ul class="simple">
524 <li>a fast, full-featured toolkit for XML and HTML handling<ul>
525 <li><a class="reference external" href="http://codespeak.net/lxml/">http://codespeak.net/lxml/</a></li>
526 <li><a class="reference external" href="mailto:lxml-dev&#64;codespeak.net">lxml-dev&#64;codespeak.net</a></li>
527 </ul>
528 </li>
529 <li>based on and inspired by<ul>
530 <li>the C libraries libxml2 and libxslt (by Daniel Veillard)</li>
531 <li>the ElementTree API (by Fredrik Lundh)</li>
532 <li>the Cython compiler (by Robert Bradshaw, Greg Ewing &amp; me)</li>
533 <li>the Python language (by Guido &amp; [<em>paste Misc/ACKS here</em>])</li>
534 <li>user feedback, ideas and patches (by you!)<ul>
535 <li>keep doing that, we love you all!</li>
536 </ul>
537 </li>
538 </ul>
539 </li>
540 <li>maintained and (in major parts) written by myself<ul>
541 <li>initial design and implementation by Martijn Faassen</li>
542 <li>extensive HTML API and tools by Ian Bicking</li>
543 </ul>
544 </li>
545 </ul>
546 </div>
547 <div class="slide" id="what-do-you-get-for-your-money">
548 <h1>What do you get for your money?</h1>
549 <ul class="simple">
550 <li>many tools in one:<ul>
551 <li>Generic, ElementTree compatible XML API: <strong>lxml.etree</strong><ul>
552 <li>but faster for many tasks and much more feature-rich</li>
553 </ul>
554 </li>
555 <li>Special tool set for HTML handling: <strong>lxml.html</strong></li>
556 <li>Special API for pythonic data binding: <strong>lxml.objectify</strong></li>
557 <li>General purpose path languages: XPath and CSS selectors</li>
558 <li>Validation: DTD, XML Schema, RelaxNG, Schematron</li>
559 <li>XSLT, XInclude, C14N, ...</li>
560 <li>Fast tree iteration, event-driven parsing, ...</li>
561 </ul>
562 </li>
563 <li>it's free, but it's worth every €-Cent!<ul>
564 <li>what users say:<ul>
565 <li>»no qualification, I would recommend lxml for just about any
566 HTML task«</li>
567 <li>»THE tool [...] for newbies and experienced developers«</li>
568 <li>»you can do pretty much anything with an intuitive API«</li>
569 <li>»lxml takes all the pain out of XML«</li>
570 </ul>
571 </li>
572 </ul>
573 </li>
574 </ul>
575 </div>
576 <div class="slide" id="lesson-0-a-quick-overview">
577 <h1>Lesson 0: a quick overview</h1>
578 <blockquote>
579 <p>why <strong>»lxml takes all the pain out of XML«</strong></p>
580 <p>(a quick overview of lxml features and ElementTree concepts)</p>
581 </blockquote>
582 <!-- >>> from lxml import etree, cssselect, html
583 >>> some_xml_data  = "<root><speech class='dialog'><p>So be it!</p></speech><p>stuff</p></root>"
584 >>> some_html_data = "<p>Just a quick note<br>next line</p>"
585 >>> xml_tree = etree.XML(some_xml_data)
586 >>> html_tree = html.fragment_fromstring(some_html_data) -->
587 </div>
588 <div class="slide" id="namespaces-in-elementtree">
589 <h1>Namespaces in ElementTree</h1>
590 <ul>
591 <li><p class="first">uses Clark notation:</p>
592 <ul class="simple">
593 <li>wrap namespace URI in <tt class="docutils literal"><span class="pre">{...}</span></tt></li>
594 <li>append the tag name</li>
595 </ul>
596 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">tag</span> <span class="o">=</span> <span class="s">&quot;{http://www.w3.org/the/namespace}tagname&quot;</span>
597 <span class="gp">&gt;&gt;&gt; </span><span class="n">element</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">Element</span><span class="p">(</span><span class="n">tag</span><span class="p">)</span>
598 </pre></div>
599 </li>
600 <li><p class="first">no prefixes!</p>
601 </li>
602 <li><p class="first">a single, self-contained tag identifier</p>
603 </li>
604 </ul>
605 </div>
606 <div class="slide" id="text-content-in-elementtree">
607 <h1>Text content in ElementTree</h1>
608 <ul>
609 <li><p class="first">uses <tt class="docutils literal">.text</tt> and <tt class="docutils literal">.tail</tt> attributes:</p>
610 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">div</span> <span class="o">=</span> <span class="n">html</span><span class="o">.</span><span class="n">fragment_fromstring</span><span class="p">(</span>
611 <span class="gp">... </span>    <span class="s">&quot;&lt;div&gt;&lt;p&gt;a paragraph&lt;br&gt;split in two&lt;/p&gt; parts&lt;/div&gt;&quot;</span><span class="p">)</span>
612 <span class="gp">&gt;&gt;&gt; </span><span class="n">p</span> <span class="o">=</span> <span class="n">div</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
613 <span class="gp">&gt;&gt;&gt; </span><span class="n">br</span> <span class="o">=</span> <span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
614
615 <span class="gp">&gt;&gt;&gt; </span><span class="n">p</span><span class="o">.</span><span class="n">text</span>
616 <span class="go">&#39;a paragraph&#39;</span>
617 <span class="gp">&gt;&gt;&gt; </span><span class="n">br</span><span class="o">.</span><span class="n">text</span>
618 <span class="gp">&gt;&gt;&gt; </span><span class="n">br</span><span class="o">.</span><span class="n">tail</span>
619 <span class="go">&#39;split in two&#39;</span>
620 <span class="gp">&gt;&gt;&gt; </span><span class="n">p</span><span class="o">.</span><span class="n">tail</span>
621 <span class="go">&#39; parts&#39;</span>
622 </pre></div>
623 </li>
624 <li><p class="first">no text nodes!</p>
625 <ul class="simple">
626 <li>simplifies tree traversal a lot</li>
627 <li>simplifies many XML algorithms</li>
628 </ul>
629 </li>
630 </ul>
631 </div>
632 <div class="slide" id="attributes-in-elementtree">
633 <h1>Attributes in ElementTree</h1>
634 <ul>
635 <li><p class="first">uses <tt class="docutils literal">.get()</tt> and <tt class="docutils literal">.set()</tt> methods:</p>
636 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">root</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span>
637 <span class="gp">... </span>    <span class="s">&#39;&lt;root a=&quot;the value&quot; b=&quot;of an&quot; c=&quot;attribute&quot;/&gt;&#39;</span><span class="p">)</span>
638
639 <span class="gp">&gt;&gt;&gt; </span><span class="n">root</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">&#39;a&#39;</span><span class="p">)</span>
640 <span class="go">&#39;the value&#39;</span>
641
642 <span class="gp">&gt;&gt;&gt; </span><span class="n">root</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s">&#39;a&#39;</span><span class="p">,</span> <span class="s">&quot;THE value&quot;</span><span class="p">)</span>
643 <span class="gp">&gt;&gt;&gt; </span><span class="n">root</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">&#39;a&#39;</span><span class="p">)</span>
644 <span class="go">&#39;THE value&#39;</span>
645 </pre></div>
646 </li>
647 <li><p class="first">or the <tt class="docutils literal">.attrib</tt> dictionary property:</p>
648 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">d</span> <span class="o">=</span> <span class="n">root</span><span class="o">.</span><span class="n">attrib</span>
649
650 <span class="gp">&gt;&gt;&gt; </span><span class="nb">list</span><span class="p">(</span><span class="nb">sorted</span><span class="p">(</span><span class="n">d</span><span class="o">.</span><span class="n">keys</span><span class="p">()))</span>
651 <span class="go">[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;]</span>
652 <span class="gp">&gt;&gt;&gt; </span><span class="nb">list</span><span class="p">(</span><span class="nb">sorted</span><span class="p">(</span><span class="n">d</span><span class="o">.</span><span class="n">values</span><span class="p">()))</span>
653 <span class="go">[&#39;THE value&#39;, &#39;attribute&#39;, &#39;of an&#39;]</span>
654 </pre></div>
655 </li>
656 </ul>
657 </div>
658 <div class="slide" id="tree-iteration-in-lxml-etree-1">
659 <h1>Tree iteration in lxml.etree (1)</h1>
660 <!-- >>> import collections -->
661 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">root</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span>
662 <span class="gp">... </span>  <span class="s">&quot;&lt;root&gt; &lt;a&gt;&lt;b/&gt;&lt;b/&gt;&lt;/a&gt; &lt;c&gt;&lt;d/&gt;&lt;e&gt;&lt;f/&gt;&lt;/e&gt;&lt;g/&gt;&lt;/c&gt; &lt;/root&gt;&quot;</span><span class="p">)</span>
663
664 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span><span class="n">child</span><span class="o">.</span><span class="n">tag</span> <span class="k">for</span> <span class="n">child</span> <span class="ow">in</span> <span class="n">root</span><span class="p">])</span>   <span class="c"># children</span>
665 <span class="go">[&#39;a&#39;, &#39;c&#39;]</span>
666
667 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span><span class="n">el</span><span class="o">.</span><span class="n">tag</span> <span class="k">for</span> <span class="n">el</span> <span class="ow">in</span> <span class="n">root</span><span class="o">.</span><span class="n">iter</span><span class="p">()])</span>  <span class="c"># self and descendants</span>
668 <span class="go">[&#39;root&#39;, &#39;a&#39;, &#39;b&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;, &#39;f&#39;, &#39;g&#39;]</span>
669
670 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span><span class="n">el</span><span class="o">.</span><span class="n">tag</span> <span class="k">for</span> <span class="n">el</span> <span class="ow">in</span> <span class="n">root</span><span class="o">.</span><span class="n">iterdescendants</span><span class="p">()])</span>
671 <span class="go">[&#39;a&#39;, &#39;b&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;, &#39;f&#39;, &#39;g&#39;]</span>
672
673
674 <span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">iter_breadth_first</span><span class="p">(</span><span class="n">root</span><span class="p">):</span>
675 <span class="gp">... </span>    <span class="n">bfs_queue</span> <span class="o">=</span> <span class="n">collections</span><span class="o">.</span><span class="n">deque</span><span class="p">([</span><span class="n">root</span><span class="p">])</span>
676 <span class="gp">... </span>    <span class="k">while</span> <span class="n">bfs_queue</span><span class="p">:</span>
677 <span class="gp">... </span>        <span class="n">el</span> <span class="o">=</span> <span class="n">bfs_queue</span><span class="o">.</span><span class="n">popleft</span><span class="p">()</span>  <span class="c"># pop next element</span>
678 <span class="gp">... </span>        <span class="n">bfs_queue</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">el</span><span class="p">)</span>      <span class="c"># append its children</span>
679 <span class="gp">... </span>        <span class="k">yield</span> <span class="n">el</span>
680
681 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span><span class="n">el</span><span class="o">.</span><span class="n">tag</span> <span class="k">for</span> <span class="n">el</span> <span class="ow">in</span> <span class="n">iter_breadth_first</span><span class="p">(</span><span class="n">root</span><span class="p">)])</span>
682 <span class="go">[&#39;root&#39;, &#39;a&#39;, &#39;c&#39;, &#39;b&#39;, &#39;b&#39;, &#39;d&#39;, &#39;e&#39;, &#39;g&#39;, &#39;f&#39;]</span>
683 </pre></div>
684 </div>
685 <div class="slide" id="tree-iteration-in-lxml-etree-2">
686 <h1>Tree iteration in lxml.etree (2)</h1>
687 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">root</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span>
688 <span class="gp">... </span>  <span class="s">&quot;&lt;root&gt; &lt;a&gt;&lt;b/&gt;&lt;b/&gt;&lt;/a&gt; &lt;c&gt;&lt;d/&gt;&lt;e&gt;&lt;f/&gt;&lt;/e&gt;&lt;g/&gt;&lt;/c&gt; &lt;/root&gt;&quot;</span><span class="p">)</span>
689
690 <span class="gp">&gt;&gt;&gt; </span><span class="n">tree_walker</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">iterwalk</span><span class="p">(</span><span class="n">root</span><span class="p">,</span> <span class="n">events</span><span class="o">=</span><span class="p">(</span><span class="s">&#39;start&#39;</span><span class="p">,</span> <span class="s">&#39;end&#39;</span><span class="p">))</span>
691
692 <span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="p">(</span><span class="n">event</span><span class="p">,</span> <span class="n">element</span><span class="p">)</span> <span class="ow">in</span> <span class="n">tree_walker</span><span class="p">:</span>
693 <span class="gp">... </span>    <span class="k">print</span><span class="p">(</span><span class="s">&quot;</span><span class="si">%s</span><span class="s"> (</span><span class="si">%s</span><span class="s">)&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">element</span><span class="o">.</span><span class="n">tag</span><span class="p">,</span> <span class="n">event</span><span class="p">))</span>
694 <span class="go">root (start)</span>
695 <span class="go">a (start)</span>
696 <span class="go">b (start)</span>
697 <span class="go">b (end)</span>
698 <span class="go">b (start)</span>
699 <span class="go">b (end)</span>
700 <span class="go">a (end)</span>
701 <span class="go">c (start)</span>
702 <span class="go">d (start)</span>
703 <span class="go">d (end)</span>
704 <span class="go">e (start)</span>
705 <span class="go">f (start)</span>
706 <span class="go">f (end)</span>
707 <span class="go">e (end)</span>
708 <span class="go">g (start)</span>
709 <span class="go">g (end)</span>
710 <span class="go">c (end)</span>
711 <span class="go">root (end)</span>
712 </pre></div>
713 </div>
714 <div class="slide" id="path-languages-in-lxml">
715 <h1>Path languages in lxml</h1>
716 <div class="highlight"><pre><span class="nt">&lt;root&gt;</span>
717   <span class="nt">&lt;speech</span> <span class="na">class=</span><span class="s">&#39;dialog&#39;</span><span class="nt">&gt;&lt;p&gt;</span>So be it!<span class="nt">&lt;/p&gt;&lt;/speech&gt;</span>
718   <span class="nt">&lt;p&gt;</span>stuff<span class="nt">&lt;/p&gt;</span>
719 <span class="nt">&lt;/root&gt;</span>
720 </pre></div>
721 <ul>
722 <li><p class="first">search it with XPath</p>
723 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">find_paragraphs</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">XPath</span><span class="p">(</span><span class="s">&quot;//p&quot;</span><span class="p">)</span>
724 <span class="gp">&gt;&gt;&gt; </span><span class="n">paragraphs</span> <span class="o">=</span> <span class="n">find_paragraphs</span><span class="p">(</span><span class="n">xml_tree</span><span class="p">)</span>
725
726 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span> <span class="n">p</span><span class="o">.</span><span class="n">text</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">paragraphs</span> <span class="p">])</span>
727 <span class="go">[&#39;So be it!&#39;, &#39;stuff&#39;]</span>
728 </pre></div>
729 </li>
730 <li><p class="first">search it with CSS selectors</p>
731 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">find_dialogs</span> <span class="o">=</span> <span class="n">cssselect</span><span class="o">.</span><span class="n">CSSSelector</span><span class="p">(</span><span class="s">&quot;speech.dialog p&quot;</span><span class="p">)</span>
732 <span class="gp">&gt;&gt;&gt; </span><span class="n">paragraphs</span> <span class="o">=</span> <span class="n">find_dialogs</span><span class="p">(</span><span class="n">xml_tree</span><span class="p">)</span>
733
734 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span> <span class="n">p</span><span class="o">.</span><span class="n">text</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">paragraphs</span> <span class="p">])</span>
735 <span class="go">[&#39;So be it!&#39;]</span>
736 </pre></div>
737 </li>
738 </ul>
739 </div>
740 <div class="slide" id="summary-of-lesson-0">
741 <h1>Summary of lesson 0</h1>
742 <ul class="simple">
743 <li>lxml comes with various tools<ul>
744 <li>that aim to hide the quirks of XML</li>
745 <li>that simplify finding and handling data</li>
746 <li>that make XML a pythonic tool by itself</li>
747 </ul>
748 </li>
749 </ul>
750 </div>
751 <div class="slide" id="lesson-1-parsing-xml-html">
752 <h1>Lesson 1: parsing XML/HTML</h1>
753 <blockquote>
754 <p><strong>The input side</strong></p>
755 <p>(a quick overview)</p>
756 </blockquote>
757 </div>
758 <div class="slide" id="parsing-xml-and-html-from">
759 <h1>Parsing XML and HTML from ...</h1>
760 <ul class="simple">
761 <li>strings: <tt class="docutils literal">fromstring(xml_data)</tt><ul>
762 <li>byte strings, but also unicode strings</li>
763 </ul>
764 </li>
765 <li>filenames: <tt class="docutils literal">parse(filename)</tt></li>
766 <li>HTTP/FTP URLs: <tt class="docutils literal">parse(url)</tt></li>
767 <li>file objects: <tt class="docutils literal">parse(f)</tt><ul>
768 <li><tt class="docutils literal">f = open(filename, 'rb')</tt> !</li>
769 </ul>
770 </li>
771 <li>file-like objects: <tt class="docutils literal">parse(f)</tt><ul>
772 <li>only need a <tt class="docutils literal">f.read(size)</tt> method</li>
773 </ul>
774 </li>
775 <li>data chunks: <tt class="docutils literal">parser.feed(xml_chunk)</tt><ul>
776 <li><tt class="docutils literal">result = parser.close()</tt></li>
777 </ul>
778 </li>
779 </ul>
780 <p class="small right">(parsing from strings and filenames/URLs frees the GIL)</p>
781 </div>
782 <div class="slide" id="example-parsing-from-a-string">
783 <h1>Example: parsing from a string</h1>
784 <ul>
785 <li><p class="first">using the <tt class="docutils literal">fromstring()</tt> function:</p>
786 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">root_element</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="n">some_xml_data</span><span class="p">)</span>
787 </pre></div>
788 </li>
789 <li><p class="first">using the <tt class="docutils literal">fromstring()</tt> function with a specific parser:</p>
790 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">parser</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">HTMLParser</span><span class="p">(</span><span class="n">remove_comments</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
791 <span class="gp">&gt;&gt;&gt; </span><span class="n">root_element</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">fromstring</span><span class="p">(</span><span class="n">some_html_data</span><span class="p">,</span> <span class="n">parser</span><span class="p">)</span>
792 </pre></div>
793 </li>
794 <li><p class="first">or the <tt class="docutils literal">XML()</tt> and <tt class="docutils literal">HTML()</tt> aliases for literals in code:</p>
795 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">root_element</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">XML</span><span class="p">(</span><span class="s">&quot;&lt;root&gt;&lt;child/&gt;&lt;/root&gt;&quot;</span><span class="p">)</span>
796 <span class="gp">&gt;&gt;&gt; </span><span class="n">root_element</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">HTML</span><span class="p">(</span><span class="s">&quot;&lt;p&gt;some&lt;br&gt;paragraph&lt;/p&gt;&quot;</span><span class="p">)</span>
797 </pre></div>
798 </li>
799 </ul>
800 </div>
801 <div class="slide" id="parsing-xml-into">
802 <h1>Parsing XML into ...</h1>
803 <ul class="simple">
804 <li>a tree in memory<ul>
805 <li><tt class="docutils literal">parse()</tt> and <tt class="docutils literal">fromstring()</tt> functions</li>
806 </ul>
807 </li>
808 <li>a tree in memory, but step-by-step with a generator<ul>
809 <li><tt class="docutils literal">iterparse()</tt> generates <tt class="docutils literal">(start/end, element)</tt> events</li>
810 <li>tree can be cleaned up to save space</li>
811 </ul>
812 </li>
813 <li>SAX-like callbacks without building a tree<ul>
814 <li><tt class="docutils literal">parse()</tt> and <tt class="docutils literal">fromstring()</tt> functions</li>
815 <li>pass a <tt class="docutils literal">target</tt> object into the parser</li>
816 </ul>
817 </li>
818 </ul>
819 </div>
820 <div class="slide" id="summary-of-lesson-1">
821 <h1>Summary of lesson 1</h1>
822 <ul class="simple">
823 <li>parsing XML/HTML in lxml is mostly straightforward<ul>
824 <li>simple functions that do the job</li>
825 </ul>
826 </li>
827 <li>advanced use cases are pretty simple<ul>
828 <li>event-driven parsing using <tt class="docutils literal">iterparse()</tt></li>
829 <li>special parser configuration with keyword arguments<ul>
830 <li>configuration is generally local to a parser</li>
831 </ul>
832 </li>
833 </ul>
834 </li>
835 <li>BTW: parsing is <em>very</em> fast, as is serialising<ul>
836 <li>don't hesitate to do parse-serialise-parse cycles</li>
837 </ul>
838 </li>
839 </ul>
840 </div>
841 <div class="slide" id="lesson-2-generating-xml">
842 <h1>Lesson 2: generating XML</h1>
843 <blockquote>
844 <p><strong>The output side</strong></p>
845 <p>(and how to make it safe and simple)</p>
846 </blockquote>
847 </div>
848 <div class="slide" id="the-example-language-atom">
849 <h1>The example language: Atom</h1>
850 <p>The Atom XML format</p>
851 <ul class="simple">
852 <li>Namespace: <a class="reference external" href="http://www.w3.org/2005/Atom">http://www.w3.org/2005/Atom</a></li>
853 <li>IETF standard (RFC 4287) derived from RSS and friends</li>
854 <li>Atom feeds describe news entries and annotated links<ul>
855 <li>a <tt class="docutils literal">feed</tt> contains one or more <tt class="docutils literal">entry</tt> elements</li>
856 <li>an <tt class="docutils literal">entry</tt> contains <tt class="docutils literal">author</tt>, <tt class="docutils literal">link</tt>, <tt class="docutils literal">summary</tt> and/or <tt class="docutils literal">content</tt></li>
857 </ul>
858 </li>
859 </ul>
860 </div>
861 <div class="slide" id="example-generate-xml-1">
862 <h1>Example: generate XML (1)</h1>
863 <p>The ElementMaker (or <em>E-factory</em>)</p>
864 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">lxml.builder</span> <span class="kn">import</span> <span class="n">ElementMaker</span>
865 <span class="gp">&gt;&gt;&gt; </span><span class="n">A</span> <span class="o">=</span> <span class="n">ElementMaker</span><span class="p">(</span><span class="n">namespace</span><span class="o">=</span><span class="s">&quot;http://www.w3.org/2005/Atom&quot;</span><span class="p">,</span>
866 <span class="gp">... </span>                 <span class="n">nsmap</span><span class="o">=</span><span class="p">{</span><span class="bp">None</span> <span class="p">:</span> <span class="s">&quot;http://www.w3.org/2005/Atom&quot;</span><span class="p">})</span>
867 </pre></div>
868 <div class="incremental"><div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">atom</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">feed</span><span class="p">(</span>
869 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">author</span><span class="p">(</span> <span class="n">A</span><span class="o">.</span><span class="n">name</span><span class="p">(</span><span class="s">&quot;Stefan Behnel&quot;</span><span class="p">)</span> <span class="p">),</span>
870 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">entry</span><span class="p">(</span>
871 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">title</span><span class="p">(</span><span class="s">&quot;News from lxml&quot;</span><span class="p">),</span>
872 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">link</span><span class="p">(</span><span class="n">href</span><span class="o">=</span><span class="s">&quot;http://codespeak.net/lxml/&quot;</span><span class="p">),</span>
873 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="s">&quot;See what&#39;s &lt;b&gt;fun&lt;/b&gt; about lxml...&quot;</span><span class="p">,</span>
874 <span class="gp">... </span>              <span class="nb">type</span><span class="o">=</span><span class="s">&quot;html&quot;</span><span class="p">),</span>
875 <span class="gp">... </span>  <span class="p">)</span>
876 <span class="gp">... </span><span class="p">)</span>
877 </pre></div>
878 </div><div class="incremental"><div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">lxml.etree</span> <span class="kn">import</span> <span class="n">tostring</span>
879 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">(</span> <span class="n">tostring</span><span class="p">(</span><span class="n">atom</span><span class="p">,</span> <span class="n">pretty_print</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span> <span class="p">)</span>
880 </pre></div>
881 </div></div>
882 <div class="slide" id="example-generate-xml-2">
883 <h1>Example: generate XML (2)</h1>
884 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">atom</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">feed</span><span class="p">(</span>
885 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">author</span><span class="p">(</span> <span class="n">A</span><span class="o">.</span><span class="n">name</span><span class="p">(</span><span class="s">&quot;Stefan Behnel&quot;</span><span class="p">)</span> <span class="p">),</span>
886 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">entry</span><span class="p">(</span>
887 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">title</span><span class="p">(</span><span class="s">&quot;News from lxml&quot;</span><span class="p">),</span>
888 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">link</span><span class="p">(</span><span class="n">href</span><span class="o">=</span><span class="s">&quot;http://codespeak.net/lxml/&quot;</span><span class="p">),</span>
889 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="s">&quot;See what&#39;s &lt;b&gt;fun&lt;/b&gt; about lxml...&quot;</span><span class="p">,</span>
890 <span class="gp">... </span>              <span class="nb">type</span><span class="o">=</span><span class="s">&quot;html&quot;</span><span class="p">),</span>
891 <span class="gp">... </span>  <span class="p">)</span>
892 <span class="gp">... </span><span class="p">)</span>
893 </pre></div>
894 <div class="highlight"><pre><span class="nt">&lt;feed</span> <span class="na">xmlns=</span><span class="s">&quot;http://www.w3.org/2005/Atom&quot;</span><span class="nt">&gt;</span>
895   <span class="nt">&lt;author&gt;</span>
896     <span class="nt">&lt;name&gt;</span>Stefan Behnel<span class="nt">&lt;/name&gt;</span>
897   <span class="nt">&lt;/author&gt;</span>
898   <span class="nt">&lt;entry&gt;</span>
899     <span class="nt">&lt;title&gt;</span>News from lxml<span class="nt">&lt;/title&gt;</span>
900     <span class="nt">&lt;link</span> <span class="na">href=</span><span class="s">&quot;http://codespeak.net/lxml/&quot;</span><span class="nt">/&gt;</span>
901     <span class="nt">&lt;summary</span> <span class="na">type=</span><span class="s">&quot;html&quot;</span><span class="nt">&gt;</span>See what&#39;s <span class="ni">&amp;lt;</span>b<span class="ni">&amp;gt;</span>fun<span class="ni">&amp;lt;</span>/b<span class="ni">&amp;gt;</span>
902                          about lxml...<span class="nt">&lt;/summary&gt;</span>
903   <span class="nt">&lt;/entry&gt;</span>
904 <span class="nt">&lt;/feed&gt;</span>
905 </pre></div>
906 </div>
907 <div class="slide" id="be-careful-what-you-type">
908 <h1>Be careful what you type!</h1>
909 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">atom</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">feed</span><span class="p">(</span>
910 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">author</span><span class="p">(</span> <span class="n">A</span><span class="o">.</span><span class="n">name</span><span class="p">(</span><span class="s">&quot;Stefan Behnel&quot;</span><span class="p">)</span> <span class="p">),</span>
911 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">entry</span><span class="p">(</span>
912 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">titel</span><span class="p">(</span><span class="s">&quot;News from lxml&quot;</span><span class="p">),</span>
913 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">link</span><span class="p">(</span><span class="n">href</span><span class="o">=</span><span class="s">&quot;http://codespeak.net/lxml/&quot;</span><span class="p">),</span>
914 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="s">&quot;See what&#39;s &lt;b&gt;fun&lt;/b&gt; about lxml...&quot;</span><span class="p">,</span>
915 <span class="gp">... </span>              <span class="nb">type</span><span class="o">=</span><span class="s">&quot;html&quot;</span><span class="p">),</span>
916 <span class="gp">... </span>  <span class="p">)</span>
917 <span class="gp">... </span><span class="p">)</span>
918 </pre></div>
919 <div class="highlight"><pre><span class="nt">&lt;feed</span> <span class="na">xmlns=</span><span class="s">&quot;http://www.w3.org/2005/Atom&quot;</span><span class="nt">&gt;</span>
920   <span class="nt">&lt;author&gt;</span>
921     <span class="nt">&lt;name&gt;</span>Stefan Behnel<span class="nt">&lt;/name&gt;</span>
922   <span class="nt">&lt;/author&gt;</span>
923   <span class="nt">&lt;entry&gt;</span>
924     <span class="nt">&lt;titel&gt;</span>News from lxml<span class="nt">&lt;/titel&gt;</span>
925     <span class="nt">&lt;link</span> <span class="na">href=</span><span class="s">&quot;http://codespeak.net/lxml/&quot;</span><span class="nt">/&gt;</span>
926     <span class="nt">&lt;summary</span> <span class="na">type=</span><span class="s">&quot;html&quot;</span><span class="nt">&gt;</span>See what&#39;s <span class="ni">&amp;lt;</span>b<span class="ni">&amp;gt;</span>fun<span class="ni">&amp;lt;</span>/b<span class="ni">&amp;gt;</span>
927                          about lxml...<span class="nt">&lt;/summary&gt;</span>
928   <span class="nt">&lt;/entry&gt;</span>
929 <span class="nt">&lt;/feed&gt;</span>
930 </pre></div>
931 </div>
932 <div class="slide" id="want-more-type-safety">
933 <h1>Want more 'type safety'?</h1>
934 <p>Write an XML generator <em>module</em> instead:</p>
935 <div class="highlight"><pre><span class="c"># atomgen.py</span>
936
937 <span class="kn">from</span> <span class="nn">lxml</span> <span class="kn">import</span> <span class="n">etree</span>
938 <span class="kn">from</span> <span class="nn">lxml.builder</span> <span class="kn">import</span> <span class="n">ElementMaker</span>
939
940 <span class="n">ATOM_NAMESPACE</span> <span class="o">=</span> <span class="s">&quot;http://www.w3.org/2005/Atom&quot;</span>
941
942 <span class="n">A</span> <span class="o">=</span> <span class="n">ElementMaker</span><span class="p">(</span><span class="n">namespace</span><span class="o">=</span><span class="n">ATOM_NAMESPACE</span><span class="p">,</span>
943                  <span class="n">nsmap</span><span class="o">=</span><span class="p">{</span><span class="bp">None</span> <span class="p">:</span> <span class="n">ATOM_NAMESPACE</span><span class="p">})</span>
944
945 <span class="n">feed</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">feed</span>
946 <span class="n">entry</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">entry</span>
947 <span class="n">title</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">title</span>
948 <span class="c"># ... and so on and so forth ...</span>
949
950
951 <span class="c"># plus a little validation function: isvalid()</span>
952 <span class="n">isvalid</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">RelaxNG</span><span class="p">(</span><span class="nb">file</span><span class="o">=</span><span class="s">&quot;atom.rng&quot;</span><span class="p">)</span>
953 </pre></div>
954 </div>
955 <div class="slide" id="the-atom-generator-module">
956 <h1>The Atom generator module</h1>
957 <!-- >>> import sys
958 >>> sys.path.insert(0, "ep2008") -->
959 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">atomgen</span> <span class="kn">as</span> <span class="nn">A</span>
960
961 <span class="gp">&gt;&gt;&gt; </span><span class="n">atom</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">feed</span><span class="p">(</span>
962 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">author</span><span class="p">(</span> <span class="n">A</span><span class="o">.</span><span class="n">name</span><span class="p">(</span><span class="s">&quot;Stefan Behnel&quot;</span><span class="p">)</span> <span class="p">),</span>
963 <span class="gp">... </span>  <span class="n">A</span><span class="o">.</span><span class="n">entry</span><span class="p">(</span>
964 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">link</span><span class="p">(</span><span class="n">href</span><span class="o">=</span><span class="s">&quot;http://codespeak.net/lxml/&quot;</span><span class="p">),</span>
965 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">title</span><span class="p">(</span><span class="s">&quot;News from lxml&quot;</span><span class="p">),</span>
966 <span class="gp">... </span>    <span class="n">A</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="s">&quot;See what&#39;s &lt;b&gt;fun&lt;/b&gt; about lxml...&quot;</span><span class="p">,</span>
967 <span class="gp">... </span>              <span class="nb">type</span><span class="o">=</span><span class="s">&quot;html&quot;</span><span class="p">),</span>
968 <span class="gp">... </span>  <span class="p">)</span>
969 <span class="gp">... </span><span class="p">)</span>
970
971 <span class="gp">&gt;&gt;&gt; </span><span class="n">A</span><span class="o">.</span><span class="n">isvalid</span><span class="p">(</span><span class="n">atom</span><span class="p">)</span> <span class="c"># ok, forgot the IDs =&gt; invalid XML ...</span>
972 <span class="go">False</span>
973
974 <span class="gp">&gt;&gt;&gt; </span><span class="n">title</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">titel</span><span class="p">(</span><span class="s">&quot;News from lxml&quot;</span><span class="p">)</span>
975 <span class="gt">Traceback (most recent call last):</span>
976   <span class="c">...</span>
977 <span class="gr">AttributeError</span>: <span class="n">&#39;module&#39; object has no attribute &#39;titel&#39;</span>
978 </pre></div>
979 </div>
980 <div class="slide" id="mixing-languages-1">
981 <h1>Mixing languages (1)</h1>
982 <p>Atom can embed <em>serialised</em> HTML</p>
983 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">lxml.html.builder</span> <span class="kn">as</span> <span class="nn">h</span>
984
985 <span class="gp">&gt;&gt;&gt; </span><span class="n">html_fragment</span> <span class="o">=</span> <span class="n">h</span><span class="o">.</span><span class="n">DIV</span><span class="p">(</span>
986 <span class="gp">... </span>  <span class="s">&quot;this is some</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">,</span>
987 <span class="gp">... </span>  <span class="n">h</span><span class="o">.</span><span class="n">A</span><span class="p">(</span><span class="s">&quot;HTML&quot;</span><span class="p">,</span> <span class="n">href</span><span class="o">=</span><span class="s">&quot;http://w3.org/MarkUp/&quot;</span><span class="p">),</span>
988 <span class="gp">... </span>  <span class="s">&quot;</span><span class="se">\n</span><span class="s">content&quot;</span><span class="p">)</span>
989 </pre></div>
990 <div class="incremental"><div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">serialised_html</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">tostring</span><span class="p">(</span><span class="n">html_fragment</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="s">&quot;html&quot;</span><span class="p">)</span>
991
992 <span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="n">serialised_html</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s">&quot;html&quot;</span><span class="p">)</span>
993 </pre></div>
994 </div><div class="incremental"><div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">(</span><span class="n">etree</span><span class="o">.</span><span class="n">tostring</span><span class="p">(</span><span class="n">summary</span><span class="p">))</span>
995 <span class="go">&lt;summary xmlns=&quot;http://www.w3.org/2005/Atom&quot; type=&quot;html&quot;&gt;</span>
996 <span class="go">   &amp;lt;div&amp;gt;this is some</span>
997 <span class="go">   &amp;lt;a href=&quot;http://w3.org/MarkUp/&quot;&amp;gt;HTML&amp;lt;/a&amp;gt;</span>
998 <span class="go">   content&amp;lt;/div&amp;gt;</span>
999 <span class="go">&lt;/summary&gt;</span>
1000 </pre></div>
1001 </div></div>
1002 <div class="slide" id="mixing-languages-2">
1003 <h1>Mixing languages (2)</h1>
1004 <p>Atom can also embed non-escaped XHTML</p>
1005 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">copy</span> <span class="kn">import</span> <span class="n">deepcopy</span>
1006 <span class="gp">&gt;&gt;&gt; </span><span class="n">xhtml_fragment</span> <span class="o">=</span> <span class="n">deepcopy</span><span class="p">(</span><span class="n">html_fragment</span><span class="p">)</span>
1007
1008 <span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">lxml.html</span> <span class="kn">import</span> <span class="n">html_to_xhtml</span>
1009 <span class="gp">&gt;&gt;&gt; </span><span class="n">html_to_xhtml</span><span class="p">(</span><span class="n">xhtml_fragment</span><span class="p">)</span>
1010
1011 <span class="gp">&gt;&gt;&gt; </span><span class="n">summary</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="n">xhtml_fragment</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="s">&quot;xhtml&quot;</span><span class="p">)</span>
1012 </pre></div>
1013 <div class="incremental"><div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">(</span><span class="n">etree</span><span class="o">.</span><span class="n">tostring</span><span class="p">(</span><span class="n">summary</span><span class="p">,</span> <span class="n">pretty_print</span><span class="o">=</span><span class="bp">True</span><span class="p">))</span>
1014 <span class="go">&lt;summary xmlns=&quot;http://www.w3.org/2005/Atom&quot; type=&quot;xhtml&quot;&gt;</span>
1015 <span class="go">  &lt;html:div xmlns:html=&quot;http://www.w3.org/1999/xhtml&quot;&gt;this is some</span>
1016 <span class="go">  &lt;html:a href=&quot;http://w3.org/MarkUp/&quot;&gt;HTML&lt;/html:a&gt;</span>
1017 <span class="go">  content&lt;/html:div&gt;</span>
1018 <span class="go">&lt;/summary&gt;</span>
1019 </pre></div>
1020 </div></div>
1021 <div class="slide" id="summary-of-lesson-2">
1022 <h1>Summary of lesson 2</h1>
1023 <ul class="simple">
1024 <li>generating XML is easy<ul>
1025 <li>use the ElementMaker</li>
1026 </ul>
1027 </li>
1028 <li>wrap it in a module that provides<ul>
1029 <li>the target namespace</li>
1030 <li>an ElementMaker name for each language element</li>
1031 <li>a validator</li>
1032 <li>maybe additional helper functions</li>
1033 </ul>
1034 </li>
1035 <li>mixing languages is easy<ul>
1036 <li>define a generator module for each</li>
1037 </ul>
1038 </li>
1039 </ul>
1040 <p>... this is all you need for the <em>output</em> side of XML languages</p>
1041 </div>
1042 <div class="slide" id="lesson-3-designing-xml-apis">
1043 <h1>Lesson 3: Designing XML APIs</h1>
1044 <blockquote>
1045 <p><strong>The Element API</strong></p>
1046 <p>(and how to make it the way <em>you</em> want)</p>
1047 </blockquote>
1048 </div>
1049 <div class="slide" id="trees-in-c-and-in-python">
1050 <h1>Trees in C and in Python</h1>
1051 <ul class="simple">
1052 <li>Trees have two representations:<ul>
1053 <li>a plain, complete, low-level C tree provided by libxml2</li>
1054 <li>a set of Python Element proxies, each representing one element</li>
1055 </ul>
1056 </li>
1057 <li>Proxies are created on-the-fly:<ul>
1058 <li>lxml creates an Element object for a C node on request</li>
1059 <li>proxies are garbage collected when going out of scope</li>
1060 <li>XML trees are garbage collected when deleting the last proxy</li>
1061 </ul>
1062 </li>
1063 </ul>
1064 <img alt="Python Element proxies referencing nodes of the underlying libxml2 C tree" class="center" src="ep2008/proxies.png" />
1065 </div>
1066 <div class="slide" id="mapping-python-classes-to-nodes">
1067 <h1>Mapping Python classes to nodes</h1>
1068 <ul class="simple">
1069 <li>Proxies can be assigned to XML nodes <em>by user code</em><ul>
1070 <li>lxml tells you about a node, you return a class</li>
1071 </ul>
1072 </li>
1073 </ul>
1074 </div>
1075 <div class="slide" id="example-a-simple-element-class-1">
1076 <h1>Example: a simple Element class (1)</h1>
1077 <ul>
1078 <li><p class="first">define a subclass of ElementBase</p>
1079 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="k">class</span> <span class="nc">HonkElement</span><span class="p">(</span><span class="n">etree</span><span class="o">.</span><span class="n">ElementBase</span><span class="p">):</span>
1080 <span class="gp">... </span>   <span class="nd">@property</span>
1081 <span class="gp">... </span>   <span class="k">def</span> <span class="nf">honking</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
1082 <span class="gp">... </span>      <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">&#39;honking&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="s">&#39;true&#39;</span>
1083 </pre></div>
1084 </li>
1085 <li><p class="first">let it replace the default Element class</p>
1086 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">lookup</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">ElementDefaultClassLookup</span><span class="p">(</span>
1087 <span class="gp">... </span>                            <span class="n">element</span><span class="o">=</span><span class="n">HonkElement</span><span class="p">)</span>
1088
1089 <span class="gp">&gt;&gt;&gt; </span><span class="n">parser</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">XMLParser</span><span class="p">()</span>
1090 <span class="gp">&gt;&gt;&gt; </span><span class="n">parser</span><span class="o">.</span><span class="n">set_element_class_lookup</span><span class="p">(</span><span class="n">lookup</span><span class="p">)</span>
1091 </pre></div>
1092 </li>
1093 </ul>
1094 </div>
1095 <div class="slide" id="example-a-simple-element-class-2">
1096 <h1>Example: a simple Element class (2)</h1>
1097 <ul>
1098 <li><p class="first">use the new Element class</p>
1099 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="n">root</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">XML</span><span class="p">(</span><span class="s">&#39;&lt;root&gt;&lt;honk honking=&quot;true&quot;/&gt;&lt;/root&gt;&#39;</span><span class="p">,</span>
1100 <span class="gp">... </span>                 <span class="n">parser</span><span class="p">)</span>
1101
1102 <span class="gp">&gt;&gt;&gt; </span><span class="n">root</span><span class="o">.</span><span class="n">honking</span>
1103 <span class="go">False</span>
1104 <span class="gp">&gt;&gt;&gt; </span><span class="n">root</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">honking</span>
1105 <span class="go">True</span>
1106 </pre></div>
1107 </li>
1108 </ul>
1109 </div>
1110 <div class="slide" id="id4">
1111 <h1>Mapping Python classes to nodes</h1>
1112 <ul class="simple">
1113 <li>The Element class lookup<ul>
1114 <li>lxml tells you about a node, you return a class</li>
1115 <li>no restrictions on lookup algorithm</li>
1116 <li>each parser can use a different class lookup scheme</li>
1117 <li>lookup schemes can be chained through fallbacks</li>
1118 </ul>
1119 </li>
1120 <li>Classes can be selected based on<ul>
1121 <li>the node type (element, comment or processing instruction)<ul>
1122 <li><tt class="docutils literal">ElementDefaultClassLookup()</tt></li>
1123 </ul>
1124 </li>
1125 <li>the namespaced node name<ul>
1126 <li><tt class="docutils literal">CustomElementClassLookup()</tt> + a fallback</li>
1127 <li><tt class="docutils literal">ElementNamespaceClassLookup()</tt> + a fallback</li>
1128 </ul>
1129 </li>
1130 <li>the value of an attribute (e.g. <tt class="docutils literal">id</tt> or <tt class="docutils literal">class</tt>)<ul>
1131 <li><tt class="docutils literal">AttributeBasedElementClassLookup()</tt> + a fallback</li>
1132 </ul>
1133 </li>
1134 <li>read-only inspection of the tree<ul>
1135 <li><tt class="docutils literal">PythonElementClassLookup()</tt> + a fallback</li>
1136 </ul>
1137 </li>
1138 </ul>
1139 </li>
1140 </ul>
1141 </div>
1142 <div class="slide" id="designing-an-atom-api">
1143 <h1>Designing an Atom API</h1>
1144 <ul>
1145 <li><p class="first">a feed is a container for entries</p>
1146 <div class="highlight"><pre><span class="c"># atom.py</span>
1147
1148 <span class="n">ATOM_NAMESPACE</span> <span class="o">=</span> <span class="s">&quot;http://www.w3.org/2005/Atom&quot;</span>
1149 <span class="n">_ATOM_NS</span> <span class="o">=</span> <span class="s">&quot;{</span><span class="si">%s</span><span class="s">}&quot;</span> <span class="o">%</span> <span class="n">ATOM_NAMESPACE</span>
1150
1151 <span class="k">class</span> <span class="nc">FeedElement</span><span class="p">(</span><span class="n">etree</span><span class="o">.</span><span class="n">ElementBase</span><span class="p">):</span>
1152     <span class="nd">@property</span>
1153     <span class="k">def</span> <span class="nf">entries</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
1154         <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">findall</span><span class="p">(</span><span class="n">_ATOM_NS</span> <span class="o">+</span> <span class="s">&quot;entry&quot;</span><span class="p">)</span>
1155 </pre></div>
1156 </li>
1157 <li><p class="first">it also has a couple of meta-data children, e.g. <tt class="docutils literal">title</tt></p>
1158 <div class="highlight"><pre><span class="k">class</span> <span class="nc">FeedElement</span><span class="p">(</span><span class="n">etree</span><span class="o">.</span><span class="n">ElementBase</span><span class="p">):</span>
1159     <span class="c"># ...</span>
1160     <span class="nd">@property</span>
1161     <span class="k">def</span> <span class="nf">title</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
1162         <span class="s">&quot;return the title or None&quot;</span>
1163         <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="n">_ATOM_NS</span> <span class="o">+</span> <span class="s">&quot;title&quot;</span><span class="p">)</span>
1164 </pre></div>
1165 </li>
1166 </ul>
1167 </div>
1168 <div class="slide" id="consider-lxml-objectify">
1169 <h1>Consider lxml.objectify</h1>
1170 <ul class="simple">
1171 <li>ready-to-use, generic Python object API for XML</li>
1172 </ul>
1173 <div class="highlight"><pre><span class="o">&gt;&gt;&gt;</span> <span class="kn">from</span> <span class="nn">lxml</span> <span class="kn">import</span> <span class="n">objectify</span>
1174
1175 <span class="o">&gt;&gt;&gt;</span> <span class="n">feed</span> <span class="o">=</span> <span class="n">objectify</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span><span class="s">&quot;atom-example.xml&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getroot</span><span class="p">()</span>
1176 <span class="o">&gt;&gt;&gt;</span> <span class="k">print</span><span class="p">(</span><span class="n">feed</span><span class="o">.</span><span class="n">title</span><span class="p">)</span>
1177 <span class="n">Example</span> <span class="n">Feed</span>
1178
1179 <span class="o">&gt;&gt;&gt;</span> <span class="k">print</span><span class="p">([</span><span class="n">entry</span><span class="o">.</span><span class="n">title</span> <span class="k">for</span> <span class="n">entry</span> <span class="ow">in</span> <span class="n">feed</span><span class="o">.</span><span class="n">entry</span><span class="p">])</span>
1180 <span class="p">[</span><span class="s">&#39;Atom-Powered Robots Run Amok&#39;</span><span class="p">]</span>
1181
1182 <span class="o">&gt;&gt;&gt;</span> <span class="k">print</span><span class="p">(</span><span class="n">feed</span><span class="o">.</span><span class="n">entry</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">title</span><span class="p">)</span>
1183 <span class="n">Atom</span><span class="o">-</span><span class="n">Powered</span> <span class="n">Robots</span> <span class="n">Run</span> <span class="n">Amok</span>
1184 </pre></div>
1185 </div>
1186 <div class="slide" id="still-room-for-more-convenience">
1187 <h1>Still room for more convenience</h1>
1188 <div class="highlight"><pre><span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">chain</span>
1189
1190 <span class="k">class</span> <span class="nc">FeedElement</span><span class="p">(</span><span class="n">objectify</span><span class="o">.</span><span class="n">ObjectifiedElement</span><span class="p">):</span>
1191
1192     <span class="k">def</span> <span class="nf">addIDs</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
1193         <span class="s">&quot;initialise the IDs of feed and entries&quot;</span>
1194
1195         <span class="k">for</span> <span class="n">element</span> <span class="ow">in</span> <span class="n">chain</span><span class="p">([</span><span class="bp">self</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">entry</span><span class="p">):</span>
1196             <span class="k">if</span> <span class="n">element</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="n">_ATOM_NS</span> <span class="o">+</span> <span class="s">&quot;id&quot;</span><span class="p">)</span> <span class="ow">is</span> <span class="bp">None</span><span class="p">:</span>
1197                 <span class="nb">id</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">SubElement</span><span class="p">(</span><span class="n">element</span><span class="p">,</span> <span class="n">_ATOM_NS</span> <span class="o">+</span> <span class="s">&quot;id&quot;</span><span class="p">)</span>
1198                 <span class="nb">id</span><span class="o">.</span><span class="n">text</span> <span class="o">=</span> <span class="n">make_guid</span><span class="p">()</span>
1199 </pre></div>
1200 </div>
1201 <div class="slide" id="incremental-api-design">
1202 <h1>Incremental API design</h1>
1203 <ul class="simple">
1204 <li>choose an XML API to start with<ul>
1205 <li>lxml.etree is general purpose</li>
1206 <li>lxml.objectify is nice for document-style XML</li>
1207 </ul>
1208 </li>
1209 <li>fix Elements that really need some API sugar<ul>
1210 <li>dict-mappings to children with specific content/attributes</li>
1211 <li>properties for specially typed attributes or child values</li>
1212 <li>simplified access to varying content types of an element</li>
1213 <li>shortcuts for unnecessarily deep subtrees</li>
1214 </ul>
1215 </li>
1216 <li>ignore what works well enough with the Element API<ul>
1217 <li>lists of homogeneous children -&gt; Element iteration</li>
1218 <li>string attributes -&gt; .get()/.set()</li>
1219 </ul>
1220 </li>
1221 <li>let the API grow at your fingertips<ul>
1222 <li>play with it and test use cases</li>
1223 <li>avoid &quot;I want because I can&quot; feature explosion!</li>
1224 </ul>
1225 </li>
1226 </ul>
1227 </div>
1228 <div class="slide" id="setting-up-the-element-mapping">
1229 <h1>Setting up the Element mapping</h1>
1230 <p>Atom has a namespace =&gt; leave the mapping to lxml</p>
1231 <div class="highlight"><pre><span class="c"># ...</span>
1232 <span class="n">_atom_lookup</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">ElementNamespaceClassLookup</span><span class="p">(</span>
1233                   <span class="n">objectify</span><span class="o">.</span><span class="n">ObjectifyElementClassLookup</span><span class="p">())</span>
1234
1235 <span class="c"># map the classes to tag names</span>
1236 <span class="n">ns</span> <span class="o">=</span> <span class="n">_atom_lookup</span><span class="o">.</span><span class="n">get_namespace</span><span class="p">(</span><span class="n">ATOM_NAMESPACE</span><span class="p">)</span>
1237 <span class="n">ns</span><span class="p">[</span><span class="s">&quot;feed&quot;</span><span class="p">]</span>  <span class="o">=</span> <span class="n">FeedElement</span>
1238 <span class="n">ns</span><span class="p">[</span><span class="s">&quot;entry&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">EntryElement</span>
1239 <span class="c"># ... and so on</span>
1240 <span class="c"># or use ns.update(vars()) with appropriate class names</span>
1241
1242 <span class="c"># create a parser that does some whitespace cleanup</span>
1243 <span class="n">atom_parser</span> <span class="o">=</span> <span class="n">etree</span><span class="o">.</span><span class="n">XMLParser</span><span class="p">(</span><span class="n">remove_blank_text</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
1244
1245 <span class="c"># make it use our Atom classes</span>
1246 <span class="n">atom_parser</span><span class="o">.</span><span class="n">set_element_class_lookup</span><span class="p">(</span><span class="n">_atom_lookup</span><span class="p">)</span>
1247
1248 <span class="c"># and help users in using our parser setup</span>
1249 <span class="k">def</span> <span class="nf">parse</span><span class="p">(</span><span class="nb">input</span><span class="p">):</span>
1250     <span class="k">return</span> <span class="n">etree</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span><span class="nb">input</span><span class="p">,</span> <span class="n">atom_parser</span><span class="p">)</span>
1251 </pre></div>
1252 </div>
1253 <div class="slide" id="using-your-new-atom-api">
1254 <h1>Using your new Atom API</h1>
1255 <div class="highlight"><pre><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">atom</span>
1256 <span class="gp">&gt;&gt;&gt; </span><span class="n">feed</span> <span class="o">=</span> <span class="n">atom</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span><span class="s">&quot;ep2008/atom-example.xml&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getroot</span><span class="p">()</span>
1257
1258 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">feed</span><span class="o">.</span><span class="n">entry</span><span class="p">))</span>
1259 <span class="go">1</span>
1260 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span><span class="n">entry</span><span class="o">.</span><span class="n">title</span> <span class="k">for</span> <span class="n">entry</span> <span class="ow">in</span> <span class="n">feed</span><span class="o">.</span><span class="n">entry</span><span class="p">])</span>
1261 <span class="go">[&#39;Atom-Powered Robots Run Amok&#39;]</span>
1262
1263 <span class="gp">&gt;&gt;&gt; </span><span class="n">link_tag</span> <span class="o">=</span> <span class="s">&quot;{</span><span class="si">%s</span><span class="s">}link&quot;</span> <span class="o">%</span> <span class="n">atom</span><span class="o">.</span><span class="n">ATOM_NAMESPACE</span>
1264 <span class="gp">&gt;&gt;&gt; </span><span class="k">print</span><span class="p">([</span><span class="n">link</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">&quot;href&quot;</span><span class="p">)</span> <span class="k">for</span> <span class="n">link</span> <span class="ow">in</span> <span class="n">feed</span><span class="o">.</span><span class="n">iter</span><span class="p">(</span><span class="n">link_tag</span><span class="p">)])</span>
1265 <span class="go">[&#39;http://example.org/&#39;, &#39;http://example.org/2003/12/13/atom03&#39;]</span>
1266 </pre></div>
1267 </div>
1268 <div class="slide" id="summary-of-lesson-3">
1269 <h1>Summary of lesson 3</h1>
1270 <p>To implement an XML API ...</p>
1271 <ol class="arabic simple">
1272 <li>start off with lxml's Element API<ul>
1273 <li>or take a look at the object API of lxml.objectify</li>
1274 </ul>
1275 </li>
1276 <li>specialise it into a set of custom Element classes</li>
1277 <li>map them to XML tags using one of the lookup schemes</li>
1278 <li>improve the API incrementally while using it<ul>
1279 <li>discover inconveniences and beautify them</li>
1280 <li>avoid putting work into things that work</li>
1281 </ul>
1282 </li>
1283 </ol>
1284 </div>
1285 <div class="slide" id="conclusion">
1286 <h1>Conclusion</h1>
1287 <p>lxml ...</p>
1288 <ul class="simple">
1289 <li>provides a convenient set of tools for XML and HTML<ul>
1290 <li>parsing</li>
1291 <li>generating</li>
1292 <li>working with in-memory trees</li>
1293 </ul>
1294 </li>
1295 <li>follows Python idioms wherever possible<ul>
1296 <li>highly extensible through wrapping and subclassing</li>
1297 <li>callable objects for XPath, CSS selectors, XSLT, schemas</li>
1298 <li>iteration for tree traversal (even while parsing)</li>
1299 <li>list-/dict-like APIs, properties, keyword arguments, ...</li>
1300 </ul>
1301 </li>
1302 <li>makes extension and specialisation easy<ul>
1303 <li>write a special XML generator module in trivial code</li>
1304 <li>write your own XML API incrementally on-the-fly</li>
1305 </ul>
1306 </li>
1307 </ul>
1308 </div>
1309 </div>
1310 </body>
1311 </html>