1 # -*- test-case-name: twisted.words.test.test_domish -*-
2 # Copyright (c) Twisted Matrix Laboratories.
3 # See LICENSE for details.
6 DOM-like XML processing support.
8 This module provides support for parsing XML into DOM-like object structures
9 and serializing such structures to an XML string representation, optimized
10 for use in streaming XML applications.
15 from zope.interface import implements, Interface, Attribute
def _splitPrefix(name):
    """ Internal method for splitting a prefixed Element name into its
        respective parts.

    @param name: element or attribute name, optionally written as
                 C{prefix:local}.
    @return: a C{(prefix, local)} pair. C{prefix} is C{None} when C{name}
             contains no colon.
    """
    # Split on the first colon only; anything after it is the local part.
    ntok = name.split(":", 1)
    if len(ntok) == 2:
        return (ntok[0], ntok[1])
    return (None, ntok[0])
26 # Global map of prefixes that always get injected
27 # into the serializer's prefix map (note that this doesn't
28 # mean they're always _USED_)
29 G_PREFIXES = { "http://www.w3.org/XML/1998/namespace":"xml" }
31 class _ListSerializer:
32 """ Internal class which serializes an Element tree into a buffer """
def __init__(self, prefixes=None, prefixesInScope=None):
    """ Set up an empty output buffer and the namespace prefix bookkeeping.

    @param prefixes: optional dictionary mapping namespace URIs to the
                     prefixes to use for them.
    @param prefixesInScope: optional list of prefixes that are assumed to
                            be declared already; its entries are placed on
                            the prefix stack after the global prefixes.
    """
    # Output fragments are collected here and joined when the value is read.
    self.writelist = []
    self.prefixes = {}
    if prefixes:
        self.prefixes.update(prefixes)
    # Applied last, so the global prefixes win over caller suggestions.
    self.prefixes.update(G_PREFIXES)
    # list() gives the stack an entry of its own, independent of G_PREFIXES.
    self.prefixStack = [list(G_PREFIXES.values())] + (prefixesInScope or [])
    self.prefixCounter = 0
43 return u"".join(self.writelist)
def getPrefix(self, uri):
    """ Return the prefix bound to C{uri}, allocating one if necessary.

    A URI without a known prefix is given a generated prefix of the form
    C{xnN}, where C{N} is this serializer's running counter.

    @param uri: namespace URI to look up.
    @return: the prefix recorded for C{uri}.
    """
    if uri not in self.prefixes:
        self.prefixes[uri] = "xn%d" % (self.prefixCounter)
        self.prefixCounter = self.prefixCounter + 1
    return self.prefixes[uri]
def prefixInScope(self, prefix):
    """ Tell whether C{prefix} occurs in any entry of the prefix stack.

    @param prefix: namespace prefix to look for.
    @return: C{True} if some stack entry contains C{prefix}, else C{False}.
    """
    # Walk from the most recently pushed entry back to the first one.
    for scope in reversed(self.prefixStack):
        if prefix in scope:
            return True
    return False
58 def serialize(self, elem, closeElement=1, defaultUri=''):
59 # Optimization shortcuts
60 write = self.writelist.append
62 # Shortcut, check to see if elem is actually a chunk o' serialized XML
63 if isinstance(elem, SerializedXML):
67 # Shortcut, check to see if elem is actually a string (aka Cdata)
68 if isinstance(elem, types.StringTypes):
69 write(escapeToXml(elem))
72 # Further optimizations
75 defaultUri, currentDefaultUri = elem.defaultUri, defaultUri
77 for p, u in elem.localPrefixes.iteritems():
79 self.prefixStack.append(elem.localPrefixes.keys())
81 # Inherit the default namespace
82 if defaultUri is None:
83 defaultUri = currentDefaultUri
89 if uri != defaultUri or uri in self.prefixes:
90 prefix = self.getPrefix(uri)
91 inScope = self.prefixInScope(prefix)
98 write("<%s:%s" % (prefix, name))
101 write(" xmlns:%s='%s'" % (prefix, uri))
102 self.prefixStack[-1].append(prefix)
105 if defaultUri != currentDefaultUri and \
106 (uri != defaultUri or not prefix or not inScope):
107 write(" xmlns='%s'" % (defaultUri))
109 for p, u in elem.localPrefixes.iteritems():
110 write(" xmlns:%s='%s'" % (p, u))
112 # Serialize attributes
113 for k,v in elem.attributes.items():
114 # If the attribute name is a tuple, it's a qualified attribute
115 if isinstance(k, types.TupleType):
116 attr_uri, attr_name = k
117 attr_prefix = self.getPrefix(attr_uri)
119 if not self.prefixInScope(attr_prefix):
120 write(" xmlns:%s='%s'" % (attr_prefix, attr_uri))
121 self.prefixStack[-1].append(attr_prefix)
123 write(" %s:%s='%s'" % (attr_prefix, attr_name,
126 write((" %s='%s'" % ( k, escapeToXml(v, 1))))
128 # Shortcut out if this is only going to return
129 # the element (i.e. no children)
130 if closeElement == 0:
135 if len(elem.children) > 0:
137 for c in elem.children:
138 self.serialize(c, defaultUri=defaultUri)
141 write("</%s>" % (name))
143 write("</%s:%s>" % (prefix, name))
147 self.prefixStack.pop()
150 SerializerClass = _ListSerializer
def escapeToXml(text, isattrib = 0):
    """ Escape text to proper XML form, per section 2.3 in the XML specification.

    @type text: C{str}
    @param text: Text to escape

    @type isattrib: C{bool}
    @param isattrib: Triggers escaping of characters necessary for use as
                     attribute values
    @return: C{text} with the special characters replaced by entity
             references.
    """
    # The ampersand goes first so the entities written below are not
    # escaped a second time.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    if isattrib:
        # Quotes only need escaping inside attribute values.
        text = text.replace("'", "&apos;")
        text = text.replace("\"", "&quot;")
    return text
def unescapeFromXml(text):
    """ Replace the predefined XML entity references in C{text}.

    @param text: text that may contain C{&lt;}, C{&gt;}, C{&apos;},
                 C{&quot;} and C{&amp;}.
    @return: C{text} with those references replaced by the characters they
             stand for.
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&apos;", "'")
    text = text.replace("&quot;", "\"")
    # The ampersand goes last so "&amp;lt;" becomes "&lt;", not "<".
    text = text.replace("&amp;", "&")
    return text
def generateOnlyInterface(list, int):
    """ Filters items in a list by class

    @param list: iterable of candidate items.
    @param int: object whose C{providedBy} method decides which items pass.
    @return: generator over the items for which C{int.providedBy} is true,
             in their original order.
    """
    for n in list:
        if int.providedBy(n):
            yield n
def generateElementsQNamed(list, name, uri):
    """ Filters Element items in a list with matching name and URI.

    @param list: iterable of candidate nodes.
    @param name: local name an element must have.
    @param uri: namespace URI an element must have.
    @return: generator over the nodes that provide L{IElement} and match
             both C{name} and C{uri}.
    """
    for n in list:
        if IElement.providedBy(n) and n.name == name and n.uri == uri:
            yield n
def generateElementsNamed(list, name):
    """ Filters Element items in a list with matching name, regardless of URI.

    @param list: iterable of candidate nodes.
    @param name: local name an element must have.
    @return: generator over the nodes that provide L{IElement} and are
             named C{name}.
    """
    for n in list:
        if IElement.providedBy(n) and n.name == name:
            yield n
class SerializedXML(unicode):
    """ Marker class for pre-serialized XML in the DOM.

    Adds nothing to C{unicode}; the subclass exists so that such strings
    can be recognised with C{isinstance}.
    """
205 """ Convenience object for tracking namespace declarations. """
def __init__(self, uri):
    """ Remember the namespace URI used to qualify names.

    @param uri: namespace URI.
    """
    self._uri = uri
208 def __getattr__(self, n):
209 return (self._uri, n)
def __getitem__(self, n):
    """ Return the qualified name C{(uri, n)} for subscript C{n}. """
    qname = (self._uri, n)
    return qname
213 class IElement(Interface):
215 Interface to XML element nodes.
217 See L{Element} for a detailed example of its general use.
219 Warning: this Interface is not yet complete!
222 uri = Attribute(""" Element's namespace URI """)
223 name = Attribute(""" Element's local name """)
224 defaultUri = Attribute(""" Default namespace URI of child elements """)
225 attributes = Attribute(""" Dictionary of element attributes """)
226 children = Attribute(""" List of child nodes """)
227 parent = Attribute(""" Reference to element's parent element """)
228 localPrefixes = Attribute(""" Dictionary of local prefixes """)
230 def toXml(prefixes=None, closeElement=1, defaultUri='',
231 prefixesInScope=None):
232 """ Serializes object to a (partial) XML document
234 @param prefixes: dictionary that maps namespace URIs to suggested
236 @type prefixes: L{dict}
237 @param closeElement: flag that determines whether to include the
238 closing tag of the element in the serialized
239 string. A value of C{0} only generates the
240 element's start tag. A value of C{1} yields a
241 complete serialization.
242 @type closeElement: C{int}
243 @param defaultUri: Initial default namespace URI. This is most useful
244 for partial rendering, where the logical parent
245 element (of which the starttag was already
246 serialized) declares a default namespace that should
248 @type defaultUri: C{str}
249 @param prefixesInScope: list of prefixes that are assumed to be
250 declared by ancestors.
251 @type prefixesInScope: C{list}
252 @return: (partial) serialized XML
256 def addElement(name, defaultUri = None, content = None):
257 """ Create an element and add as child.
259 The new element is added to this element as a child, and will have
260 this element as its parent.
262 @param name: element name. This can be either a C{unicode} object that
263 contains the local name, or a tuple of (uri, local_name)
264 for a fully qualified name. In the former case,
265 the namespace URI is inherited from this element.
266 @type name: C{unicode} or C{tuple} of (C{unicode}, C{unicode})
267 @param defaultUri: default namespace URI for child elements. If
268 C{None}, this is inherited from this element.
269 @type defaultUri: C{unicode}
270 @param content: text contained by the new element.
271 @type content: C{unicode}
272 @return: the created element
273 @rtype: object providing L{IElement}
277 """ Adds a node as child of this element.
279 The C{node} will be added to the list of children of this element, and
280 will have this element set as its parent when C{node} provides
283 @param node: the child node.
284 @type node: C{unicode} or object implementing L{IElement}
287 class Element(object):
288 """ Represents an XML element node.
290 An Element contains a series of attributes (name/value pairs), content
291 (character data), and other child Element objects. When building a document
292 with markup (such as HTML or XML), use this object as the starting point.
294 Element objects fully support XML Namespaces. The fully qualified name of
295 the XML Element it represents is stored in the C{uri} and C{name}
296 attributes, where C{uri} holds the namespace URI. There is also a default
297 namespace, for child elements. This is stored in the C{defaultUri}
298 attribute. Note that C{''} means the empty namespace.
300 Serialization of Elements through C{toXml()} will use these attributes
301 for generating proper serialized XML. When both C{uri} and C{defaultUri}
302 are not None in the Element and all of its descendants, serialization
303 proceeds as expected:
305 >>> from twisted.words.xish import domish
306 >>> root = domish.Element(('myns', 'root'))
307 >>> root.addElement('child', content='test')
308 <twisted.words.xish.domish.Element object at 0x83002ac>
310 u"<root xmlns='myns'><child>test</child></root>"
312 For partial serialization, needed for streaming XML, a special value for
313 namespace URIs can be used: C{None}.
315 Using C{None} as the value for C{uri} means: this element is in whatever
316 namespace inherited by the closest logical ancestor when the complete XML
317 document has been serialized. The serialized start tag will have a
318 non-prefixed name, and no xmlns declaration will be generated.
320 Similarly, C{None} for C{defaultUri} means: the default namespace for my
321 child elements is inherited from the logical ancestors of this element,
322 when the complete XML document has been serialized.
324 To illustrate, an example from a Jabber stream. Assume the start tag of the
325 root element of the stream has already been serialized, along with several
326 complete child elements, and sent off, looking like this::
328 <stream:stream xmlns:stream='http://etherx.jabber.org/streams'
329 xmlns='jabber:client' to='example.com'>
332 Now suppose we want to send a complete element represented by an
333 object C{message} created like:
335 >>> message = domish.Element((None, 'message'))
336 >>> message['to'] = 'user@example.com'
337 >>> message.addElement('body', content='Hi!')
338 <twisted.words.xish.domish.Element object at 0x8276e8c>
340 u"<message to='user@example.com'><body>Hi!</body></message>"
342 As you can see, this XML snippet has no xmlns declaration. When sent
343 off, it inherits the C{jabber:client} namespace from the root element.
344 Note that this renders the same as using C{''} instead of C{None}:
346 >>> presence = domish.Element(('', 'presence'))
350 However, if this object has a parent defined, the difference becomes
353 >>> child = message.addElement(('http://example.com/', 'envelope'))
354 >>> child.addChild(presence)
355 <twisted.words.xish.domish.Element object at 0x8276fac>
357 u"<message to='user@example.com'><body>Hi!</body><envelope xmlns='http://example.com/'><presence xmlns=''/></envelope></message>"
359 As you can see, the <presence/> element is now in the empty namespace, not
360 in the default namespace of the parent or the streams'.
362 @type uri: C{unicode} or None
363 @ivar uri: URI of this Element's name
365 @type name: C{unicode}
366 @ivar name: Name of this Element
368 @type defaultUri: C{unicode} or None
369 @ivar defaultUri: URI this Element exists within
371 @type children: C{list}
372 @ivar children: List of child Elements and content
374 @type parent: L{Element}
375 @ivar parent: Reference to the parent Element, if any.
377 @type attributes: L{dict}
378 @ivar attributes: Dictionary of attributes associated with this Element.
380 @type localPrefixes: L{dict}
381 @ivar localPrefixes: Dictionary of namespace declarations on this
382 element. The key is the prefix to bind the
def __init__(self, qname, defaultUri=None, attribs=None,
             localPrefixes=None):
    """
    @param qname: Tuple of (uri, name)
    @param defaultUri: The default URI of the element; defaults to the URI
                       specified in C{qname}
    @param attribs: Dictionary of attributes
    @param localPrefixes: Dictionary of namespace declarations on this
                          element. The key is the prefix to bind the
                          namespace URI to.
    """
    self.localPrefixes = localPrefixes or {}
    self.uri, self.name = qname
    # The element's own URI becomes the default namespace only when it is
    # not already bound to one of the locally declared prefixes.
    if defaultUri is None and \
       self.uri not in self.localPrefixes.values():
        self.defaultUri = self.uri
    else:
        self.defaultUri = defaultUri
    self.attributes = attribs or {}
    self.children = []
    self.parent = None
def __getattr__(self, key):
    """ Return the first child element named C{key}.

    @param key: attribute name being looked up.
    @return: the first child that provides L{IElement} and whose name is
             C{key}, or C{None} if there is none.
    @raise AttributeError: if no child matches and C{key} starts with an
                           underscore.
    """
    # Check child list for first Element with a name matching the key
    for n in self.children:
        if IElement.providedBy(n) and n.name == key:
            return n

    # Tweak the behaviour so that it's more friendly about not
    # finding elements -- we need to document this somewhere :)
    if key.startswith('_'):
        raise AttributeError(key)
    return None
def __getitem__(self, key):
    """ Return the value of attribute C{key}, dequalifying the key first. """
    attrName = self._dqa(key)
    return self.attributes[attrName]
def __delitem__(self, key):
    """ Remove attribute C{key}, dequalifying the key first. """
    attrName = self._dqa(key)
    del self.attributes[attrName]
def __setitem__(self, key, value):
    """ Set attribute C{key} to C{value}, dequalifying the key first. """
    attrName = self._dqa(key)
    self.attributes[attrName] = value
435 """ Retrieve the first CData (content) node
437 for n in self.children:
438 if isinstance(n, types.StringTypes): return n
def _dqa(self, attr):
    """ Dequalify an attribute key as needed

    @param attr: attribute key, either a plain name or a C{(uri, name)}
                 tuple.
    @return: the bare name when C{attr} is a tuple whose first item is
             false (C{None} or C{''}); otherwise C{attr} unchanged.
    """
    if isinstance(attr, tuple) and not attr[0]:
        return attr[1]
    return attr
def getAttribute(self, attribname, default = None):
    """ Retrieve the value of attribname, if it exists

    @param attribname: attribute key, either a plain name or a
                       C{(uri, name)} tuple.
    @param default: value returned when the attribute is not set.
    @return: the attribute value, or C{default}.
    """
    # Dequalify the key first, as every other attribute accessor of this
    # element does.
    return self.attributes.get(self._dqa(attribname), default)
def hasAttribute(self, attrib):
    """ Determine if the specified attribute exists

    @param attrib: attribute key, either a plain name or a C{(uri, name)}
                   tuple.
    @return: C{True} if the dequalified key is set on this element.
    """
    return self._dqa(attrib) in self.attributes
def compareAttribute(self, attrib, value):
    """ Safely compare the value of an attribute against a provided value.

    A missing attribute is treated as C{None}, so no exception is raised
    for unknown keys.
    """
    current = self.attributes.get(self._dqa(attrib))
    return current == value
def swapAttributeValues(self, left, right):
    """ Swap the values of two attributes.

    @param left: key of the first attribute.
    @param right: key of the second attribute.
    @raise KeyError: if either attribute is not set; nothing is changed
                     in that case.
    """
    d = self.attributes
    # Both values are read before either is written.
    d[left], d[right] = d[right], d[left]
def addChild(self, node):
    """ Add a child to this Element.

    @param node: the child node. If it provides L{IElement}, its C{parent}
                 is set to this element.
    @return: the node that was added.
    """
    if IElement.providedBy(node):
        node.parent = self
    self.children.append(node)
    return node
def addContent(self, text):
    """ Add some text data to this Element.

    The text is merged into the last child when that child is itself text;
    otherwise it is appended as a new child.

    @param text: character data to add.
    @return: the last child after the addition.
    """
    c = self.children
    # A trailing SerializedXML chunk is a string too, but concatenating
    # onto it would yield a plain string and lose its raw-XML marker, so
    # it is never merged with.
    if len(c) > 0 and isinstance(c[-1], types.StringTypes) \
       and not isinstance(c[-1], SerializedXML):
        c[-1] = c[-1] + text
    else:
        c.append(text)
    return c[-1]
def addElement(self, name, defaultUri = None, content = None):
    """ Create an element and add as child.

    @param name: element name, either a local name or a tuple of
                 (uri, local_name). A bare local name is placed in the
                 namespace given by C{defaultUri}.
    @param defaultUri: default namespace URI for the new element. If
                       C{None}, the URI from a tuple C{name} is used, or
                       else this element's C{defaultUri}.
    @param content: text contained by the new element.
    @return: the created element, whose C{parent} is this element.
    """
    if isinstance(name, tuple):
        if defaultUri is None:
            defaultUri = name[0]
        result = Element(name, defaultUri)
    else:
        if defaultUri is None:
            defaultUri = self.defaultUri
        result = Element((defaultUri, name), defaultUri)

    self.children.append(result)
    result.parent = self

    if content:
        result.children.append(content)

    return result
def addRawXml(self, rawxmlstring):
    """ Add a pre-serialized chunk o' XML as a child of this Element.

    @param rawxmlstring: the XML text; it is wrapped in L{SerializedXML}
                         before being appended to the children.
    """
    chunk = SerializedXML(rawxmlstring)
    self.children.append(chunk)
def addUniqueId(self):
    """ Add a unique (across a given Python session) id attribute to this
        Element.

    The value is C{H_N}, with C{N} taken from the counter kept on the
    L{Element} class; the counter is then advanced by one.
    """
    serial = Element._idCounter
    self.attributes["id"] = "H_%d" % serial
    Element._idCounter = serial + 1
def elements(self, uri=None, name=None):
    """
    Iterate across all children of this Element that are Elements.

    Returns a generator over the child elements. If both the C{uri} and
    C{name} parameters are set, the returned generator will only yield
    on elements matching the qualified name.

    @param uri: Optional element URI.
    @type uri: C{unicode}
    @param name: Optional element name.
    @type name: C{unicode}
    @return: Iterator that yields objects implementing L{IElement}.
    """
    if name is None:
        return generateOnlyInterface(self.children, IElement)
    return generateElementsQNamed(self.children, name, uri)
537 def toXml(self, prefixes=None, closeElement=1, defaultUri='',
538 prefixesInScope=None):
539 """ Serialize this Element and all children to a string. """
540 s = SerializerClass(prefixes=prefixes, prefixesInScope=prefixesInScope)
541 s.serialize(self, closeElement=closeElement, defaultUri=defaultUri)
def firstChildElement(self):
    """ Return the first child that provides L{IElement}.

    @return: that child, or C{None} when there is no such child.
    """
    for c in self.children:
        if IElement.providedBy(c):
            return c
    return None
class ParserError(Exception):
    """ Exception thrown when a parsing error occurs

    The element streams in this module raise it from C{parse}, carrying
    the underlying parser's error as text, and when closing tags do not
    match the open elements.
    """
556 """ Preferred method to construct an ElementStream
558 Uses Expat-based stream if available, and falls back to Sux if necessary.
561 es = ExpatElementStream()
564 if SuxElementStream is None:
565 raise Exception("No parsers available :(")
566 es = SuxElementStream()
570 from twisted.web import sux
572 SuxElementStream = None
574 class SuxElementStream(sux.XMLParser):
576 self.connectionMade()
577 self.DocumentStartEvent = None
578 self.ElementEvent = None
579 self.DocumentEndEvent = None
582 self.documentStarted = False
583 self.defaultNsStack = []
584 self.prefixStack = []
def parse(self, buffer):
    """ Feed a chunk of XML to the parser.

    @param buffer: the data, passed on to C{dataReceived}.
    @raise ParserError: if C{sux.ParseError} is raised while processing;
                        the message is the text of the original error.
    """
    try:
        self.dataReceived(buffer)
    except sux.ParseError as e:
        raise ParserError(str(e))
def findUri(self, prefix):
    """ Look up the namespace URI bound to C{prefix}.

    @param prefix: namespace prefix to resolve.
    @return: the URI from the most recently pushed prefix mapping that
             contains C{prefix}, or C{None} if no mapping does.
    """
    # Walk prefix stack backwards, looking for the uri
    # matching the specified prefix
    for scope in reversed(self.prefixStack):
        if prefix in scope:
            return scope[prefix]
    return None
602 def gotTagStart(self, name, attributes):
608 # Pass 1 - Identify namespace decls
609 for k, v in attributes.items():
610 if k.startswith("xmlns"):
611 x, p = _splitPrefix(k)
612 if (x is None): # I.e. default declaration
618 # Push namespace decls onto prefix stack
619 self.prefixStack.append(localPrefixes)
621 # Determine default namespace for this element; if there
623 if defaultUri is None:
624 if len(self.defaultNsStack) > 0:
625 defaultUri = self.defaultNsStack[-1]
630 prefix, name = _splitPrefix(name)
631 if prefix is None: # This element is in the default namespace
634 # Find the URI for the prefix
635 uri = self.findUri(prefix)
637 # Pass 2 - Fix up and escape attributes
638 for k, v in attributes.items():
639 p, n = _splitPrefix(k)
643 attribs[(self.findUri(p)), n] = unescapeFromXml(v)
645 # Construct the actual Element object
646 e = Element((uri, name), defaultUri, attribs, localPrefixes)
648 # Save current default namespace
649 self.defaultNsStack.append(defaultUri)
651 # Document already started
652 if self.documentStarted:
653 # Starting a new packet
654 if self.currElem is None:
656 # Adding to existing element
658 self.currElem = self.currElem.addChild(e)
662 self.documentStarted = True
663 self.DocumentStartEvent(e)
def gotText(self, data):
    """ Add text to the current element; ignored when there is none.

    @param data: the text that was parsed.
    """
    if self.currElem is not None:
        self.currElem.addContent(data)
def gotCData(self, data):
    """ Add CDATA content to the current element; ignored when there is none.

    @param data: the content that was parsed.
    """
    if self.currElem is not None:
        self.currElem.addContent(data)
def gotComment(self, data):
    """ Accept a comment and discard it.

    @param data: the comment text; unused.
    """
    # Ignore comments for the moment
    pass
677 entities = { "amp" : "&",
def gotEntityReference(self, entityRef):
    """ Add the character for a known entity reference to the current element.

    Unknown entity names are ignored, as are references seen while there
    is no current element.

    @param entityRef: entity name, looked up in
                      C{SuxElementStream.entities}.
    """
    # If this is an entity we know about, add it as content
    # to the current element
    if self.currElem is not None and \
       entityRef in SuxElementStream.entities:
        self.currElem.addContent(SuxElementStream.entities[entityRef])
689 def gotTagEnd(self, name):
690 # Ensure the document hasn't already ended
691 if self.rootElem is None:
692 # XXX: Write more legible explanation
693 raise ParserError, "Element closed after end of document."
696 prefix, name = _splitPrefix(name)
698 uri = self.defaultNsStack[-1]
700 uri = self.findUri(prefix)
703 if self.currElem is None:
704 # Ensure element name and uri matches
705 if self.rootElem.name != name or self.rootElem.uri != uri:
706 raise ParserError, "Mismatched root elements"
707 self.DocumentEndEvent()
712 # Ensure the tag being closed matches the name of the current
714 if self.currElem.name != name or self.currElem.uri != uri:
715 # XXX: Write more legible explanation
716 raise ParserError, "Malformed element close"
718 # Pop prefix and default NS stack
719 self.prefixStack.pop()
720 self.defaultNsStack.pop()
722 # Check for parent null parent of current elem;
723 # that's the top of the stack
724 if self.currElem.parent is None:
725 self.currElem.parent = self.rootElem
726 self.ElementEvent(self.currElem)
729 # Anything else is just some element wrapping up
731 self.currElem = self.currElem.parent
734 class ExpatElementStream:
737 self.DocumentStartEvent = None
738 self.ElementEvent = None
739 self.DocumentEndEvent = None
740 self.error = pyexpat.error
741 self.parser = pyexpat.ParserCreate("UTF-8", " ")
742 self.parser.StartElementHandler = self._onStartElement
743 self.parser.EndElementHandler = self._onEndElement
744 self.parser.CharacterDataHandler = self._onCdata
745 self.parser.StartNamespaceDeclHandler = self._onStartNamespace
746 self.parser.EndNamespaceDeclHandler = self._onEndNamespace
748 self.defaultNsStack = ['']
749 self.documentStarted = 0
750 self.localPrefixes = {}
def parse(self, buffer):
    """ Feed a chunk of XML to the expat parser.

    @param buffer: the data, passed on to the parser's C{Parse} method.
    @raise ParserError: if the parser raises C{self.error}; the message is
                        the text of the original error.
    """
    try:
        self.parser.Parse(buffer)
    except self.error as e:
        raise ParserError(str(e))
758 def _onStartElement(self, name, attrs):
759 # Generate a qname tuple from the provided name. See
760 # http://docs.python.org/library/pyexpat.html#xml.parsers.expat.ParserCreate
761 # for an explanation of the formatting of name.
762 qname = name.rsplit(" ", 1)
767 for k, v in attrs.items():
769 aqname = k.rsplit(" ", 1)
770 attrs[(aqname[0], aqname[1])] = v
773 # Construct the new element
774 e = Element(qname, self.defaultNsStack[-1], attrs, self.localPrefixes)
775 self.localPrefixes = {}
777 # Document already started
778 if self.documentStarted == 1:
779 if self.currElem != None:
780 self.currElem.children.append(e)
781 e.parent = self.currElem
786 self.documentStarted = 1
787 self.DocumentStartEvent(e)
789 def _onEndElement(self, _):
790 # Check for null current elem; end of doc
791 if self.currElem is None:
792 self.DocumentEndEvent()
794 # Check for parent that is None; that's
795 # the top of the stack
796 elif self.currElem.parent is None:
797 self.ElementEvent(self.currElem)
800 # Anything else is just some element in the current
803 self.currElem = self.currElem.parent
def _onCdata(self, data):
    """ Add character data to the current element; ignored when there is none.

    @param data: the character data reported by the parser.
    """
    if self.currElem is not None:
        self.currElem.addContent(data)
def _onStartNamespace(self, prefix, uri):
    """ Record a namespace declaration reported by the parser.

    @param prefix: declared prefix; C{None} is taken to mean the default
                   namespace.
    @param uri: namespace URI being declared.
    """
    # If this is the default namespace, put
    # it on the top of the stack
    if prefix is None:
        self.defaultNsStack.append(uri)
    else:
        self.localPrefixes[prefix] = uri
def _onEndNamespace(self, prefix):
    """ Drop a default namespace when its declaration goes out of scope.

    @param prefix: prefix whose declaration ended; only C{None}, taken to
                   mean the default namespace, changes any state.
    """
    # Remove last element on the stack
    if prefix is None:
        self.defaultNsStack.pop()
822 ## class FileParser(ElementStream):
823 ## def __init__(self):
824 ## ElementStream.__init__(self)
825 ## self.DocumentStartEvent = self.docStart
826 ## self.ElementEvent = self.elem
827 ## self.DocumentEndEvent = self.docEnd
830 ## def docStart(self, elem):
831 ## self.document = elem
833 ## def elem(self, elem):
834 ## self.document.addChild(elem)
839 ## def parse(self, filename):
840 ## for l in open(filename).readlines():
841 ## self.parser.Parse(l)
842 ## assert self.done == 1
843 ## return self.document
845 ## def parseFile(filename):
846 ## return FileParser().parse(filename)