1 # -*- test-case-name: twisted.web.test.test_xml -*-
2 # Copyright (c) Twisted Matrix Laboratories.
3 # See LICENSE for details.
6 Micro Document Object Model: a partial DOM implementation with SUX.
8 This is an implementation of what we consider to be the useful subset of the
9 DOM. The chief advantage of this library is that, not being burdened with
10 standards compliance, it can remain very stable between versions. We can also
11 implement utility 'pythonic' ways to access and mutate the XML tree.
13 Since this has not been subjected to a serious trial by fire, it is not recommended
14 to use this outside of Twisted applications. However, it seems to work just
15 fine for the documentation generator, which parses a fairly representative
18 Microdom mainly focuses on working with HTML and XHTML.
23 from cStringIO import StringIO
25 # create NodeList class
26 from types import ListType as NodeList
27 from types import StringTypes, UnicodeType
30 from twisted.web.sux import XMLParser, ParseError
31 from twisted.python.util import InsensitiveDict
34 def getElementsByTagName(iNode, name):
36 Return a list of all child elements of C{iNode} with a name matching
39 Note that this implementation does not conform to the DOM Level 1 Core
40 specification because it may return C{iNode}.
42 @param iNode: An element at which to begin searching. If C{iNode} has a
43 name matching C{name}, it will be included in the result.
45 @param name: A C{str} giving the name of the elements to return.
47 @return: A C{list} of direct or indirect child elements of C{iNode} with
48 the name C{name}. This may include C{iNode}.
51 matches_append = matches.append # faster lookup. don't do this at home
55 if c.nodeName == name:
57 slice[:0] = c.childNodes
62 def getElementsByTagNameNoCase(iNode, name):
65 matches_append = matches.append
69 if c.nodeName.lower() == name:
71 slice[:0] = c.childNodes
75 HTML_ESCAPE_CHARS = (('&', '&'), # don't add any entities before this one
# Reversed copy of the HTML escape table; it is the default table used by
# unescape().
REV_HTML_ESCAPE_CHARS = list(HTML_ESCAPE_CHARS)
REV_HTML_ESCAPE_CHARS.reverse()

# XML additionally escapes the apostrophe.  The replacement must be the
# predefined XML entity '&apos;'; a bare ''' here would open an unterminated
# triple-quoted string.
XML_ESCAPE_CHARS = HTML_ESCAPE_CHARS + (("'", '&apos;'),)
REV_XML_ESCAPE_CHARS = list(XML_ESCAPE_CHARS)
REV_XML_ESCAPE_CHARS.reverse()
86 def unescape(text, chars=REV_HTML_ESCAPE_CHARS):
87 "Perform the exact opposite of 'escape'."
89 text = text.replace(h, s)
92 def escape(text, chars=HTML_ESCAPE_CHARS):
93 "Escape a few XML special chars with XML entities."
95 text = text.replace(s, h)
99 class MismatchedTags(Exception):
101 def __init__(self, filename, expect, got, endLine, endCol, begLine, begCol):
102 (self.filename, self.expect, self.got, self.begLine, self.begCol, self.endLine,
103 self.endCol) = filename, expect, got, begLine, begCol, endLine, endCol
106 return ("expected </%s>, got </%s> line: %s col: %s, began line: %s col: %s"
107 % (self.expect, self.got, self.endLine, self.endCol, self.begLine,
114 def __init__(self, parentNode=None):
115 self.parentNode = parentNode
118 def isEqualToNode(self, other):
120 Compare this node to C{other}. If the nodes have the same number of
121 children and corresponding children are equal to each other, return
122 C{True}, otherwise return C{False}.
127 if len(self.childNodes) != len(other.childNodes):
129 for a, b in zip(self.childNodes, other.childNodes):
130 if not a.isEqualToNode(b):
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
             nsprefixes={}, namespace=''):
    """
    Serialization entry point.  This implementation does no work at all and
    only raises.

    @raise NotImplementedError: Always.
    """
    raise NotImplementedError()
138 def toxml(self, indent='', addindent='', newl='', strip=0, nsprefixes={},
141 self.writexml(s, indent, addindent, newl, strip, nsprefixes, namespace)
def writeprettyxml(self, stream, indent='', addindent=' ', newl='\n', strip=0):
    """
    Call C{self.writexml} with defaults suited to indented output (a space
    for C{addindent}, a newline for C{newl}).

    @param stream: Passed through to C{writexml} as its first argument.

    @return: Whatever C{self.writexml} returns.
    """
    written = self.writexml(stream, indent, addindent, newl, strip)
    return written
def toprettyxml(self, indent='', addindent=' ', newl='\n', strip=0):
    """
    Call C{self.toxml} with defaults suited to indented output (a space for
    C{addindent}, a newline for C{newl}).

    @return: Whatever C{self.toxml} returns.
    """
    rendered = self.toxml(indent, addindent, newl, strip)
    return rendered
def cloneNode(self, deep=0, parent=None):
    """
    Copying hook.  This implementation does no work at all and only raises.

    @raise NotImplementedError: Always.
    """
    raise NotImplementedError()
154 def hasChildNodes(self):
def appendChild(self, child):
    """
    Add C{child} to the end of this node's C{childNodes} and record this
    node as its C{parentNode}.

    @param child: The L{Node} to append.

    @raise TypeError: If C{child} is not a C{Node} instance.
    """
    if isinstance(child, Node):
        self.childNodes.append(child)
        child.parentNode = self
        return
    raise TypeError("expected Node instance")
175 def insertBefore(self, new, ref):
177 Make the given L{Node} C{new} a child of this node which comes before
180 @param new: A L{Node} which will become a child of this node.
182 @param ref: A L{Node} which is already a child of this node which
183 C{new} will be inserted before.
185 @raise TypeError: If C{new} or C{ref} is not a C{Node} instance.
189 if not isinstance(new, Node) or not isinstance(ref, Node):
190 raise TypeError("expected Node instance")
191 i = self.childNodes.index(ref)
192 new.parentNode = self
193 self.childNodes.insert(i, new)
197 def removeChild(self, child):
199 Remove the given L{Node} from this node's children.
201 @param child: A L{Node} which is a child of this node which will no
202 longer be a child of this node after this method is called.
204 @raise TypeError: If C{child} is not a C{Node} instance.
208 if not isinstance(child, Node):
209 raise TypeError("expected Node instance")
210 if child in self.childNodes:
211 self.childNodes.remove(child)
212 child.parentNode = None
def replaceChild(self, newChild, oldChild):
    """
    Put C{newChild} into the slot of C{childNodes} currently occupied by
    C{oldChild}, then detach C{oldChild} and adopt C{newChild}.

    @param newChild: The L{Node} taking over the position.

    @param oldChild: The L{Node}, currently a child of this node, being
        replaced.

    @raise TypeError: If C{newChild} or C{oldChild} is not a C{Node}
        instance.

    @raise ValueError: If C{oldChild} is not a child of this C{Node}.
    """
    # newChild is checked first, matching the left-to-right test order.
    for candidate in (newChild, oldChild):
        if not isinstance(candidate, Node):
            raise TypeError("expected Node instance")
    if oldChild.parentNode is not self:
        raise ValueError("oldChild is not a child of this node")
    position = self.childNodes.index(oldChild)
    self.childNodes[position] = newChild
    # Clear the old parent before setting the new one so that passing the
    # same node for both arguments leaves it parented to this node.
    oldChild.parentNode = None
    newChild.parentNode = self
240 return self.childNodes[-1]
243 def firstChild(self):
244 if len(self.childNodes):
245 return self.childNodes[0]
248 #def get_ownerDocument(self):
249 # """This doesn't really get the owner document; microdom nodes
250 # don't even have one necessarily. This gets the root node,
251 # which is usually what you really meant.
252 # *NOT DOM COMPLIANT.*
255 # while (node.parentNode): node=node.parentNode
257 #ownerDocument=node.get_ownerDocument()
258 # leaving commented for discussion; see also domhelpers.getParents(node)
260 class Document(Node):
262 def __init__(self, documentElement=None):
265 self.appendChild(documentElement)
267 def cloneNode(self, deep=0, parent=None):
269 d.doctype = self.doctype
271 newEl = self.documentElement.cloneNode(1, self)
273 newEl = self.documentElement
def isEqualToDocument(self, n):
    """
    Compare this document to C{n}: the C{doctype} attributes must be equal
    and C{Node.isEqualToNode} must also consider the two equal.

    @return: A true value if both checks pass, otherwise a false value.
    """
    sameDoctype = self.doctype == n.doctype
    # Short-circuit exactly as a plain `and` would: the node comparison is
    # only performed when the doctypes already match.
    return sameDoctype and Node.isEqualToNode(self, n)
isEqualToNode = isEqualToDocument
def get_documentElement(self):
    """
    Return the first entry of C{childNodes}.

    @raise IndexError: If C{childNodes} is empty.
    """
    children = self.childNodes
    return children[0]
documentElement = property(get_documentElement)
287 def appendChild(self, child):
289 Make the given L{Node} the I{document element} of this L{Document}.
291 @param child: The L{Node} to make into this L{Document}'s document
294 @raise ValueError: If this document already has a document element.
297 raise ValueError("Only one element per document.")
298 Node.appendChild(self, child)
300 def writexml(self, stream, indent='', addindent='', newl='', strip=0,
301 nsprefixes={}, namespace=''):
302 stream.write('<?xml version="1.0"?>' + newl)
304 stream.write("<!DOCTYPE "+self.doctype+">" + newl)
305 self.documentElement.writexml(stream, indent, addindent, newl, strip,
306 nsprefixes, namespace)
308 # of dubious utility (?)
def createElement(self, name, **kw):
    """
    Construct an L{Element} named C{name}, forwarding any keyword arguments
    to its initializer.  The element is not added to this document's
    children.

    @return: The new L{Element}.
    """
    element = Element(name, **kw)
    return element
312 def createTextNode(self, text):
315 def createComment(self, text):
def getElementsByTagName(self, name):
    """
    Search from this document for elements named C{name}, using the
    case-insensitive search when the document element is flagged
    C{caseInsensitive} and the exact-match search otherwise.

    @return: The result of the selected module-level search function.
    """
    if self.documentElement.caseInsensitive:
        search = getElementsByTagNameNoCase
    else:
        search = getElementsByTagName
    return search(self, name)
323 def getElementById(self, id):
324 childNodes = self.childNodes[:]
326 node = childNodes.pop(0)
328 childNodes.extend(node.childNodes)
329 if hasattr(node, 'getAttribute') and node.getAttribute("id") == id:
333 class EntityReference(Node):
335 def __init__(self, eref, parentNode=None):
336 Node.__init__(self, parentNode)
338 self.nodeValue = self.data = "&" + eref + ";"
340 def isEqualToEntityReference(self, n):
341 if not isinstance(n, EntityReference):
343 return (self.eref == n.eref) and (self.nodeValue == n.nodeValue)
344 isEqualToNode = isEqualToEntityReference
def writexml(self, stream, indent='', addindent='', newl='', strip=0,
             nsprefixes={}, namespace=''):
    """
    Write this node's C{nodeValue} to C{stream}.  None of the formatting or
    namespace arguments are used.
    """
    text = self.nodeValue
    stream.write(text)
def cloneNode(self, deep=0, parent=None):
    """
    Create a new L{EntityReference} with the same C{eref}, parented to
    C{parent}.  C{deep} is not used.
    """
    duplicate = EntityReference(self.eref, parent)
    return duplicate
class CharacterData(Node):
    """
    A node holding a single piece of data, stored under the three attribute
    names C{value}, C{data} and C{nodeValue}.
    """

    def __init__(self, data, parentNode=None):
        """
        @param data: The content of this node.

        @param parentNode: Passed to C{Node.__init__}.
        """
        Node.__init__(self, parentNode)
        self.value = self.data = self.nodeValue = data

    def isEqualToCharacterData(self, n):
        """
        Compare this node's C{value} to that of C{n}.

        @param n: The object to compare against.

        @return: C{False} if C{n} is not a L{CharacterData}; otherwise the
            result of comparing the two C{value} attributes.
        """
        # Node.isEqualToNode compares children pairwise, so n may be a node
        # of another kind with no `value` attribute.  Report inequality
        # instead of raising AttributeError.
        if not isinstance(n, CharacterData):
            return False
        return self.value == n.value
    isEqualToNode = isEqualToCharacterData
365 class Comment(CharacterData):
366 """A comment node."""
368 def writexml(self, stream, indent='', addindent='', newl='', strip=0,
369 nsprefixes={}, namespace=''):
371 if isinstance(val, UnicodeType):
372 val=val.encode('utf8')
373 stream.write("<!--%s-->" % val)
def cloneNode(self, deep=0, parent=None):
    """
    Create a new L{Comment} with the same C{nodeValue}, parented to
    C{parent}.  C{deep} is not used.
    """
    duplicate = Comment(self.nodeValue, parent)
    return duplicate
379 class Text(CharacterData):
381 def __init__(self, data, parentNode=None, raw=0):
382 CharacterData.__init__(self, data, parentNode)
386 def isEqualToNode(self, other):
388 Compare this text to C{text}. If the underlying values and the C{raw}
389 flag are the same, return C{True}, otherwise return C{False}.
392 CharacterData.isEqualToNode(self, other) and
393 self.raw == other.raw)
def cloneNode(self, deep=0, parent=None):
    """
    Create a new L{Text} with the same C{nodeValue} and C{raw} flag,
    parented to C{parent}.  C{deep} is not used.
    """
    duplicate = Text(self.nodeValue, parent, self.raw)
    return duplicate
399 def writexml(self, stream, indent='', addindent='', newl='', strip=0,
400 nsprefixes={}, namespace=''):
403 if not isinstance(val, StringTypes):
404 val = str(self.nodeValue)
407 if not isinstance(v, StringTypes):
410 v = ' '.join(v.split())
412 if isinstance(val, UnicodeType):
413 val = val.encode('utf8')
417 return "Text(%s" % repr(self.nodeValue) + ')'
420 class CDATASection(CharacterData):
def cloneNode(self, deep=0, parent=None):
    """
    Create a new L{CDATASection} with the same C{nodeValue}, parented to
    C{parent}.  C{deep} is not used.
    """
    duplicate = CDATASection(self.nodeValue, parent)
    return duplicate
424 def writexml(self, stream, indent='', addindent='', newl='', strip=0,
425 nsprefixes={}, namespace=''):
426 stream.write("<![CDATA[")
427 stream.write(self.nodeValue)
435 genprefix = _genprefix().next
437 class _Attr(CharacterData):
438 "Support class for getAttributeNode."
446 def __init__(self, tagName, attributes=None, parentNode=None,
447 filename=None, markpos=None,
448 caseInsensitive=1, preserveCase=0,
450 Node.__init__(self, parentNode)
451 self.preserveCase = preserveCase or not caseInsensitive
452 self.caseInsensitive = caseInsensitive
454 tagName = tagName.lower()
455 if attributes is None:
458 self.attributes = attributes
459 for k, v in self.attributes.items():
460 self.attributes[k] = unescape(v)
463 self.attributes = InsensitiveDict(self.attributes,
464 preserve=preserveCase)
466 self.endTagName = self.nodeName = self.tagName = tagName
467 self._filename = filename
468 self._markpos = markpos
469 self.namespace = namespace
471 def addPrefixes(self, pfxs):
472 if self.nsprefixes is None:
473 self.nsprefixes = pfxs
475 self.nsprefixes.update(pfxs)
def endTag(self, endTagName):
    """
    Record the name used by this element's closing tag.

    @param endTagName: The closing tag name; it is lowercased first unless
        C{preserveCase} is set.
    """
    self.endTagName = endTagName if self.preserveCase else endTagName.lower()
def isEqualToElement(self, n):
    """
    Compare this element's C{attributes} and C{nodeName} with those of
    C{n}.  Names are compared lowercased when this element is
    C{caseInsensitive}.  Children are not considered.

    @return: A true value if both attributes and names match.
    """
    attributesMatch = self.attributes == n.attributes
    # Names are only examined when the attributes already matched.
    if self.caseInsensitive:
        return attributesMatch and (self.nodeName.lower() == n.nodeName.lower())
    return attributesMatch and (self.nodeName == n.nodeName)
489 def isEqualToNode(self, other):
491 Compare this element to C{other}. If the C{nodeName}, C{namespace},
492 C{attributes}, and C{childNodes} are all the same, return C{True},
493 otherwise return C{False}.
496 self.nodeName.lower() == other.nodeName.lower() and
497 self.namespace == other.namespace and
498 self.attributes == other.attributes and
499 Node.isEqualToNode(self, other))
502 def cloneNode(self, deep=0, parent=None):
504 self.tagName, parentNode=parent, namespace=self.namespace,
505 preserveCase=self.preserveCase, caseInsensitive=self.caseInsensitive)
506 clone.attributes.update(self.attributes)
508 clone.childNodes = [child.cloneNode(1, clone) for child in self.childNodes]
510 clone.childNodes = []
def getElementsByTagName(self, name):
    """
    Search from this element for elements named C{name}, using the
    case-insensitive search when this element is flagged
    C{caseInsensitive} and the exact-match search otherwise.

    @return: The result of the selected module-level search function.
    """
    if self.caseInsensitive:
        search = getElementsByTagNameNoCase
    else:
        search = getElementsByTagName
    return search(self, name)
518 def hasAttributes(self):
def getAttribute(self, name, default=None):
    """
    Look up C{name} in this element's C{attributes}.

    @return: The stored value, or C{default} when there is none.
    """
    attrs = self.attributes
    return attrs.get(name, default)
524 def getAttributeNS(self, ns, name, default=None):
526 if self.attributes.has_key(nsk):
527 return self.attributes[nsk]
528 if ns == self.namespace:
529 return self.attributes.get(name, default)
def getAttributeNode(self, name):
    """
    Wrap the value of attribute C{name} (as returned by C{getAttribute}) in
    an L{_Attr} whose parent is this element.
    """
    value = self.getAttribute(name)
    return _Attr(value, self)
def setAttribute(self, name, attr):
    """
    Store C{attr} as the value of attribute C{name}, replacing any value
    already stored under that name.
    """
    attrs = self.attributes
    attrs[name] = attr
def removeAttribute(self, name):
    """
    Delete attribute C{name} from this element.  Nothing happens if there
    is no such attribute.
    """
    if name not in self.attributes:
        return
    del self.attributes[name]
def hasAttribute(self, name):
    """
    Report whether C{name} is present in this element's C{attributes}.
    """
    present = name in self.attributes
    return present
546 def writexml(self, stream, indent='', addindent='', newl='', strip=0,
547 nsprefixes={}, namespace=''):
549 Serialize this L{Element} to the given stream.
551 @param stream: A file-like object to which this L{Element} will be
554 @param nsprefixes: A C{dict} mapping namespace URIs as C{str} to
555 prefixes as C{str}. This defines the prefixes which are already in
556 scope in the document at the point at which this L{Element} exists.
557 This is essentially an implementation detail for namespace support.
558 Applications should not try to use it.
560 @param namespace: The namespace URI as a C{str} which is the default at
561 the point in the document at which this L{Element} exists. This is
562 essentially an implementation detail for namespace support.
563 Applications should not try to use it.
566 ALLOWSINGLETON = ('img', 'br', 'hr', 'base', 'meta', 'link', 'param',
567 'area', 'input', 'col', 'basefont', 'isindex',
569 BLOCKELEMENTS = ('html', 'head', 'body', 'noscript', 'ins', 'del',
570 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'script',
571 'ul', 'ol', 'dl', 'pre', 'hr', 'blockquote',
572 'address', 'p', 'div', 'fieldset', 'table', 'tr',
573 'form', 'object', 'fieldset', 'applet', 'map')
574 FORMATNICELY = ('tr', 'ul', 'ol', 'head')
576 # this should never be necessary unless people start
577 # changing .tagName on the fly(?)
578 if not self.preserveCase:
579 self.endTagName = self.tagName
582 newprefixes = self.nsprefixes.copy()
583 for ns in nsprefixes.keys():
584 if ns in newprefixes:
590 if self.tagName in BLOCKELEMENTS:
591 begin = [newl, indent] + begin
593 writeattr = lambda _atr, _val: bext((' ', _atr, '="', escape(_val), '"'))
595 # Make a local for tracking what end tag will be used. If namespace
596 # prefixes are involved, this will be changed to account for that
597 # before it's actually used.
598 endTagName = self.endTagName
600 if namespace != self.namespace and self.namespace is not None:
601 # If the current default namespace is not the namespace of this tag
602 # (and this tag has a namespace at all) then we'll write out
603 # something related to namespaces.
604 if self.namespace in nsprefixes:
605 # This tag's namespace already has a prefix bound to it. Use
607 prefix = nsprefixes[self.namespace]
608 bext(prefix + ':' + self.tagName)
609 # Also make sure we use it for the end tag.
610 endTagName = prefix + ':' + self.endTagName
612 # This tag's namespace has no prefix bound to it. Change the
613 # default namespace to this tag's namespace so we don't need
614 # prefixes. Alternatively, we could add a new prefix binding.
615 # I'm not sure why the code was written one way rather than the
618 writeattr("xmlns", self.namespace)
619 # The default namespace just changed. Make sure any children
621 namespace = self.namespace
623 # This tag has no namespace or its namespace is already the default
624 # namespace. Nothing extra to do here.
628 for attr, val in self.attributes.iteritems():
629 if isinstance(attr, tuple):
631 if nsprefixes.has_key(ns):
632 prefix = nsprefixes[ns]
635 newprefixes[ns] = prefix
636 assert val is not None
637 writeattr(prefix+':'+key,val)
639 assert val is not None
642 for ns, prefix in newprefixes.iteritems():
644 writeattr('xmlns:'+prefix, ns)
645 newprefixes.update(nsprefixes)
646 downprefixes = newprefixes
648 downprefixes = nsprefixes
652 newindent = indent + addindent
653 for child in self.childNodes:
654 if self.tagName in BLOCKELEMENTS and \
655 self.tagName in FORMATNICELY:
656 w(j((newl, newindent)))
657 child.writexml(stream, newindent, addindent, newl, strip,
658 downprefixes, namespace)
659 if self.tagName in BLOCKELEMENTS:
661 w(j(('</', endTagName, '>')))
662 elif self.tagName.lower() not in ALLOWSINGLETON:
663 w(j(('></', endTagName, '>')))
669 rep = "Element(%s" % repr(self.nodeName)
671 rep += ", attributes=%r" % (self.attributes,)
673 rep += ", filename=%r" % (self._filename,)
675 rep += ", markpos=%r" % (self._markpos,)
679 rep = "<" + self.nodeName
680 if self._filename or self._markpos:
683 rep += repr(self._filename)
685 rep += " line %s column %s" % self._markpos
686 if self._filename or self._markpos:
688 for item in self.attributes.items():
689 rep += " %s=%r" % item
690 if self.hasChildNodes():
691 rep += " >...</%s>" % self.nodeName
696 def _unescapeDict(d):
698 for k, v in d.items():
704 for k, v in d.items():
708 class MicroDOMParser(XMLParser):
710 # <dash> glyph: a quick scan thru the DTD says BODY, AREA, LINK, IMG, HR,
711 # P, DT, DD, LI, INPUT, OPTION, THEAD, TFOOT, TBODY, COLGROUP, COL, TR, TH,
712 # TD, HEAD, BASE, META, HTML all have optional closing tags
714 soonClosers = 'area link br img hr input base meta'.split()
715 laterClosers = {'p': ['p', 'dt'],
719 'tbody': ['thead', 'tfoot', 'tbody'],
720 'thead': ['thead', 'tfoot', 'tbody'],
721 'tfoot': ['thead', 'tfoot', 'tbody'],
722 'colgroup': ['colgroup'],
728 'title': ['head', 'body'], # this looks wrong...
729 'option': ['option'],
733 def __init__(self, beExtremelyLenient=0, caseInsensitive=1, preserveCase=0,
734 soonClosers=soonClosers, laterClosers=laterClosers):
735 self.elementstack = []
736 d = {'xmlns': 'xmlns', '': None}
738 self.nsstack = [(d,None,dr)]
740 self._mddoctype = None
741 self.beExtremelyLenient = beExtremelyLenient
742 self.caseInsensitive = caseInsensitive
743 self.preserveCase = preserveCase or not caseInsensitive
744 self.soonClosers = soonClosers
745 self.laterClosers = laterClosers
746 # self.indentlevel = 0
748 def shouldPreserveSpace(self):
749 for edx in xrange(len(self.elementstack)):
750 el = self.elementstack[-edx]
751 if el.tagName == 'pre' or el.getAttribute("xml:space", '') == 'preserve':
755 def _getparent(self):
756 if self.elementstack:
757 return self.elementstack[-1]
761 COMMENT = re.compile(r"\s*/[/*]\s*")
763 def _fixScriptElement(self, el):
764 # this deals with case where there is comment or CDATA inside
765 # <script> tag and we want to do the right thing with it
766 if not self.beExtremelyLenient or not len(el.childNodes) == 1:
769 if isinstance(c, Text):
770 # deal with nasty people who do stuff like:
774 # tidy does this, for example.
777 match = self.COMMENT.match(oldvalue)
779 prefix = match.group()
780 oldvalue = oldvalue[len(prefix):]
782 # now see if contents are actual node and comment or CDATA
784 e = parseString("<a>%s</a>" % oldvalue).childNodes[0]
785 except (ParseError, MismatchedTags):
787 if len(e.childNodes) != 1:
790 if isinstance(e, (CDATASection, Comment)):
793 el.childNodes.append(Text(prefix))
794 el.childNodes.append(e)
def gotDoctype(self, doctype):
    """
    Remember C{doctype} on this parser as C{_mddoctype}.
    """
    self._mddoctype = doctype
799 def gotTagStart(self, name, attributes):
800 # print ' '*self.indentlevel, 'start tag',name
801 # self.indentlevel += 1
802 parent = self._getparent()
803 if (self.beExtremelyLenient and isinstance(parent, Element)):
804 parentName = parent.tagName
806 if self.caseInsensitive:
807 parentName = parentName.lower()
808 myName = myName.lower()
809 if myName in self.laterClosers.get(parentName, []):
810 self.gotTagEnd(parent.tagName)
811 parent = self._getparent()
812 attributes = _unescapeDict(attributes)
813 namespaces = self.nsstack[-1][0]
815 for k, v in attributes.items():
816 if k.startswith('xmlns'):
817 spacenames = k.split(':',1)
818 if len(spacenames) == 2:
819 newspaces[spacenames[1]] = v
824 namespaces = namespaces.copy()
825 namespaces.update(newspaces)
826 for k, v in attributes.items():
827 ksplit = k.split(':', 1)
830 if pfx != 'xml' and namespaces.has_key(pfx):
831 attributes[namespaces[pfx], tv] = v
833 el = Element(name, attributes, parent,
834 self.filename, self.saveMark(),
835 caseInsensitive=self.caseInsensitive,
836 preserveCase=self.preserveCase,
837 namespace=namespaces.get(''))
838 revspaces = _reverseDict(newspaces)
839 el.addPrefixes(revspaces)
842 rscopy = self.nsstack[-1][2].copy()
843 rscopy.update(revspaces)
844 self.nsstack.append((namespaces, el, rscopy))
845 self.elementstack.append(el)
847 parent.appendChild(el)
848 if (self.beExtremelyLenient and el.tagName in self.soonClosers):
851 def _gotStandalone(self, factory, data):
852 parent = self._getparent()
853 te = factory(data, parent)
855 parent.appendChild(te)
856 elif self.beExtremelyLenient:
857 self.documents.append(te)
def gotText(self, data):
    """
    Turn C{data} into a L{Text} node via C{_gotStandalone}.  Data that is
    empty or all whitespace is dropped unless C{shouldPreserveSpace()}
    says otherwise.
    """
    if not (data.strip() or self.shouldPreserveSpace()):
        return
    self._gotStandalone(Text, data)
def gotComment(self, data):
    """
    Turn C{data} into a L{Comment} node via C{_gotStandalone}.
    """
    factory = Comment
    self._gotStandalone(factory, data)
def gotEntityReference(self, entityRef):
    """
    Turn C{entityRef} into an L{EntityReference} node via
    C{_gotStandalone}.
    """
    factory = EntityReference
    self._gotStandalone(factory, entityRef)
def gotCData(self, cdata):
    """
    Turn C{cdata} into a L{CDATASection} node via C{_gotStandalone}.
    """
    factory = CDATASection
    self._gotStandalone(factory, cdata)
872 def gotTagEnd(self, name):
873 # print ' '*self.indentlevel, 'end tag',name
874 # self.indentlevel -= 1
875 if not self.elementstack:
876 if self.beExtremelyLenient:
878 raise MismatchedTags(*((self.filename, "NOTHING", name)
879 +self.saveMark()+(0,0)))
880 el = self.elementstack.pop()
881 pfxdix = self.nsstack[-1][2]
882 if self.nsstack[-1][1] is el:
883 nstuple = self.nsstack.pop()
886 if self.caseInsensitive:
887 tn = el.tagName.lower()
893 nsplit = name.split(':',1)
895 pfx, newname = nsplit
896 ns = pfxdix.get(pfx,None)
898 if el.namespace != ns:
899 if not self.beExtremelyLenient:
900 raise MismatchedTags(*((self.filename, el.tagName, name)
901 +self.saveMark()+el._markpos))
902 if not (tn == cname):
903 if self.beExtremelyLenient:
904 if self.elementstack:
905 lastEl = self.elementstack[0]
906 for idx in xrange(len(self.elementstack)):
907 if self.elementstack[-(idx+1)].tagName == cname:
908 self.elementstack[-(idx+1)].endTag(name)
911 # this was a garbage close tag; wait for a real one
912 self.elementstack.append(el)
913 if nstuple is not None:
914 self.nsstack.append(nstuple)
916 del self.elementstack[-(idx+1):]
917 if not self.elementstack:
918 self.documents.append(lastEl)
921 raise MismatchedTags(*((self.filename, el.tagName, name)
922 +self.saveMark()+el._markpos))
924 if not self.elementstack:
925 self.documents.append(el)
926 if self.beExtremelyLenient and el.tagName == "script":
927 self._fixScriptElement(el)
929 def connectionLost(self, reason):
930 XMLParser.connectionLost(self, reason) # This can cause more events!
931 if self.elementstack:
932 if self.beExtremelyLenient:
933 self.documents.append(self.elementstack[0])
935 raise MismatchedTags(*((self.filename, self.elementstack[-1],
938 +self.elementstack[-1]._markpos))
941 def parse(readable, *args, **kwargs):
942 """Parse HTML or XML readable."""
943 if not hasattr(readable, "read"):
944 readable = open(readable, "rb")
945 mdp = MicroDOMParser(*args, **kwargs)
946 mdp.filename = getattr(readable, "name", "<xmlfile />")
947 mdp.makeConnection(None)
948 if hasattr(readable,"getvalue"):
949 mdp.dataReceived(readable.getvalue())
951 r = readable.read(1024)
954 r = readable.read(1024)
955 mdp.connectionLost(None)
957 if not mdp.documents:
958 raise ParseError(mdp.filename, 0, 0, "No top-level Nodes in document")
960 if mdp.beExtremelyLenient:
961 if len(mdp.documents) == 1:
963 if not isinstance(d, Element):
969 for child in mdp.documents:
974 doc.doctype = mdp._mddoctype
def parseString(st, *args, **kw):
    """
    Parse the markup held in the string C{st} by wrapping it in a
    C{StringIO} and handing it to L{parse}.

    @param st: The markup to parse.  A unicode string is encoded as UTF-16
        before parsing.

    @param args: Extra positional arguments for L{parse}.

    @param kw: Extra keyword arguments for L{parse}.

    @return: Whatever L{parse} returns.
    """
    if isinstance(st, UnicodeType):
        # this isn't particularly ideal, but it does work.
        st = st.encode('UTF-16')
    return parse(StringIO(st), *args, **kw)
def parseXML(readable):
    """
    Parse C{readable} with L{parse}, case-sensitively and preserving case.

    @param readable: Passed unchanged to L{parse}.

    @return: Whatever L{parse} returns.
    """
    document = parse(readable, caseInsensitive=0, preserveCase=1)
    return document
def parseXMLString(st):
    """
    Parse the string C{st} with L{parseString}, case-sensitively and
    preserving case.

    @param st: The markup to parse, as a string.

    @return: Whatever L{parseString} returns.
    """
    document = parseString(st, caseInsensitive=0, preserveCase=1)
    return document
997 """Easy creation of XML."""
999 def __init__(self, node='div'):
1000 if isinstance(node, StringTypes):
1001 node = Element(node)
1004 def __getattr__(self, name):
1006 raise AttributeError("no private attrs")
1007 return lambda **kw: self.add(name,**kw)
1009 def __setitem__(self, key, val):
1010 self.node.setAttribute(key, val)
1012 def __getitem__(self, key):
1013 return self.node.getAttribute(key)
1015 def text(self, txt, raw=0):
1016 nn = Text(txt, raw=raw)
1017 self.node.appendChild(nn)
1020 def add(self, tagName, **kw):
1021 newNode = Element(tagName, caseInsensitive=0, preserveCase=0)
1022 self.node.appendChild(newNode)
1024 for k, v in kw.items():